drivers/md/raid5-cache.c
1 /*
2  * Copyright (C) 2015 Shaohua Li <shli@fb.com>
3  * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
4  *
5  * This program is free software; you can redistribute it and/or modify it
6  * under the terms and conditions of the GNU General Public License,
7  * version 2, as published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12  * more details.
13  *
14  */
15 #include <linux/kernel.h>
16 #include <linux/wait.h>
17 #include <linux/blkdev.h>
18 #include <linux/slab.h>
19 #include <linux/raid/md_p.h>
20 #include <linux/crc32c.h>
21 #include <linux/random.h>
22 #include <linux/kthread.h>
23 #include <linux/types.h>
24 #include "md.h"
25 #include "raid5.h"
26 #include "bitmap.h"
27 #include "raid5-log.h"
28
29 /*
30  * metadata/data are stored on disk in 4k units (blocks) regardless of the
31  * underlying hardware sector size. This only works with PAGE_SIZE == 4096.
32  */
33 #define BLOCK_SECTORS (8)
34 #define BLOCK_SECTOR_SHIFT (3)
35
36 /*
37  * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
38  *
39  * In write-through mode, reclaim runs once every log->max_free_space of
40  * log space has been used. This keeps recovery scans from taking too long.
41  */
42 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
43 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
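/*
 * Worked example (illustration only, assuming max_free_space is computed as
 * min(device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT, RECLAIM_MAX_FREE_SPACE)
 * as described above): with 512-byte sectors, RECLAIM_MAX_FREE_SPACE is
 * 10 * 1024 * 1024 * 2 sectors = 10GiB. For a 16GiB journal device,
 * device_size >> 2 is 4GiB, so max_free_space = min(4GiB, 10GiB) = 4GiB.
 */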
44
45 /* wake up reclaim thread periodically */
46 #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
47 /* start flush with these full stripes */
48 #define R5C_FULL_STRIPE_FLUSH_BATCH(conf) (conf->max_nr_stripes / 4)
49 /* reclaim stripes in groups */
50 #define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
51
52 /*
53  * We only need 2 bios per I/O unit to make progress, but ensure we
54  * have a few more available to not get too tight.
55  */
56 #define R5L_POOL_SIZE   4
57
58 static char *r5c_journal_mode_str[] = {"write-through",
59                                        "write-back"};
60 /*
61  * raid5 cache state machine
62  *
63  * With the RAID cache, each stripe works in two phases:
64  *      - caching phase
65  *      - writing-out phase
66  *
67  * These two phases are controlled by bit STRIPE_R5C_CACHING:
68  *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
69  *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
70  *
71  * When there is no journal, or the journal is in write-through mode,
72  * the stripe is always in writing-out phase.
73  *
74  * For write-back journal, the stripe is sent to caching phase on write
75  * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
76  * the write-out phase by clearing STRIPE_R5C_CACHING.
77  *
78  * Stripes in caching phase do not write the raid disks. Instead, all
79  * writes are committed from the log device. Therefore, a stripe in
80  * caching phase handles writes as:
81  *      - write to log device
82  *      - return IO
83  *
84  * Stripes in writing-out phase handle writes as:
85  *      - calculate parity
86  *      - write pending data and parity to journal
87  *      - write data and parity to raid disks
88  *      - return IO for pending writes
89  */
90
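/*
 * Example flow (illustration only, based on the description above) for a
 * write-back array: a write is handled by r5c_try_caching_write(), the stripe
 * enters the caching phase with STRIPE_R5C_CACHING set, the data is committed
 * to the journal and the IO is returned; later, when the stripe is written
 * out, r5c_make_stripe_write_out() clears STRIPE_R5C_CACHING, parity is
 * calculated, and data and parity are written to the journal and then to the
 * raid disks.
 */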
91 struct r5l_log {
92         struct md_rdev *rdev;
93
94         u32 uuid_checksum;
95
96         sector_t device_size;           /* log device size, rounded to a
97                                          * multiple of BLOCK_SECTORS */
98         sector_t max_free_space;        /* reclaim runs if reclaimable space
99                                          * exceeds this size */
100
101         sector_t last_checkpoint;       /* log tail. where recovery scan
102                                          * starts from */
103         u64 last_cp_seq;                /* log tail sequence */
104
105         sector_t log_start;             /* log head. where new data appends */
106         u64 seq;                        /* log head sequence */
107
108         sector_t next_checkpoint;
109
110         struct mutex io_mutex;
111         struct r5l_io_unit *current_io; /* current io_unit accepting new data */
112
113         spinlock_t io_list_lock;
114         struct list_head running_ios;   /* io_units which are still running,
115                                          * and have not yet been completely
116                                          * written to the log */
117         struct list_head io_end_ios;    /* io_units which have been completely
118                                          * written to the log but not yet written
119                                          * to the RAID */
120         struct list_head flushing_ios;  /* io_units which are waiting for log
121                                          * cache flush */
122         struct list_head finished_ios;  /* io_units which settle down in log disk */
123         struct bio flush_bio;
124
125         struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */
126
127         struct kmem_cache *io_kc;
128         mempool_t *io_pool;
129         struct bio_set *bs;
130         mempool_t *meta_pool;
131
132         struct md_thread *reclaim_thread;
133         unsigned long reclaim_target;   /* amount of space that needs to be
134                                          * reclaimed.  if it's 0, reclaim spaces
135                                          * used by io_units which are in
136                                          * IO_UNIT_STRIPE_END state (i.e. reclaim
137                                          * doesn't wait for a specific io_unit
138                                          * to switch to IO_UNIT_STRIPE_END
139                                          * state) */
140         wait_queue_head_t iounit_wait;
141
142         struct list_head no_space_stripes; /* pending stripes, log has no space */
143         spinlock_t no_space_stripes_lock;
144
145         bool need_cache_flush;
146
147         /* for r5c_cache */
148         enum r5c_journal_mode r5c_journal_mode;
149
150         /* all stripes in r5cache, in the order of seq at sh->log_start */
151         struct list_head stripe_in_journal_list;
152
153         spinlock_t stripe_in_journal_lock;
154         atomic_t stripe_in_journal_count;
155
156         /* to submit async io_units, to fulfill ordering of flush */
157         struct work_struct deferred_io_work;
158         /* to disable write back while the array is degraded */
159         struct work_struct disable_writeback_work;
160
161         /* for chunk_aligned_read in writeback mode, details below */
162         spinlock_t tree_lock;
163         struct radix_tree_root big_stripe_tree;
164 };
165
166 /*
167  * Enable chunk_aligned_read() with write back cache.
168  *
169  * Each chunk may contain more than one stripe (for example, a 256kB
170  * chunk contains 64 4kB pages, so this chunk contains 64 stripes). For
171  * chunk_aligned_read, these stripes are grouped into one "big_stripe".
172  * For each big_stripe, we count how many stripes of this big_stripe
173  * are in the write back cache. These data are tracked in a radix tree
174  * (big_stripe_tree). We use radix_tree item pointer as the counter.
175  * r5c_tree_index() is used to calculate keys for the radix tree.
176  *
177  * chunk_aligned_read() calls r5c_big_stripe_cached() to look up
178  * big_stripe of each chunk in the tree. If this big_stripe is in the
179  * tree, chunk_aligned_read() aborts. This look up is protected by
180  * rcu_read_lock().
181  *
182  * It is necessary to remember whether a stripe is counted in
183  * big_stripe_tree. Instead of adding a new flag, we reuse existing flags:
184  * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these
185  * two flags is set, the stripe is counted in big_stripe_tree. This
186  * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to
187  * r5c_try_caching_write(); and moving clear_bit of
188  * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to
189  * r5c_finish_stripe_write_out().
190  */
191
192 /*
193  * The radix tree requires the lowest 2 bits of the data pointer to be 2b'00,
194  * so it is necessary to left shift the counter by 2 bits before using it
195  * as the data pointer of the tree.
196  */
197 #define R5C_RADIX_COUNT_SHIFT 2
198
199 /*
200  * calculate key for big_stripe_tree
201  *
202  * sect: align_bi->bi_iter.bi_sector or sh->sector
203  */
204 static inline sector_t r5c_tree_index(struct r5conf *conf,
205                                       sector_t sect)
206 {
207         sector_t offset;
208
209         offset = sector_div(sect, conf->chunk_sectors);
210         return sect;
211 }
212
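/*
 * Sketch (illustration only, not used by the driver): reading the counter
 * stored for a big_stripe, assuming the encoding described above where each
 * radix_tree slot holds (count << R5C_RADIX_COUNT_SHIFT) cast to a pointer,
 * e.g. a count of 3 is stored as (void *)(3UL << 2) = (void *)0xc.
 */
static inline unsigned long r5c_big_stripe_count_example(struct r5conf *conf,
                                                         sector_t sect)
{
        void *slot;

        /* lookups of big_stripe_tree in this file are protected by RCU */
        rcu_read_lock();
        slot = radix_tree_lookup(&conf->log->big_stripe_tree,
                                 r5c_tree_index(conf, sect));
        rcu_read_unlock();

        return slot ? ((unsigned long)slot >> R5C_RADIX_COUNT_SHIFT) : 0;
}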
213 /*
214  * an IO range starts at a meta data block and ends at the next meta data
215  * block. The io unit's meta data block tracks the data/parity that follows
216  * it. The io unit is written to the log disk with a normal write; as we
217  * always flush the log disk first and only then start moving data to the
218  * raid disks, there is no requirement to write the io unit with FLUSH/FUA.
219  */
220 struct r5l_io_unit {
221         struct r5l_log *log;
222
223         struct page *meta_page; /* store meta block */
224         int meta_offset;        /* current offset in meta_page */
225
226         struct bio *current_bio;/* current_bio accepting new data */
227
228         atomic_t pending_stripe;/* how many stripes not flushed to raid */
229         u64 seq;                /* seq number of the metablock */
230         sector_t log_start;     /* where the io_unit starts */
231         sector_t log_end;       /* where the io_unit ends */
232         struct list_head log_sibling; /* log->running_ios */
233         struct list_head stripe_list; /* stripes added to the io_unit */
234
235         int state;
236         bool need_split_bio;
237         struct bio *split_bio;
238
239         unsigned int has_flush:1;               /* include flush request */
240         unsigned int has_fua:1;                 /* include fua request */
241         unsigned int has_null_flush:1;          /* include null flush request */
242         unsigned int has_flush_payload:1;       /* include flush payload  */
243         /*
244          * io isn't sent yet; a flush/fua request can only be submitted once it
245          * is the first IO in the running_ios list
246          */
247         unsigned int io_deferred:1;
248
249         struct bio_list flush_barriers;   /* size == 0 flush bios */
250 };
251
252 /* r5l_io_unit state */
253 enum r5l_io_unit_state {
254         IO_UNIT_RUNNING = 0,    /* accepting new IO */
255         IO_UNIT_IO_START = 1,   /* io_unit bio started writing to log,
256                                  * no longer accepting new bios */
257         IO_UNIT_IO_END = 2,     /* io_unit bio finished writing to log */
258         IO_UNIT_STRIPE_END = 3, /* stripe data finished writing to raid */
259 };
260
261 bool r5c_is_writeback(struct r5l_log *log)
262 {
263         return (log != NULL &&
264                 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
265 }
266
267 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
268 {
269         start += inc;
270         if (start >= log->device_size)
271                 start = start - log->device_size;
272         return start;
273 }
274
275 static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
276                                   sector_t end)
277 {
278         if (end >= start)
279                 return end - start;
280         else
281                 return end + log->device_size - start;
282 }
283
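/*
 * Worked example (illustration only): with device_size = 1024 sectors,
 * r5l_ring_add(log, 1020, 8) wraps around to sector 4, and
 * r5l_ring_distance(log, 1020, 4) = 4 + 1024 - 1020 = 8 sectors.
 */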
284 static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
285 {
286         sector_t used_size;
287
288         used_size = r5l_ring_distance(log, log->last_checkpoint,
289                                         log->log_start);
290
291         return log->device_size > used_size + size;
292 }
293
294 static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
295                                     enum r5l_io_unit_state state)
296 {
297         if (WARN_ON(io->state >= state))
298                 return;
299         io->state = state;
300 }
301
302 static void
303 r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
304 {
305         struct bio *wbi, *wbi2;
306
307         wbi = dev->written;
308         dev->written = NULL;
309         while (wbi && wbi->bi_iter.bi_sector <
310                dev->sector + STRIPE_SECTORS) {
311                 wbi2 = r5_next_bio(wbi, dev->sector);
312                 md_write_end(conf->mddev);
313                 bio_endio(wbi);
314                 wbi = wbi2;
315         }
316 }
317
318 void r5c_handle_cached_data_endio(struct r5conf *conf,
319                                   struct stripe_head *sh, int disks)
320 {
321         int i;
322
323         for (i = sh->disks; i--; ) {
324                 if (sh->dev[i].written) {
325                         set_bit(R5_UPTODATE, &sh->dev[i].flags);
326                         r5c_return_dev_pending_writes(conf, &sh->dev[i]);
327                         bitmap_endwrite(conf->mddev->bitmap, sh->sector,
328                                         STRIPE_SECTORS,
329                                         !test_bit(STRIPE_DEGRADED, &sh->state),
330                                         0);
331                 }
332         }
333 }
334
335 void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
336
337 /* Check whether we should flush some stripes to free up stripe cache */
338 void r5c_check_stripe_cache_usage(struct r5conf *conf)
339 {
340         int total_cached;
341
342         if (!r5c_is_writeback(conf->log))
343                 return;
344
345         total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
346                 atomic_read(&conf->r5c_cached_full_stripes);
347
348         /*
349          * The following condition is true for either of the following:
350          *   - stripe cache pressure high:
351          *          total_cached > 3/4 min_nr_stripes ||
352          *          empty_inactive_list_nr > 0
353          *   - stripe cache pressure moderate:
354          *          total_cached > 1/2 min_nr_stripes
355          */
356         if (total_cached > conf->min_nr_stripes * 1 / 2 ||
357             atomic_read(&conf->empty_inactive_list_nr) > 0)
358                 r5l_wake_reclaim(conf->log, 0);
359 }
360
361 /*
362  * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
363  * stripes in the cache
364  */
365 void r5c_check_cached_full_stripe(struct r5conf *conf)
366 {
367         if (!r5c_is_writeback(conf->log))
368                 return;
369
370         /*
371          * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes
372          * or a full stripe (chunk size / 4k stripes).
373          */
374         if (atomic_read(&conf->r5c_cached_full_stripes) >=
375             min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
376                 conf->chunk_sectors >> STRIPE_SHIFT))
377                 r5l_wake_reclaim(conf->log, 0);
378 }
379
380 /*
381  * Total log space (in sectors) needed to flush all data in cache
382  *
383  * To avoid deadlock due to log space, it is necessary to reserve log
384  * space to flush critical stripes (stripes that occupy log space near
385  * last_checkpoint). This function helps check how much log space is
386  * required to flush all cached stripes.
387  *
388  * To reduce log space requirements, two mechanisms are used to give cache
389  * flush higher priorities:
390  *    1. In handle_stripe_dirtying() and schedule_reconstruction(),
391  *       stripes ALREADY in journal can be flushed w/o pending writes;
392  *    2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal
393  *       can be delayed (r5l_add_no_space_stripe).
394  *
395  * In cache flush, the stripe goes through 1 and then 2. For a stripe that
396  * already passed 1, flushing it requires at most (conf->max_degraded + 1)
397  * pages of journal space. For stripes that have not passed 1, flushing
398  * requires (conf->raid_disks + 1) pages of journal space. There are at
399  * most (conf->group_cnt + 1) stripes that passed 1. So the total journal space
400  * required to flush all cached stripes (in pages) is:
401  *
402  *     (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) +
403  *     (group_cnt + 1) * (raid_disks + 1)
404  * or
405  *     (stripe_in_journal_count) * (max_degraded + 1) +
406  *     (group_cnt + 1) * (raid_disks - max_degraded)
407  */
408 static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
409 {
410         struct r5l_log *log = conf->log;
411
412         if (!r5c_is_writeback(log))
413                 return 0;
414
415         return BLOCK_SECTORS *
416                 ((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) +
417                  (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1));
418 }
419
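/*
 * Worked example (illustration only): for a RAID6 array with raid_disks = 6,
 * max_degraded = 2, group_cnt = 0 and 100 stripes in the journal, the space
 * required by the formula above is
 *
 *   BLOCK_SECTORS * ((2 + 1) * 100 + (6 - 2) * (0 + 1)) = 8 * 304 = 2432
 *
 * sectors, i.e. 304 4k pages.
 */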
420 /*
421  * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
422  *
423  * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
424  * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
425  * device is less than 2x of reclaim_required_space.
426  */
427 static inline void r5c_update_log_state(struct r5l_log *log)
428 {
429         struct r5conf *conf = log->rdev->mddev->private;
430         sector_t free_space;
431         sector_t reclaim_space;
432         bool wake_reclaim = false;
433
434         if (!r5c_is_writeback(log))
435                 return;
436
437         free_space = r5l_ring_distance(log, log->log_start,
438                                        log->last_checkpoint);
439         reclaim_space = r5c_log_required_to_flush_cache(conf);
440         if (free_space < 2 * reclaim_space)
441                 set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
442         else {
443                 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
444                         wake_reclaim = true;
445                 clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
446         }
447         if (free_space < 3 * reclaim_space)
448                 set_bit(R5C_LOG_TIGHT, &conf->cache_state);
449         else
450                 clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
451
452         if (wake_reclaim)
453                 r5l_wake_reclaim(log, 0);
454 }
455
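/*
 * Example (illustration only), continuing the numbers above: with
 * reclaim_space = 2432 sectors and free_space = 6000 sectors,
 * free_space < 3 * 2432 = 7296 so R5C_LOG_TIGHT is set, while
 * free_space >= 2 * 2432 = 4864 so R5C_LOG_CRITICAL stays clear.
 */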
456 /*
457  * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
458  * This function should only be called in write-back mode.
459  */
460 void r5c_make_stripe_write_out(struct stripe_head *sh)
461 {
462         struct r5conf *conf = sh->raid_conf;
463         struct r5l_log *log = conf->log;
464
465         BUG_ON(!r5c_is_writeback(log));
466
467         WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
468         clear_bit(STRIPE_R5C_CACHING, &sh->state);
469
470         if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
471                 atomic_inc(&conf->preread_active_stripes);
472 }
473
474 static void r5c_handle_data_cached(struct stripe_head *sh)
475 {
476         int i;
477
478         for (i = sh->disks; i--; )
479                 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
480                         set_bit(R5_InJournal, &sh->dev[i].flags);
481                         clear_bit(R5_LOCKED, &sh->dev[i].flags);
482                 }
483         clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
484 }
485
486 /*
487  * this journal write must contain full parity,
488  * it may also contain some data pages
489  */
490 static void r5c_handle_parity_cached(struct stripe_head *sh)
491 {
492         int i;
493
494         for (i = sh->disks; i--; )
495                 if (test_bit(R5_InJournal, &sh->dev[i].flags))
496                         set_bit(R5_Wantwrite, &sh->dev[i].flags);
497 }
498
499 /*
500  * Setting proper flags after writing (or flushing) data and/or parity to the
501  * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
502  */
503 static void r5c_finish_cache_stripe(struct stripe_head *sh)
504 {
505         struct r5l_log *log = sh->raid_conf->log;
506
507         if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
508                 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
509                 /*
510                  * Set R5_InJournal for parity dev[pd_idx]. This means
511                  * all data AND parity are in the journal. For RAID 6, it is
512                  * NOT necessary to set the flag for dev[qd_idx], as the
513                  * two parities are written out together.
514                  */
515                 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
516         } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
517                 r5c_handle_data_cached(sh);
518         } else {
519                 r5c_handle_parity_cached(sh);
520                 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
521         }
522 }
523
524 static void r5l_io_run_stripes(struct r5l_io_unit *io)
525 {
526         struct stripe_head *sh, *next;
527
528         list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
529                 list_del_init(&sh->log_list);
530
531                 r5c_finish_cache_stripe(sh);
532
533                 set_bit(STRIPE_HANDLE, &sh->state);
534                 raid5_release_stripe(sh);
535         }
536 }
537
538 static void r5l_log_run_stripes(struct r5l_log *log)
539 {
540         struct r5l_io_unit *io, *next;
541
542         assert_spin_locked(&log->io_list_lock);
543
544         list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
545                 /* don't change list order */
546                 if (io->state < IO_UNIT_IO_END)
547                         break;
548
549                 list_move_tail(&io->log_sibling, &log->finished_ios);
550                 r5l_io_run_stripes(io);
551         }
552 }
553
554 static void r5l_move_to_end_ios(struct r5l_log *log)
555 {
556         struct r5l_io_unit *io, *next;
557
558         assert_spin_locked(&log->io_list_lock);
559
560         list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
561                 /* don't change list order */
562                 if (io->state < IO_UNIT_IO_END)
563                         break;
564                 list_move_tail(&io->log_sibling, &log->io_end_ios);
565         }
566 }
567
568 static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
569 static void r5l_log_endio(struct bio *bio)
570 {
571         struct r5l_io_unit *io = bio->bi_private;
572         struct r5l_io_unit *io_deferred;
573         struct r5l_log *log = io->log;
574         unsigned long flags;
575         bool has_null_flush;
576         bool has_flush_payload;
577
578         if (bio->bi_status)
579                 md_error(log->rdev->mddev, log->rdev);
580
581         bio_put(bio);
582         mempool_free(io->meta_page, log->meta_pool);
583
584         spin_lock_irqsave(&log->io_list_lock, flags);
585         __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
586
587         /*
588          * if the io doesn't have null_flush or a flush payload,
589          * it is not safe to access it after releasing io_list_lock.
590          * Therefore, it is necessary to check the condition with
591          * the lock held.
592          */
593         has_null_flush = io->has_null_flush;
594         has_flush_payload = io->has_flush_payload;
595
596         if (log->need_cache_flush && !list_empty(&io->stripe_list))
597                 r5l_move_to_end_ios(log);
598         else
599                 r5l_log_run_stripes(log);
600         if (!list_empty(&log->running_ios)) {
601                 /*
602                  * FLUSH/FUA io_unit is deferred because of ordering, now we
603                  * can dispatch it
604                  */
605                 io_deferred = list_first_entry(&log->running_ios,
606                                                struct r5l_io_unit, log_sibling);
607                 if (io_deferred->io_deferred)
608                         schedule_work(&log->deferred_io_work);
609         }
610
611         spin_unlock_irqrestore(&log->io_list_lock, flags);
612
613         if (log->need_cache_flush)
614                 md_wakeup_thread(log->rdev->mddev->thread);
615
616         /* finish flush-only io_units and PAYLOAD_FLUSH-only io_units */
617         if (has_null_flush) {
618                 struct bio *bi;
619
620                 WARN_ON(bio_list_empty(&io->flush_barriers));
621                 while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
622                         bio_endio(bi);
623                         if (atomic_dec_and_test(&io->pending_stripe)) {
624                                 __r5l_stripe_write_finished(io);
625                                 return;
626                         }
627                 }
628         }
629         /* decrease pending_stripe for flush payload */
630         if (has_flush_payload)
631                 if (atomic_dec_and_test(&io->pending_stripe))
632                         __r5l_stripe_write_finished(io);
633 }
634
635 static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
636 {
637         unsigned long flags;
638
639         spin_lock_irqsave(&log->io_list_lock, flags);
640         __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
641         spin_unlock_irqrestore(&log->io_list_lock, flags);
642
643         /*
644          * In case of journal device failures, submit_bio will get an error
645          * and call endio, and active stripes will then continue the write
646          * process. Therefore, it is not necessary to check the Faulty bit
647          * of the journal device here.
648          *
649          * We can't check split_bio after current_bio is submitted. If
650          * io->split_bio is null, after current_bio is submitted, current_bio
651          * might already be completed and the io_unit is freed. We submit
652          * split_bio first to avoid the issue.
653          */
654         if (io->split_bio) {
655                 if (io->has_flush)
656                         io->split_bio->bi_opf |= REQ_PREFLUSH;
657                 if (io->has_fua)
658                         io->split_bio->bi_opf |= REQ_FUA;
659                 submit_bio(io->split_bio);
660         }
661
662         if (io->has_flush)
663                 io->current_bio->bi_opf |= REQ_PREFLUSH;
664         if (io->has_fua)
665                 io->current_bio->bi_opf |= REQ_FUA;
666         submit_bio(io->current_bio);
667 }
668
669 /* deferred io_unit will be dispatched here */
670 static void r5l_submit_io_async(struct work_struct *work)
671 {
672         struct r5l_log *log = container_of(work, struct r5l_log,
673                                            deferred_io_work);
674         struct r5l_io_unit *io = NULL;
675         unsigned long flags;
676
677         spin_lock_irqsave(&log->io_list_lock, flags);
678         if (!list_empty(&log->running_ios)) {
679                 io = list_first_entry(&log->running_ios, struct r5l_io_unit,
680                                       log_sibling);
681                 if (!io->io_deferred)
682                         io = NULL;
683                 else
684                         io->io_deferred = 0;
685         }
686         spin_unlock_irqrestore(&log->io_list_lock, flags);
687         if (io)
688                 r5l_do_submit_io(log, io);
689 }
690
691 static void r5c_disable_writeback_async(struct work_struct *work)
692 {
693         struct r5l_log *log = container_of(work, struct r5l_log,
694                                            disable_writeback_work);
695         struct mddev *mddev = log->rdev->mddev;
696
697         if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
698                 return;
699         pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
700                 mdname(mddev));
701
702         /* wait for the superblock change before suspend */
703         wait_event(mddev->sb_wait,
704                    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
705
706         mddev_suspend(mddev);
707         log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
708         mddev_resume(mddev);
709 }
710
711 static void r5l_submit_current_io(struct r5l_log *log)
712 {
713         struct r5l_io_unit *io = log->current_io;
714         struct bio *bio;
715         struct r5l_meta_block *block;
716         unsigned long flags;
717         u32 crc;
718         bool do_submit = true;
719
720         if (!io)
721                 return;
722
723         block = page_address(io->meta_page);
724         block->meta_size = cpu_to_le32(io->meta_offset);
725         crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
726         block->checksum = cpu_to_le32(crc);
727         bio = io->current_bio;
728
729         log->current_io = NULL;
730         spin_lock_irqsave(&log->io_list_lock, flags);
731         if (io->has_flush || io->has_fua) {
732                 if (io != list_first_entry(&log->running_ios,
733                                            struct r5l_io_unit, log_sibling)) {
734                         io->io_deferred = 1;
735                         do_submit = false;
736                 }
737         }
738         spin_unlock_irqrestore(&log->io_list_lock, flags);
739         if (do_submit)
740                 r5l_do_submit_io(log, io);
741 }
742
743 static struct bio *r5l_bio_alloc(struct r5l_log *log)
744 {
745         struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
746
747         bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
748         bio_set_dev(bio, log->rdev->bdev);
749         bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
750
751         return bio;
752 }
753
754 static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
755 {
756         log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
757
758         r5c_update_log_state(log);
759         /*
760          * If we filled up the log device, start from the beginning again,
761          * which will require a new bio.
762          *
763          * Note: for this to work properly the log size needs to be a multiple
764          * of BLOCK_SECTORS.
765          */
766         if (log->log_start == 0)
767                 io->need_split_bio = true;
768
769         io->log_end = log->log_start;
770 }
771
772 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
773 {
774         struct r5l_io_unit *io;
775         struct r5l_meta_block *block;
776
777         io = mempool_alloc(log->io_pool, GFP_ATOMIC);
778         if (!io)
779                 return NULL;
780         memset(io, 0, sizeof(*io));
781
782         io->log = log;
783         INIT_LIST_HEAD(&io->log_sibling);
784         INIT_LIST_HEAD(&io->stripe_list);
785         bio_list_init(&io->flush_barriers);
786         io->state = IO_UNIT_RUNNING;
787
788         io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
789         block = page_address(io->meta_page);
790         clear_page(block);
791         block->magic = cpu_to_le32(R5LOG_MAGIC);
792         block->version = R5LOG_VERSION;
793         block->seq = cpu_to_le64(log->seq);
794         block->position = cpu_to_le64(log->log_start);
795
796         io->log_start = log->log_start;
797         io->meta_offset = sizeof(struct r5l_meta_block);
798         io->seq = log->seq++;
799
800         io->current_bio = r5l_bio_alloc(log);
801         io->current_bio->bi_end_io = r5l_log_endio;
802         io->current_bio->bi_private = io;
803         bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
804
805         r5_reserve_log_entry(log, io);
806
807         spin_lock_irq(&log->io_list_lock);
808         list_add_tail(&io->log_sibling, &log->running_ios);
809         spin_unlock_irq(&log->io_list_lock);
810
811         return io;
812 }
813
814 static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
815 {
816         if (log->current_io &&
817             log->current_io->meta_offset + payload_size > PAGE_SIZE)
818                 r5l_submit_current_io(log);
819
820         if (!log->current_io) {
821                 log->current_io = r5l_new_meta(log);
822                 if (!log->current_io)
823                         return -ENOMEM;
824         }
825
826         return 0;
827 }
828
829 static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
830                                     sector_t location,
831                                     u32 checksum1, u32 checksum2,
832                                     bool checksum2_valid)
833 {
834         struct r5l_io_unit *io = log->current_io;
835         struct r5l_payload_data_parity *payload;
836
837         payload = page_address(io->meta_page) + io->meta_offset;
838         payload->header.type = cpu_to_le16(type);
839         payload->header.flags = cpu_to_le16(0);
840         payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
841                                     (PAGE_SHIFT - 9));
842         payload->location = cpu_to_le64(location);
843         payload->checksum[0] = cpu_to_le32(checksum1);
844         if (checksum2_valid)
845                 payload->checksum[1] = cpu_to_le32(checksum2);
846
847         io->meta_offset += sizeof(struct r5l_payload_data_parity) +
848                 sizeof(__le32) * (1 + !!checksum2_valid);
849 }
850
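/*
 * Worked example (illustration only): for a RAID6 parity payload
 * (checksum2_valid == true) with 4k pages, payload->size is
 * (1 + 1) << (PAGE_SHIFT - 9) = 2 << 3 = 16 sectors, and meta_offset
 * advances by sizeof(struct r5l_payload_data_parity) + 2 * sizeof(__le32).
 */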
851 static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
852 {
853         struct r5l_io_unit *io = log->current_io;
854
855         if (io->need_split_bio) {
856                 BUG_ON(io->split_bio);
857                 io->split_bio = io->current_bio;
858                 io->current_bio = r5l_bio_alloc(log);
859                 bio_chain(io->current_bio, io->split_bio);
860                 io->need_split_bio = false;
861         }
862
863         if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
864                 BUG();
865
866         r5_reserve_log_entry(log, io);
867 }
868
869 static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect)
870 {
871         struct mddev *mddev = log->rdev->mddev;
872         struct r5conf *conf = mddev->private;
873         struct r5l_io_unit *io;
874         struct r5l_payload_flush *payload;
875         int meta_size;
876
877         /*
878          * payload_flush requires extra writes to the journal.
879          * To avoid handling the extra IO in quiesce, just skip
880          * flush_payload
881          */
882         if (conf->quiesce)
883                 return;
884
885         mutex_lock(&log->io_mutex);
886         meta_size = sizeof(struct r5l_payload_flush) + sizeof(__le64);
887
888         if (r5l_get_meta(log, meta_size)) {
889                 mutex_unlock(&log->io_mutex);
890                 return;
891         }
892
893         /* current implementation is one stripe per flush payload */
894         io = log->current_io;
895         payload = page_address(io->meta_page) + io->meta_offset;
896         payload->header.type = cpu_to_le16(R5LOG_PAYLOAD_FLUSH);
897         payload->header.flags = cpu_to_le16(0);
898         payload->size = cpu_to_le32(sizeof(__le64));
899         payload->flush_stripes[0] = cpu_to_le64(sect);
900         io->meta_offset += meta_size;
901         /* multiple flush payloads count as one pending_stripe */
902         if (!io->has_flush_payload) {
903                 io->has_flush_payload = 1;
904                 atomic_inc(&io->pending_stripe);
905         }
906         mutex_unlock(&log->io_mutex);
907 }
908
909 static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
910                            int data_pages, int parity_pages)
911 {
912         int i;
913         int meta_size;
914         int ret;
915         struct r5l_io_unit *io;
916
917         meta_size =
918                 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
919                  * data_pages) +
920                 sizeof(struct r5l_payload_data_parity) +
921                 sizeof(__le32) * parity_pages;
922
923         ret = r5l_get_meta(log, meta_size);
924         if (ret)
925                 return ret;
926
927         io = log->current_io;
928
929         if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
930                 io->has_flush = 1;
931
932         for (i = 0; i < sh->disks; i++) {
933                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
934                     test_bit(R5_InJournal, &sh->dev[i].flags))
935                         continue;
936                 if (i == sh->pd_idx || i == sh->qd_idx)
937                         continue;
938                 if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
939                     log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
940                         io->has_fua = 1;
941                         /*
942                          * we need to flush journal to make sure recovery can
943                          * reach the data with fua flag
944                          */
945                         io->has_flush = 1;
946                 }
947                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
948                                         raid5_compute_blocknr(sh, i, 0),
949                                         sh->dev[i].log_checksum, 0, false);
950                 r5l_append_payload_page(log, sh->dev[i].page);
951         }
952
953         if (parity_pages == 2) {
954                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
955                                         sh->sector, sh->dev[sh->pd_idx].log_checksum,
956                                         sh->dev[sh->qd_idx].log_checksum, true);
957                 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
958                 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
959         } else if (parity_pages == 1) {
960                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
961                                         sh->sector, sh->dev[sh->pd_idx].log_checksum,
962                                         0, false);
963                 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
964         } else  /* Just writing data, not parity, in caching phase */
965                 BUG_ON(parity_pages != 0);
966
967         list_add_tail(&sh->log_list, &io->stripe_list);
968         atomic_inc(&io->pending_stripe);
969         sh->log_io = io;
970
971         if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
972                 return 0;
973
974         if (sh->log_start == MaxSector) {
975                 BUG_ON(!list_empty(&sh->r5c));
976                 sh->log_start = io->log_start;
977                 spin_lock_irq(&log->stripe_in_journal_lock);
978                 list_add_tail(&sh->r5c,
979                               &log->stripe_in_journal_list);
980                 spin_unlock_irq(&log->stripe_in_journal_lock);
981                 atomic_inc(&log->stripe_in_journal_count);
982         }
983         return 0;
984 }
985
986 /* add stripe to no_space_stripes, and then wake up reclaim */
987 static inline void r5l_add_no_space_stripe(struct r5l_log *log,
988                                            struct stripe_head *sh)
989 {
990         spin_lock(&log->no_space_stripes_lock);
991         list_add_tail(&sh->log_list, &log->no_space_stripes);
992         spin_unlock(&log->no_space_stripes_lock);
993 }
994
995 /*
996  * running in raid5d, where reclaim could wait for raid5d too (when it flushes
997  * data from log to raid disks), so we shouldn't wait for reclaim here
998  */
999 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
1000 {
1001         struct r5conf *conf = sh->raid_conf;
1002         int write_disks = 0;
1003         int data_pages, parity_pages;
1004         int reserve;
1005         int i;
1006         int ret = 0;
1007         bool wake_reclaim = false;
1008
1009         if (!log)
1010                 return -EAGAIN;
1011         /* Don't support stripe batch */
1012         if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
1013             test_bit(STRIPE_SYNCING, &sh->state)) {
1014                 /* the stripe is written to log, we start writing it to raid */
1015                 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
1016                 return -EAGAIN;
1017         }
1018
1019         WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
1020
1021         for (i = 0; i < sh->disks; i++) {
1022                 void *addr;
1023
1024                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
1025                     test_bit(R5_InJournal, &sh->dev[i].flags))
1026                         continue;
1027
1028                 write_disks++;
1029                 /* checksum is already calculated in last run */
1030                 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
1031                         continue;
1032                 addr = kmap_atomic(sh->dev[i].page);
1033                 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
1034                                                     addr, PAGE_SIZE);
1035                 kunmap_atomic(addr);
1036         }
1037         parity_pages = 1 + !!(sh->qd_idx >= 0);
1038         data_pages = write_disks - parity_pages;
1039
1040         set_bit(STRIPE_LOG_TRAPPED, &sh->state);
1041         /*
1042          * The stripe must enter state machine again to finish the write, so
1043          * don't delay.
1044          */
1045         clear_bit(STRIPE_DELAYED, &sh->state);
1046         atomic_inc(&sh->count);
1047
1048         mutex_lock(&log->io_mutex);
1049         /* meta + data */
1050         reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
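        /*
         * Worked example (illustration only): a full-stripe write on a
         * 6-disk array with 4k pages has write_disks = 6, so
         * reserve = (1 + 6) << (PAGE_SHIFT - 9) = 7 << 3 = 56 sectors:
         * one meta block plus six data/parity pages.
         */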
1051
1052         if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
1053                 if (!r5l_has_free_space(log, reserve)) {
1054                         r5l_add_no_space_stripe(log, sh);
1055                         wake_reclaim = true;
1056                 } else {
1057                         ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
1058                         if (ret) {
1059                                 spin_lock_irq(&log->io_list_lock);
1060                                 list_add_tail(&sh->log_list,
1061                                               &log->no_mem_stripes);
1062                                 spin_unlock_irq(&log->io_list_lock);
1063                         }
1064                 }
1065         } else {  /* R5C_JOURNAL_MODE_WRITE_BACK */
1066                 /*
1067                  * log space critical, do not process stripes that are
1068                  * not in cache yet (sh->log_start == MaxSector).
1069                  */
1070                 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
1071                     sh->log_start == MaxSector) {
1072                         r5l_add_no_space_stripe(log, sh);
1073                         wake_reclaim = true;
1074                         reserve = 0;
1075                 } else if (!r5l_has_free_space(log, reserve)) {
1076                         if (sh->log_start == log->last_checkpoint)
1077                                 BUG();
1078                         else
1079                                 r5l_add_no_space_stripe(log, sh);
1080                 } else {
1081                         ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
1082                         if (ret) {
1083                                 spin_lock_irq(&log->io_list_lock);
1084                                 list_add_tail(&sh->log_list,
1085                                               &log->no_mem_stripes);
1086                                 spin_unlock_irq(&log->io_list_lock);
1087                         }
1088                 }
1089         }
1090
1091         mutex_unlock(&log->io_mutex);
1092         if (wake_reclaim)
1093                 r5l_wake_reclaim(log, reserve);
1094         return 0;
1095 }
1096
1097 void r5l_write_stripe_run(struct r5l_log *log)
1098 {
1099         if (!log)
1100                 return;
1101         mutex_lock(&log->io_mutex);
1102         r5l_submit_current_io(log);
1103         mutex_unlock(&log->io_mutex);
1104 }
1105
1106 int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
1107 {
1108         if (!log)
1109                 return -ENODEV;
1110
1111         if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
1112                 /*
1113                  * in write through (journal only)
1114                  * we flush log disk cache first, then write stripe data to
1115                  * raid disks. So if bio is finished, the log disk cache is
1116                  * flushed already. The recovery path guarantees we can recover
1117                  * the bio from the log disk, so we don't need to flush again
1118                  */
1119                 if (bio->bi_iter.bi_size == 0) {
1120                         bio_endio(bio);
1121                         return 0;
1122                 }
1123                 bio->bi_opf &= ~REQ_PREFLUSH;
1124         } else {
1125                 /* write back (with cache) */
1126                 if (bio->bi_iter.bi_size == 0) {
1127                         mutex_lock(&log->io_mutex);
1128                         r5l_get_meta(log, 0);
1129                         bio_list_add(&log->current_io->flush_barriers, bio);
1130                         log->current_io->has_flush = 1;
1131                         log->current_io->has_null_flush = 1;
1132                         atomic_inc(&log->current_io->pending_stripe);
1133                         r5l_submit_current_io(log);
1134                         mutex_unlock(&log->io_mutex);
1135                         return 0;
1136                 }
1137         }
1138         return -EAGAIN;
1139 }
1140
1141 /* This will run after log space is reclaimed */
1142 static void r5l_run_no_space_stripes(struct r5l_log *log)
1143 {
1144         struct stripe_head *sh;
1145
1146         spin_lock(&log->no_space_stripes_lock);
1147         while (!list_empty(&log->no_space_stripes)) {
1148                 sh = list_first_entry(&log->no_space_stripes,
1149                                       struct stripe_head, log_list);
1150                 list_del_init(&sh->log_list);
1151                 set_bit(STRIPE_HANDLE, &sh->state);
1152                 raid5_release_stripe(sh);
1153         }
1154         spin_unlock(&log->no_space_stripes_lock);
1155 }
1156
1157 /*
1158  * calculate new last_checkpoint
1159  * for write through mode, returns log->next_checkpoint
1160  * for write back, returns log_start of first sh in stripe_in_journal_list
1161  */
1162 static sector_t r5c_calculate_new_cp(struct r5conf *conf)
1163 {
1164         struct stripe_head *sh;
1165         struct r5l_log *log = conf->log;
1166         sector_t new_cp;
1167         unsigned long flags;
1168
1169         if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1170                 return log->next_checkpoint;
1171
1172         spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1173         if (list_empty(&conf->log->stripe_in_journal_list)) {
1174                 /* all stripes flushed */
1175                 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1176                 return log->next_checkpoint;
1177         }
1178         sh = list_first_entry(&conf->log->stripe_in_journal_list,
1179                               struct stripe_head, r5c);
1180         new_cp = sh->log_start;
1181         spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1182         return new_cp;
1183 }
1184
1185 static sector_t r5l_reclaimable_space(struct r5l_log *log)
1186 {
1187         struct r5conf *conf = log->rdev->mddev->private;
1188
1189         return r5l_ring_distance(log, log->last_checkpoint,
1190                                  r5c_calculate_new_cp(conf));
1191 }
1192
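/*
 * Worked example (illustration only): in write-back mode with
 * device_size = 1024 sectors, last_checkpoint = 900 and the oldest cached
 * stripe's log_start = 100, the reclaimable space is
 * r5l_ring_distance(log, 900, 100) = 100 + 1024 - 900 = 224 sectors.
 */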
1193 static void r5l_run_no_mem_stripe(struct r5l_log *log)
1194 {
1195         struct stripe_head *sh;
1196
1197         assert_spin_locked(&log->io_list_lock);
1198
1199         if (!list_empty(&log->no_mem_stripes)) {
1200                 sh = list_first_entry(&log->no_mem_stripes,
1201                                       struct stripe_head, log_list);
1202                 list_del_init(&sh->log_list);
1203                 set_bit(STRIPE_HANDLE, &sh->state);
1204                 raid5_release_stripe(sh);
1205         }
1206 }
1207
1208 static bool r5l_complete_finished_ios(struct r5l_log *log)
1209 {
1210         struct r5l_io_unit *io, *next;
1211         bool found = false;
1212
1213         assert_spin_locked(&log->io_list_lock);
1214
1215         list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
1216                 /* don't change list order */
1217                 if (io->state < IO_UNIT_STRIPE_END)
1218                         break;
1219
1220                 log->next_checkpoint = io->log_start;
1221
1222                 list_del(&io->log_sibling);
1223                 mempool_free(io, log->io_pool);
1224                 r5l_run_no_mem_stripe(log);
1225
1226                 found = true;
1227         }
1228
1229         return found;
1230 }
1231
1232 static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
1233 {
1234         struct r5l_log *log = io->log;
1235         struct r5conf *conf = log->rdev->mddev->private;
1236         unsigned long flags;
1237
1238         spin_lock_irqsave(&log->io_list_lock, flags);
1239         __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
1240
1241         if (!r5l_complete_finished_ios(log)) {
1242                 spin_unlock_irqrestore(&log->io_list_lock, flags);
1243                 return;
1244         }
1245
1246         if (r5l_reclaimable_space(log) > log->max_free_space ||
1247             test_bit(R5C_LOG_TIGHT, &conf->cache_state))
1248                 r5l_wake_reclaim(log, 0);
1249
1250         spin_unlock_irqrestore(&log->io_list_lock, flags);
1251         wake_up(&log->iounit_wait);
1252 }
1253
1254 void r5l_stripe_write_finished(struct stripe_head *sh)
1255 {
1256         struct r5l_io_unit *io;
1257
1258         io = sh->log_io;
1259         sh->log_io = NULL;
1260
1261         if (io && atomic_dec_and_test(&io->pending_stripe))
1262                 __r5l_stripe_write_finished(io);
1263 }
1264
1265 static void r5l_log_flush_endio(struct bio *bio)
1266 {
1267         struct r5l_log *log = container_of(bio, struct r5l_log,
1268                 flush_bio);
1269         unsigned long flags;
1270         struct r5l_io_unit *io;
1271
1272         if (bio->bi_status)
1273                 md_error(log->rdev->mddev, log->rdev);
1274
1275         spin_lock_irqsave(&log->io_list_lock, flags);
1276         list_for_each_entry(io, &log->flushing_ios, log_sibling)
1277                 r5l_io_run_stripes(io);
1278         list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
1279         spin_unlock_irqrestore(&log->io_list_lock, flags);
1280 }
1281
1282 /*
1283  * Start dispatching IO to raid.
1284  * The log consists of io_units, each led by a meta block. There is one
1285  * situation we want to avoid: a broken meta block in the middle of the log
1286  * keeps recovery from finding the meta blocks at the head of the log. If an
1287  * operation requires the meta block at the head to be persistent in the
1288  * log, we must make sure the meta blocks before it are persistent too.
1289  *
1290  * A case is: stripe data/parity is in the log and we start writing the
1291  * stripe to the raid disks; the data/parity must be persistent in the log
1292  * before we do the write to the raid disks. The solution is to strictly
1293  * maintain io_unit list order: we only write the stripes of an io_unit to
1294  * the raid disks once it is the first one whose data/parity is in the log.
1295  */
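/*
 * Example (illustration only): if io_units A, B and C were appended in that
 * order and B's bio completes first, B stays on running_ios because
 * r5l_log_run_stripes() stops at the first io_unit that has not reached
 * IO_UNIT_IO_END; B's stripes are only moved on towards the raid disks after
 * A has also been persisted in the log.
 */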
1296 void r5l_flush_stripe_to_raid(struct r5l_log *log)
1297 {
1298         bool do_flush;
1299
1300         if (!log || !log->need_cache_flush)
1301                 return;
1302
1303         spin_lock_irq(&log->io_list_lock);
1304         /* flush bio is running */
1305         if (!list_empty(&log->flushing_ios)) {
1306                 spin_unlock_irq(&log->io_list_lock);
1307                 return;
1308         }
1309         list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
1310         do_flush = !list_empty(&log->flushing_ios);
1311         spin_unlock_irq(&log->io_list_lock);
1312
1313         if (!do_flush)
1314                 return;
1315         bio_reset(&log->flush_bio);
1316         bio_set_dev(&log->flush_bio, log->rdev->bdev);
1317         log->flush_bio.bi_end_io = r5l_log_flush_endio;
1318         log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
1319         submit_bio(&log->flush_bio);
1320 }
1321
1322 static void r5l_write_super(struct r5l_log *log, sector_t cp);
1323 static void r5l_write_super_and_discard_space(struct r5l_log *log,
1324         sector_t end)
1325 {
1326         struct block_device *bdev = log->rdev->bdev;
1327         struct mddev *mddev;
1328
1329         r5l_write_super(log, end);
1330
1331         if (!blk_queue_discard(bdev_get_queue(bdev)))
1332                 return;
1333
1334         mddev = log->rdev->mddev;
1335         /*
1336          * Discard could zero data, so before discard we must make sure
1337          * superblock is updated to new log tail. Updating superblock (either
1338          * directly call md_update_sb() or depend on md thread) must hold
1339  * reconfig mutex. On the other hand, raid5_quiesce is called with the
1340  * reconfig_mutex held. The first step of raid5_quiesce() is waiting
1341  * for all IO to finish, hence waiting for the reclaim thread, while the
1342  * reclaim thread is calling this function and waiting for the reconfig
1343  * mutex. So there is a deadlock. We work around this issue with a trylock.
1344          * FIXME: we could miss discard if we can't take reconfig mutex
1345          */
1346         set_mask_bits(&mddev->sb_flags, 0,
1347                 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1348         if (!mddev_trylock(mddev))
1349                 return;
1350         md_update_sb(mddev, 1);
1351         mddev_unlock(mddev);
1352
1353         /* discard IO error really doesn't matter, ignore it */
1354         if (log->last_checkpoint < end) {
1355                 blkdev_issue_discard(bdev,
1356                                 log->last_checkpoint + log->rdev->data_offset,
1357                                 end - log->last_checkpoint, GFP_NOIO, 0);
1358         } else {
1359                 blkdev_issue_discard(bdev,
1360                                 log->last_checkpoint + log->rdev->data_offset,
1361                                 log->device_size - log->last_checkpoint,
1362                                 GFP_NOIO, 0);
1363                 blkdev_issue_discard(bdev, log->rdev->data_offset, end,
1364                                 GFP_NOIO, 0);
1365         }
1366 }
1367
1368 /*
1369  * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
1370  * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
1371  *
1372  * must hold conf->device_lock
1373  */
1374 static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
1375 {
1376         BUG_ON(list_empty(&sh->lru));
1377         BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1378         BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
1379
1380         /*
1381          * The stripe is not ON_RELEASE_LIST, so it is safe to call
1382          * raid5_release_stripe() while holding conf->device_lock
1383          */
1384         BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
1385         assert_spin_locked(&conf->device_lock);
1386
1387         list_del_init(&sh->lru);
1388         atomic_inc(&sh->count);
1389
1390         set_bit(STRIPE_HANDLE, &sh->state);
1391         atomic_inc(&conf->active_stripes);
1392         r5c_make_stripe_write_out(sh);
1393
1394         if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
1395                 atomic_inc(&conf->r5c_flushing_partial_stripes);
1396         else
1397                 atomic_inc(&conf->r5c_flushing_full_stripes);
1398         raid5_release_stripe(sh);
1399 }
1400
1401 /*
1402  * if num == 0, flush all full stripes
1403  * if num > 0, flush all full stripes. If fewer than num full stripes are
1404  *             flushed, flush some partial stripes until a total of num stripes
1405  *             are flushed or there are no more cached stripes.
1406  */
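/*
 * For illustration, the caller in r5c_do_reclaim() below ends up doing either
 * r5c_flush_cache(conf, 0) under moderate cache pressure (flush full stripes
 * only) or r5c_flush_cache(conf, R5C_RECLAIM_STRIPE_GROUP) under high pressure
 * (full stripes plus some partial stripes).
 */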
1407 void r5c_flush_cache(struct r5conf *conf, int num)
1408 {
1409         int count;
1410         struct stripe_head *sh, *next;
1411
1412         assert_spin_locked(&conf->device_lock);
1413         if (!conf->log)
1414                 return;
1415
1416         count = 0;
1417         list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
1418                 r5c_flush_stripe(conf, sh);
1419                 count++;
1420         }
1421
1422         if (count >= num)
1423                 return;
1424         list_for_each_entry_safe(sh, next,
1425                                  &conf->r5c_partial_stripe_list, lru) {
1426                 r5c_flush_stripe(conf, sh);
1427                 if (++count >= num)
1428                         break;
1429         }
1430 }
1431
1432 static void r5c_do_reclaim(struct r5conf *conf)
1433 {
1434         struct r5l_log *log = conf->log;
1435         struct stripe_head *sh;
1436         int count = 0;
1437         unsigned long flags;
1438         int total_cached;
1439         int stripes_to_flush;
1440         int flushing_partial, flushing_full;
1441
1442         if (!r5c_is_writeback(log))
1443                 return;
1444
1445         flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes);
1446         flushing_full = atomic_read(&conf->r5c_flushing_full_stripes);
1447         total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
1448                 atomic_read(&conf->r5c_cached_full_stripes) -
1449                 flushing_full - flushing_partial;
1450
1451         if (total_cached > conf->min_nr_stripes * 3 / 4 ||
1452             atomic_read(&conf->empty_inactive_list_nr) > 0)
1453                 /*
1454                  * if stripe cache pressure is high, flush all full stripes and
1455                  * some partial stripes
1456                  */
1457                 stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
1458         else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
1459                  atomic_read(&conf->r5c_cached_full_stripes) - flushing_full >
1460                  R5C_FULL_STRIPE_FLUSH_BATCH(conf))
1461                 /*
1462                  * if stripe cache pressure is moderate, or if there are many full
1463                  * stripes, flush all full stripes
1464                  */
1465                 stripes_to_flush = 0;
1466         else
1467                 /* no need to flush */
1468                 stripes_to_flush = -1;
1469
1470         if (stripes_to_flush >= 0) {
1471                 spin_lock_irqsave(&conf->device_lock, flags);
1472                 r5c_flush_cache(conf, stripes_to_flush);
1473                 spin_unlock_irqrestore(&conf->device_lock, flags);
1474         }
1475
1476         /* if log space is tight, flush stripes on stripe_in_journal_list */
1477         if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
1478                 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1479                 spin_lock(&conf->device_lock);
1480                 list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
1481                         /*
1482                          * stripes on stripe_in_journal_list could be in any
1483                          * state of the stripe_cache state machine. In this
1484                          * case, we only want to flush stripes on
1485                          * r5c_cached_full/partial_stripes. The following
1486                          * condition makes sure the stripe is on one of the
1487                          * two lists.
1488                          */
1489                         if (!list_empty(&sh->lru) &&
1490                             !test_bit(STRIPE_HANDLE, &sh->state) &&
1491                             atomic_read(&sh->count) == 0) {
1492                                 r5c_flush_stripe(conf, sh);
1493                                 if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
1494                                         break;
1495                         }
1496                 }
1497                 spin_unlock(&conf->device_lock);
1498                 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1499         }
1500
1501         if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
1502                 r5l_run_no_space_stripes(log);
1503
1504         md_wakeup_thread(conf->mddev->thread);
1505 }
1506
1507 static void r5l_do_reclaim(struct r5l_log *log)
1508 {
1509         struct r5conf *conf = log->rdev->mddev->private;
1510         sector_t reclaim_target = xchg(&log->reclaim_target, 0);
1511         sector_t reclaimable;
1512         sector_t next_checkpoint;
1513         bool write_super;
1514
1515         spin_lock_irq(&log->io_list_lock);
1516         write_super = r5l_reclaimable_space(log) > log->max_free_space ||
1517                 reclaim_target != 0 || !list_empty(&log->no_space_stripes);
1518         /*
1519          * move the proper io_units to the reclaim list. We should not change the
1520          * order: reclaimable and unreclaimable io_units can be mixed in the list,
1521          * and we shouldn't reuse the space of an unreclaimable io_unit
1522          */
1523         while (1) {
1524                 reclaimable = r5l_reclaimable_space(log);
1525                 if (reclaimable >= reclaim_target ||
1526                     (list_empty(&log->running_ios) &&
1527                      list_empty(&log->io_end_ios) &&
1528                      list_empty(&log->flushing_ios) &&
1529                      list_empty(&log->finished_ios)))
1530                         break;
1531
1532                 md_wakeup_thread(log->rdev->mddev->thread);
1533                 wait_event_lock_irq(log->iounit_wait,
1534                                     r5l_reclaimable_space(log) > reclaimable,
1535                                     log->io_list_lock);
1536         }
1537
1538         next_checkpoint = r5c_calculate_new_cp(conf);
1539         spin_unlock_irq(&log->io_list_lock);
1540
1541         if (reclaimable == 0 || !write_super)
1542                 return;
1543
1544         /*
1545          * write_super will flush the cache of each raid disk. We must write the
1546          * super here, because the log area might be reused soon and we don't want
1547          * to confuse recovery
1548          */
1549         r5l_write_super_and_discard_space(log, next_checkpoint);
1550
1551         mutex_lock(&log->io_mutex);
1552         log->last_checkpoint = next_checkpoint;
1553         r5c_update_log_state(log);
1554         mutex_unlock(&log->io_mutex);
1555
1556         r5l_run_no_space_stripes(log);
1557 }
1558
1559 static void r5l_reclaim_thread(struct md_thread *thread)
1560 {
1561         struct mddev *mddev = thread->mddev;
1562         struct r5conf *conf = mddev->private;
1563         struct r5l_log *log = conf->log;
1564
1565         if (!log)
1566                 return;
1567         r5c_do_reclaim(conf);
1568         r5l_do_reclaim(log);
1569 }
1570
1571 void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
1572 {
1573         unsigned long target;
1574         unsigned long new = (unsigned long)space; /* overflow in theory */
1575
1576         if (!log)
1577                 return;
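        /*
         * Only ever raise the reclaim target: a request smaller than the
         * current target is dropped, and the cmpxchg() loop retries whenever
         * reclaim_target changed under us (e.g. another caller raced us or the
         * reclaim thread consumed the target).
         */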
1578         do {
1579                 target = log->reclaim_target;
1580                 if (new < target)
1581                         return;
1582         } while (cmpxchg(&log->reclaim_target, target, new) != target);
1583         md_wakeup_thread(log->reclaim_thread);
1584 }
1585
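/*
 * Quiesce hook for the log; the state values follow raid5_quiesce(): state 0
 * resumes by unparking the reclaim thread, state 1 suspends by parking it and
 * then running a final reclaim pass synchronously, and state 2 is a no-op here.
 */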
1586 void r5l_quiesce(struct r5l_log *log, int state)
1587 {
1588         struct mddev *mddev;
1589         if (!log || state == 2)
1590                 return;
1591         if (state == 0)
1592                 kthread_unpark(log->reclaim_thread->tsk);
1593         else if (state == 1) {
1594                 /* make sure r5l_write_super_and_discard_space exits */
1595                 mddev = log->rdev->mddev;
1596                 wake_up(&mddev->sb_wait);
1597                 kthread_park(log->reclaim_thread->tsk);
1598                 r5l_wake_reclaim(log, MaxSector);
1599                 r5l_do_reclaim(log);
1600         }
1601 }
1602
1603 bool r5l_log_disk_error(struct r5conf *conf)
1604 {
1605         struct r5l_log *log;
1606         bool ret;
1607         /* don't allow write if journal disk is missing */
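        /*
         * If the array was created with a journal (MD_HAS_JOURNAL) but no log
         * is currently attached, report an error so writes are refused;
         * otherwise just report whether the journal rdev itself is Faulty.
         */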
1608         rcu_read_lock();
1609         log = rcu_dereference(conf->log);
1610
1611         if (!log)
1612                 ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1613         else
1614                 ret = test_bit(Faulty, &log->rdev->flags);
1615         rcu_read_unlock();
1616         return ret;
1617 }
1618
1619 #define R5L_RECOVERY_PAGE_POOL_SIZE 256
1620
1621 struct r5l_recovery_ctx {
1622         struct page *meta_page;         /* current meta */
1623         sector_t meta_total_blocks;     /* total size of current meta and data */
1624         sector_t pos;                   /* recovery position */
1625         u64 seq;                        /* recovery position seq */
1626         int data_parity_stripes;        /* number of data_parity stripes */
1627         int data_only_stripes;          /* number of data_only stripes */
1628         struct list_head cached_list;
1629
1630         /*
1631          * read ahead page pool (ra_pool)
1632          * In recovery, the log is read sequentially. It is not efficient to
1633          * read every page with sync_page_io(). The read ahead page pool
1634          * reads multiple pages with one IO, so further log reads can
1635          * just copy data from the pool.
1636          */
1637         struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE];
1638         sector_t pool_offset;   /* offset of first page in the pool */
1639         int total_pages;        /* total allocated pages */
1640         int valid_pages;        /* pages with valid data */
1641         struct bio *ra_bio;     /* bio to do the read ahead */
1642 };
1643
1644 static int r5l_recovery_allocate_ra_pool(struct r5l_log *log,
1645                                             struct r5l_recovery_ctx *ctx)
1646 {
1647         struct page *page;
1648
1649         ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, log->bs);
1650         if (!ctx->ra_bio)
1651                 return -ENOMEM;
1652
1653         ctx->valid_pages = 0;
1654         ctx->total_pages = 0;
1655         while (ctx->total_pages < R5L_RECOVERY_PAGE_POOL_SIZE) {
1656                 page = alloc_page(GFP_KERNEL);
1657
1658                 if (!page)
1659                         break;
1660                 ctx->ra_pool[ctx->total_pages] = page;
1661                 ctx->total_pages += 1;
1662         }
1663
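        /*
         * A partially filled pool is acceptable; read ahead simply covers
         * fewer blocks per IO. Only a completely empty pool is treated as an
         * allocation failure.
         */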
1664         if (ctx->total_pages == 0) {
1665                 bio_put(ctx->ra_bio);
1666                 return -ENOMEM;
1667         }
1668
1669         ctx->pool_offset = 0;
1670         return 0;
1671 }
1672
1673 static void r5l_recovery_free_ra_pool(struct r5l_log *log,
1674                                         struct r5l_recovery_ctx *ctx)
1675 {
1676         int i;
1677
1678         for (i = 0; i < ctx->total_pages; ++i)
1679                 put_page(ctx->ra_pool[i]);
1680         bio_put(ctx->ra_bio);
1681 }
1682
1683 /*
1684  * fetch up to ctx->total_pages pages starting at offset into the ra_pool.
1685  * In normal cases, ctx->valid_pages == ctx->total_pages after the call.
1686  * However, if the offset is close to the end of the journal device,
1687  * ctx->valid_pages could be smaller than ctx->total_pages
1688  */
1689 static int r5l_recovery_fetch_ra_pool(struct r5l_log *log,
1690                                       struct r5l_recovery_ctx *ctx,
1691                                       sector_t offset)
1692 {
1693         bio_reset(ctx->ra_bio);
1694         bio_set_dev(ctx->ra_bio, log->rdev->bdev);
1695         bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0);
1696         ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset;
1697
1698         ctx->valid_pages = 0;
1699         ctx->pool_offset = offset;
1700
1701         while (ctx->valid_pages < ctx->total_pages) {
1702                 bio_add_page(ctx->ra_bio,
1703                              ctx->ra_pool[ctx->valid_pages], PAGE_SIZE, 0);
1704                 ctx->valid_pages += 1;
1705
1706                 offset = r5l_ring_add(log, offset, BLOCK_SECTORS);
1707
1708                 if (offset == 0)  /* reached end of the device */
1709                         break;
1710         }
1711
1712         return submit_bio_wait(ctx->ra_bio);
1713 }
1714
1715 /*
1716  * try to read a page from the read ahead page pool; if the page is not in
1717  * the pool, call r5l_recovery_fetch_ra_pool to refill the pool first
1718  */
1719 static int r5l_recovery_read_page(struct r5l_log *log,
1720                                   struct r5l_recovery_ctx *ctx,
1721                                   struct page *page,
1722                                   sector_t offset)
1723 {
1724         int ret;
1725
1726         if (offset < ctx->pool_offset ||
1727             offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS) {
1728                 ret = r5l_recovery_fetch_ra_pool(log, ctx, offset);
1729                 if (ret)
1730                         return ret;
1731         }
1732
1733         BUG_ON(offset < ctx->pool_offset ||
1734                offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS);
1735
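        /*
         * offset now falls inside the cached window. Each pool page covers
         * BLOCK_SECTORS (8) sectors, so the sector delta from pool_offset,
         * shifted right by BLOCK_SECTOR_SHIFT, selects the right pool page.
         */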
1736         memcpy(page_address(page),
1737                page_address(ctx->ra_pool[(offset - ctx->pool_offset) >>
1738                                          BLOCK_SECTOR_SHIFT]),
1739                PAGE_SIZE);
1740         return 0;
1741 }
1742
1743 static int r5l_recovery_read_meta_block(struct r5l_log *log,
1744                                         struct r5l_recovery_ctx *ctx)
1745 {
1746         struct page *page = ctx->meta_page;
1747         struct r5l_meta_block *mb;
1748         u32 crc, stored_crc;
1749         int ret;
1750
1751         ret = r5l_recovery_read_page(log, ctx, page, ctx->pos);
1752         if (ret != 0)
1753                 return ret;
1754
1755         mb = page_address(page);
1756         stored_crc = le32_to_cpu(mb->checksum);
1757         mb->checksum = 0;
1758
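        /*
         * A meta block is accepted only if magic, sequence number, version,
         * on-disk position, CRC and meta_size all check out; any mismatch is
         * reported as -EINVAL and the caller treats it as the end of the
         * valid log.
         */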
1759         if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1760             le64_to_cpu(mb->seq) != ctx->seq ||
1761             mb->version != R5LOG_VERSION ||
1762             le64_to_cpu(mb->position) != ctx->pos)
1763                 return -EINVAL;
1764
1765         crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1766         if (stored_crc != crc)
1767                 return -EINVAL;
1768
1769         if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
1770                 return -EINVAL;
1771
1772         ctx->meta_total_blocks = BLOCK_SECTORS;
1773
1774         return 0;
1775 }
1776
1777 static void
1778 r5l_recovery_create_empty_meta_block(struct r5l_log *log,
1779                                      struct page *page,
1780                                      sector_t pos, u64 seq)
1781 {
1782         struct r5l_meta_block *mb;
1783
1784         mb = page_address(page);
1785         clear_page(mb);
1786         mb->magic = cpu_to_le32(R5LOG_MAGIC);
1787         mb->version = R5LOG_VERSION;
1788         mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1789         mb->seq = cpu_to_le64(seq);
1790         mb->position = cpu_to_le64(pos);
1791 }
1792
1793 static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
1794                                           u64 seq)
1795 {
1796         struct page *page;
1797         struct r5l_meta_block *mb;
1798
1799         page = alloc_page(GFP_KERNEL);
1800         if (!page)
1801                 return -ENOMEM;
1802         r5l_recovery_create_empty_meta_block(log, page, pos, seq);
1803         mb = page_address(page);
1804         mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
1805                                              mb, PAGE_SIZE));
1806         if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
1807                           REQ_SYNC | REQ_FUA, false)) {
1808                 __free_page(page);
1809                 return -EIO;
1810         }
1811         __free_page(page);
1812         return 0;
1813 }
1814
1815 /*
1816  * r5l_recovery_load_data and r5l_recovery_load_parity use the flag R5_Wantwrite
1817  * to mark valid (potentially not flushed) data in the journal.
1818  *
1819  * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb,
1820  * so there should not be any mismatch here.
1821  */
1822 static void r5l_recovery_load_data(struct r5l_log *log,
1823                                    struct stripe_head *sh,
1824                                    struct r5l_recovery_ctx *ctx,
1825                                    struct r5l_payload_data_parity *payload,
1826                                    sector_t log_offset)
1827 {
1828         struct mddev *mddev = log->rdev->mddev;
1829         struct r5conf *conf = mddev->private;
1830         int dd_idx;
1831
1832         raid5_compute_sector(conf,
1833                              le64_to_cpu(payload->location), 0,
1834                              &dd_idx, sh);
1835         r5l_recovery_read_page(log, ctx, sh->dev[dd_idx].page, log_offset);
1836         sh->dev[dd_idx].log_checksum =
1837                 le32_to_cpu(payload->checksum[0]);
1838         ctx->meta_total_blocks += BLOCK_SECTORS;
1839
1840         set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
1841         set_bit(STRIPE_R5C_CACHING, &sh->state);
1842 }
1843
1844 static void r5l_recovery_load_parity(struct r5l_log *log,
1845                                      struct stripe_head *sh,
1846                                      struct r5l_recovery_ctx *ctx,
1847                                      struct r5l_payload_data_parity *payload,
1848                                      sector_t log_offset)
1849 {
1850         struct mddev *mddev = log->rdev->mddev;
1851         struct r5conf *conf = mddev->private;
1852
1853         ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
1854         r5l_recovery_read_page(log, ctx, sh->dev[sh->pd_idx].page, log_offset);
1855         sh->dev[sh->pd_idx].log_checksum =
1856                 le32_to_cpu(payload->checksum[0]);
1857         set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
1858
1859         if (sh->qd_idx >= 0) {
1860                 r5l_recovery_read_page(
1861                         log, ctx, sh->dev[sh->qd_idx].page,
1862                         r5l_ring_add(log, log_offset, BLOCK_SECTORS));
1863                 sh->dev[sh->qd_idx].log_checksum =
1864                         le32_to_cpu(payload->checksum[1]);
1865                 set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
1866         }
1867         clear_bit(STRIPE_R5C_CACHING, &sh->state);
1868 }
1869
1870 static void r5l_recovery_reset_stripe(struct stripe_head *sh)
1871 {
1872         int i;
1873
1874         sh->state = 0;
1875         sh->log_start = MaxSector;
1876         for (i = sh->disks; i--; )
1877                 sh->dev[i].flags = 0;
1878 }
1879
1880 static void
1881 r5l_recovery_replay_one_stripe(struct r5conf *conf,
1882                                struct stripe_head *sh,
1883                                struct r5l_recovery_ctx *ctx)
1884 {
1885         struct md_rdev *rdev, *rrdev;
1886         int disk_index;
1887         int data_count = 0;
1888
1889         for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1890                 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1891                         continue;
1892                 if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
1893                         continue;
1894                 data_count++;
1895         }
1896
1897         /*
1898          * stripes that only have parity must have been flushed
1899          * before the crash that we are now recovering from, so
1900          * there is nothing more to recover.
1901          */
1902         if (data_count == 0)
1903                 goto out;
1904
1905         for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1906                 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1907                         continue;
1908
1909                 /* in case device is broken */
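                /*
                 * Take a reference (nr_pending) under RCU so the rdev cannot
                 * disappear while we drop rcu_read_lock() around the
                 * synchronous write, then do the same for the replacement.
                 */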
1910                 rcu_read_lock();
1911                 rdev = rcu_dereference(conf->disks[disk_index].rdev);
1912                 if (rdev) {
1913                         atomic_inc(&rdev->nr_pending);
1914                         rcu_read_unlock();
1915                         sync_page_io(rdev, sh->sector, PAGE_SIZE,
1916                                      sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1917                                      false);
1918                         rdev_dec_pending(rdev, rdev->mddev);
1919                         rcu_read_lock();
1920                 }
1921                 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
1922                 if (rrdev) {
1923                         atomic_inc(&rrdev->nr_pending);
1924                         rcu_read_unlock();
1925                         sync_page_io(rrdev, sh->sector, PAGE_SIZE,
1926                                      sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1927                                      false);
1928                         rdev_dec_pending(rrdev, rrdev->mddev);
1929                         rcu_read_lock();
1930                 }
1931                 rcu_read_unlock();
1932         }
1933         ctx->data_parity_stripes++;
1934 out:
1935         r5l_recovery_reset_stripe(sh);
1936 }
1937
1938 static struct stripe_head *
1939 r5c_recovery_alloc_stripe(struct r5conf *conf,
1940                           sector_t stripe_sect)
1941 {
1942         struct stripe_head *sh;
1943
1944         sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0);
1945         if (!sh)
1946                 return NULL;  /* no more stripe available */
1947
1948         r5l_recovery_reset_stripe(sh);
1949
1950         return sh;
1951 }
1952
1953 static struct stripe_head *
1954 r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
1955 {
1956         struct stripe_head *sh;
1957
1958         list_for_each_entry(sh, list, lru)
1959                 if (sh->sector == sect)
1960                         return sh;
1961         return NULL;
1962 }
1963
1964 static void
1965 r5c_recovery_drop_stripes(struct list_head *cached_stripe_list,
1966                           struct r5l_recovery_ctx *ctx)
1967 {
1968         struct stripe_head *sh, *next;
1969
1970         list_for_each_entry_safe(sh, next, cached_stripe_list, lru) {
1971                 r5l_recovery_reset_stripe(sh);
1972                 list_del_init(&sh->lru);
1973                 raid5_release_stripe(sh);
1974         }
1975 }
1976
1977 static void
1978 r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
1979                             struct r5l_recovery_ctx *ctx)
1980 {
1981         struct stripe_head *sh, *next;
1982
1983         list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
1984                 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
1985                         r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
1986                         list_del_init(&sh->lru);
1987                         raid5_release_stripe(sh);
1988                 }
1989 }
1990
1991 /* if matches return 0; otherwise return -EINVAL */
1992 static int
1993 r5l_recovery_verify_data_checksum(struct r5l_log *log,
1994                                   struct r5l_recovery_ctx *ctx,
1995                                   struct page *page,
1996                                   sector_t log_offset, __le32 log_checksum)
1997 {
1998         void *addr;
1999         u32 checksum;
2000
2001         r5l_recovery_read_page(log, ctx, page, log_offset);
2002         addr = kmap_atomic(page);
2003         checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
2004         kunmap_atomic(addr);
2005         return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
2006 }
2007
2008 /*
2009  * Before loading data into the stripe cache, we need to verify the checksums
2010  * of all data; if any data page mismatches, we drop all data in the meta block
2011  */
2012 static int
2013 r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
2014                                          struct r5l_recovery_ctx *ctx)
2015 {
2016         struct mddev *mddev = log->rdev->mddev;
2017         struct r5conf *conf = mddev->private;
2018         struct r5l_meta_block *mb = page_address(ctx->meta_page);
2019         sector_t mb_offset = sizeof(struct r5l_meta_block);
2020         sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2021         struct page *page;
2022         struct r5l_payload_data_parity *payload;
2023         struct r5l_payload_flush *payload_flush;
2024
2025         page = alloc_page(GFP_KERNEL);
2026         if (!page)
2027                 return -ENOMEM;
2028
2029         while (mb_offset < le32_to_cpu(mb->meta_size)) {
2030                 payload = (void *)mb + mb_offset;
2031                 payload_flush = (void *)mb + mb_offset;
2032
2033                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
2034                         if (r5l_recovery_verify_data_checksum(
2035                                     log, ctx, page, log_offset,
2036                                     payload->checksum[0]) < 0)
2037                                 goto mismatch;
2038                 } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) {
2039                         if (r5l_recovery_verify_data_checksum(
2040                                     log, ctx, page, log_offset,
2041                                     payload->checksum[0]) < 0)
2042                                 goto mismatch;
2043                         if (conf->max_degraded == 2 && /* q for RAID 6 */
2044                             r5l_recovery_verify_data_checksum(
2045                                     log, ctx, page,
2046                                     r5l_ring_add(log, log_offset,
2047                                                  BLOCK_SECTORS),
2048                                     payload->checksum[1]) < 0)
2049                                 goto mismatch;
2050                 } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
2051                         /* nothing to do for R5LOG_PAYLOAD_FLUSH here */
2052                 } else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */
2053                         goto mismatch;
2054
2055                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
2056                         mb_offset += sizeof(struct r5l_payload_flush) +
2057                                 le32_to_cpu(payload_flush->size);
2058                 } else {
2059                         /* DATA or PARITY payload */
2060                         log_offset = r5l_ring_add(log, log_offset,
2061                                                   le32_to_cpu(payload->size));
2062                         mb_offset += sizeof(struct r5l_payload_data_parity) +
2063                                 sizeof(__le32) *
2064                                 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
2065                 }
2066
2067         }
2068
2069         put_page(page);
2070         return 0;
2071
2072 mismatch:
2073         put_page(page);
2074         return -EINVAL;
2075 }
2076
2077 /*
2078  * Analyze all data/parity pages in one meta block
2079  * Returns:
2080  * 0 for success
2081  * -EINVAL for unknown payload type
2082  * -EAGAIN for checksum mismatch of a data page
2083  * -ENOMEM for running out of memory (alloc_page failed or ran out of stripes)
2084  */
2085 static int
2086 r5c_recovery_analyze_meta_block(struct r5l_log *log,
2087                                 struct r5l_recovery_ctx *ctx,
2088                                 struct list_head *cached_stripe_list)
2089 {
2090         struct mddev *mddev = log->rdev->mddev;
2091         struct r5conf *conf = mddev->private;
2092         struct r5l_meta_block *mb;
2093         struct r5l_payload_data_parity *payload;
2094         struct r5l_payload_flush *payload_flush;
2095         int mb_offset;
2096         sector_t log_offset;
2097         sector_t stripe_sect;
2098         struct stripe_head *sh;
2099         int ret;
2100
2101         /*
2102          * for a mismatch in the data blocks, we will drop all data in this mb,
2103          * but we will still read the next mb for other data with the FLUSH flag,
2104          * as io_units could finish out of order.
2105          */
2106         ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
2107         if (ret == -EINVAL)
2108                 return -EAGAIN;
2109         else if (ret)
2110                 return ret;   /* -ENOMEM due to alloc_page() failure */
2111
2112         mb = page_address(ctx->meta_page);
2113         mb_offset = sizeof(struct r5l_meta_block);
2114         log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2115
2116         while (mb_offset < le32_to_cpu(mb->meta_size)) {
2117                 int dd;
2118
2119                 payload = (void *)mb + mb_offset;
2120                 payload_flush = (void *)mb + mb_offset;
2121
2122                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
2123                         int i, count;
2124
2125                         count = le32_to_cpu(payload_flush->size) / sizeof(__le64);
2126                         for (i = 0; i < count; ++i) {
2127                                 stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]);
2128                                 sh = r5c_recovery_lookup_stripe(cached_stripe_list,
2129                                                                 stripe_sect);
2130                                 if (sh) {
2131                                         WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
2132                                         r5l_recovery_reset_stripe(sh);
2133                                         list_del_init(&sh->lru);
2134                                         raid5_release_stripe(sh);
2135                                 }
2136                         }
2137
2138                         mb_offset += sizeof(struct r5l_payload_flush) +
2139                                 le32_to_cpu(payload_flush->size);
2140                         continue;
2141                 }
2142
2143                 /* DATA or PARITY payload */
2144                 stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
2145                         raid5_compute_sector(
2146                                 conf, le64_to_cpu(payload->location), 0, &dd,
2147                                 NULL)
2148                         : le64_to_cpu(payload->location);
2149
2150                 sh = r5c_recovery_lookup_stripe(cached_stripe_list,
2151                                                 stripe_sect);
2152
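                /*
                 * No cached stripe for this sector yet. If allocation fails,
                 * escalate: first replay and release the cached data-parity
                 * stripes, then grow the stripe cache, and only give up with
                 * -ENOMEM when both of those fail.
                 */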
2153                 if (!sh) {
2154                         sh = r5c_recovery_alloc_stripe(conf, stripe_sect);
2155                         /*
2156                          * cannot get stripe from raid5_get_active_stripe
2157                          * try replay some stripes
2158                          */
2159                         if (!sh) {
2160                                 r5c_recovery_replay_stripes(
2161                                         cached_stripe_list, ctx);
2162                                 sh = r5c_recovery_alloc_stripe(
2163                                         conf, stripe_sect);
2164                         }
2165                         if (!sh) {
2166                                 pr_debug("md/raid:%s: Increasing stripe cache size to %d to recover data on journal.\n",
2167                                         mdname(mddev),
2168                                         conf->min_nr_stripes * 2);
2169                                 raid5_set_cache_size(mddev,
2170                                                      conf->min_nr_stripes * 2);
2171                                 sh = r5c_recovery_alloc_stripe(conf,
2172                                                                stripe_sect);
2173                         }
2174                         if (!sh) {
2175                                 pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
2176                                        mdname(mddev));
2177                                 return -ENOMEM;
2178                         }
2179                         list_add_tail(&sh->lru, cached_stripe_list);
2180                 }
2181
2182                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
2183                         if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
2184                             test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
2185                                 r5l_recovery_replay_one_stripe(conf, sh, ctx);
2186                                 list_move_tail(&sh->lru, cached_stripe_list);
2187                         }
2188                         r5l_recovery_load_data(log, sh, ctx, payload,
2189                                                log_offset);
2190                 } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
2191                         r5l_recovery_load_parity(log, sh, ctx, payload,
2192                                                  log_offset);
2193                 else
2194                         return -EINVAL;
2195
2196                 log_offset = r5l_ring_add(log, log_offset,
2197                                           le32_to_cpu(payload->size));
2198
2199                 mb_offset += sizeof(struct r5l_payload_data_parity) +
2200                         sizeof(__le32) *
2201                         (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
2202         }
2203
2204         return 0;
2205 }
2206
2207 /*
2208  * Load the stripe into cache. The stripe will be written out later by
2209  * the stripe cache state machine.
2210  */
2211 static void r5c_recovery_load_one_stripe(struct r5l_log *log,
2212                                          struct stripe_head *sh)
2213 {
2214         struct r5dev *dev;
2215         int i;
2216
2217         for (i = sh->disks; i--; ) {
2218                 dev = sh->dev + i;
2219                 if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
2220                         set_bit(R5_InJournal, &dev->flags);
2221                         set_bit(R5_UPTODATE, &dev->flags);
2222                 }
2223         }
2224 }
2225
2226 /*
2227  * Scan through the log for all to-be-flushed data
2228  *
2229  * For stripes with data and parity, namely Data-Parity stripes
2230  * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
2231  *
2232  * For stripes with only data, namely Data-Only stripes
2233  * (STRIPE_R5C_CACHING == 1), we load them into the stripe cache state machine.
2234  *
2235  * For a stripe, if we see data after parity, we should discard all previous
2236  * data and parity for this stripe, as that data has already been flushed to
2237  * the array.
2238  *
2239  * At the end of the scan, we return the new journal_tail, which points to
2240  * first data-only stripe on the journal device, or next invalid meta block.
2241  */
2242 static int r5c_recovery_flush_log(struct r5l_log *log,
2243                                   struct r5l_recovery_ctx *ctx)
2244 {
2245         struct stripe_head *sh;
2246         int ret = 0;
2247
2248         /* scan through the log */
2249         while (1) {
2250                 if (r5l_recovery_read_meta_block(log, ctx))
2251                         break;
2252
2253                 ret = r5c_recovery_analyze_meta_block(log, ctx,
2254                                                       &ctx->cached_list);
2255                 /*
2256                  * -EAGAIN means a mismatch in a data block; in this case, we still
2257                  * try to scan the next meta block
2258                  */
2259                 if (ret && ret != -EAGAIN)
2260                         break;   /* ret == -EINVAL or -ENOMEM */
2261                 ctx->seq++;
2262                 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
2263         }
2264
2265         if (ret == -ENOMEM) {
2266                 r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
2267                 return ret;
2268         }
2269
2270         /* replay data-parity stripes */
2271         r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
2272
2273         /* load data-only stripes to stripe cache */
2274         list_for_each_entry(sh, &ctx->cached_list, lru) {
2275                 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2276                 r5c_recovery_load_one_stripe(log, sh);
2277                 ctx->data_only_stripes++;
2278         }
2279
2280         return 0;
2281 }
2282
2283 /*
2284  * We did a recovery. Now ctx.pos points to an invalid meta block. The new
2285  * log will start here, but we can't let the superblock point to the last
2286  * valid meta block. The log might look like:
2287  * | meta 1| meta 2| meta 3|
2288  * meta 1 is valid, meta 2 is invalid, and meta 3 could be valid. If the
2289  * superblock points to meta 1 and we write a new valid meta block 2n, then
2290  * if a crash happens again, the new recovery will start from meta 1. Since
2291  * meta 2n is valid now, recovery will think meta 3 is valid, which is wrong.
2292  * The solution is to create a new meta block at meta 2 with its seq == meta
2293  * 1's seq + 10000 and let the superblock point to meta 2. The next recovery
2294  * will not think meta 3 is a valid meta block, because its seq doesn't match
2295  */
2296
2297 /*
2298  * Before recovery, the log looks like the following
2299  *
2300  *   ---------------------------------------------
2301  *   |           valid log        | invalid log  |
2302  *   ---------------------------------------------
2303  *   ^
2304  *   |- log->last_checkpoint
2305  *   |- log->last_cp_seq
2306  *
2307  * Now we scan through the log until we see invalid entry
2308  *
2309  *   ---------------------------------------------
2310  *   |           valid log        | invalid log  |
2311  *   ---------------------------------------------
2312  *   ^                            ^
2313  *   |- log->last_checkpoint      |- ctx->pos
2314  *   |- log->last_cp_seq          |- ctx->seq
2315  *
2316  * From this point, we need to increase the seq number by 10000 to avoid
2317  * confusing the next recovery.
2318  *
2319  *   ---------------------------------------------
2320  *   |           valid log        | invalid log  |
2321  *   ---------------------------------------------
2322  *   ^                              ^
2323  *   |- log->last_checkpoint        |- ctx->pos+1
2324  *   |- log->last_cp_seq            |- ctx->seq+10001
2325  *
2326  * However, it is not safe to start the state machine yet, because data only
2327  * stripes are not yet secured in RAID. To secure these data only stripes, we
2328  * rewrite them from seq+10001.
2329  *
2330  *   -----------------------------------------------------------------
2331  *   |           valid log        | data only stripes | invalid log  |
2332  *   -----------------------------------------------------------------
2333  *   ^                                                ^
2334  *   |- log->last_checkpoint                          |- ctx->pos+n
2335  *   |- log->last_cp_seq                              |- ctx->seq+10000+n
2336  *
2337  * If a failure happens again during this process, the recovery can safely
2338  * start again from log->last_checkpoint.
2339  *
2340  * Once data only stripes are rewritten to journal, we move log_tail
2341  *
2342  *   -----------------------------------------------------------------
2343  *   |     old log        |    data only stripes    | invalid log  |
2344  *   -----------------------------------------------------------------
2345  *                        ^                         ^
2346  *                        |- log->last_checkpoint   |- ctx->pos+n
2347  *                        |- log->last_cp_seq       |- ctx->seq+10000+n
2348  *
2349  * Then we can safely start the state machine. If failure happens from this
2350  * point on, the recovery will start from new log->last_checkpoint.
2351  */
2352 static int
2353 r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
2354                                        struct r5l_recovery_ctx *ctx)
2355 {
2356         struct stripe_head *sh;
2357         struct mddev *mddev = log->rdev->mddev;
2358         struct page *page;
2359         sector_t next_checkpoint = MaxSector;
2360
2361         page = alloc_page(GFP_KERNEL);
2362         if (!page) {
2363                 pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
2364                        mdname(mddev));
2365                 return -ENOMEM;
2366         }
2367
2368         WARN_ON(list_empty(&ctx->cached_list));
2369
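        /*
         * For each data-only stripe, lay down a fresh meta block at ctx->pos
         * followed by its R5_InJournal data pages, advance ctx->pos/ctx->seq
         * past them, and remember the stripe's log_start so that
         * log->next_checkpoint ends up at the last stripe written.
         */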
2370         list_for_each_entry(sh, &ctx->cached_list, lru) {
2371                 struct r5l_meta_block *mb;
2372                 int i;
2373                 int offset;
2374                 sector_t write_pos;
2375
2376                 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2377                 r5l_recovery_create_empty_meta_block(log, page,
2378                                                      ctx->pos, ctx->seq);
2379                 mb = page_address(page);
2380                 offset = le32_to_cpu(mb->meta_size);
2381                 write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2382
2383                 for (i = sh->disks; i--; ) {
2384                         struct r5dev *dev = &sh->dev[i];
2385                         struct r5l_payload_data_parity *payload;
2386                         void *addr;
2387
2388                         if (test_bit(R5_InJournal, &dev->flags)) {
2389                                 payload = (void *)mb + offset;
2390                                 payload->header.type = cpu_to_le16(
2391                                         R5LOG_PAYLOAD_DATA);
2392                                 payload->size = cpu_to_le32(BLOCK_SECTORS);
2393                                 payload->location = cpu_to_le64(
2394                                         raid5_compute_blocknr(sh, i, 0));
2395                                 addr = kmap_atomic(dev->page);
2396                                 payload->checksum[0] = cpu_to_le32(
2397                                         crc32c_le(log->uuid_checksum, addr,
2398                                                   PAGE_SIZE));
2399                                 kunmap_atomic(addr);
2400                                 sync_page_io(log->rdev, write_pos, PAGE_SIZE,
2401                                              dev->page, REQ_OP_WRITE, 0, false);
2402                                 write_pos = r5l_ring_add(log, write_pos,
2403                                                          BLOCK_SECTORS);
2404                                 offset += sizeof(__le32) +
2405                                         sizeof(struct r5l_payload_data_parity);
2406
2407                         }
2408                 }
2409                 mb->meta_size = cpu_to_le32(offset);
2410                 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
2411                                                      mb, PAGE_SIZE));
2412                 sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
2413                              REQ_OP_WRITE, REQ_SYNC | REQ_FUA, false);
2414                 sh->log_start = ctx->pos;
2415                 list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
2416                 atomic_inc(&log->stripe_in_journal_count);
2417                 ctx->pos = write_pos;
2418                 ctx->seq += 1;
2419                 next_checkpoint = sh->log_start;
2420         }
2421         log->next_checkpoint = next_checkpoint;
2422         __free_page(page);
2423         return 0;
2424 }
2425
2426 static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
2427                                                  struct r5l_recovery_ctx *ctx)
2428 {
2429         struct mddev *mddev = log->rdev->mddev;
2430         struct r5conf *conf = mddev->private;
2431         struct stripe_head *sh, *next;
2432
2433         if (ctx->data_only_stripes == 0)
2434                 return;
2435
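        /*
         * Temporarily switch to write-back mode so the cached data-only
         * stripes go through the normal stripe state machine to be written
         * out; once they have all drained (active_stripes == 0), fall back to
         * write-through.
         */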
2436         log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
2437
2438         list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
2439                 r5c_make_stripe_write_out(sh);
2440                 set_bit(STRIPE_HANDLE, &sh->state);
2441                 list_del_init(&sh->lru);
2442                 raid5_release_stripe(sh);
2443         }
2444
2445         md_wakeup_thread(conf->mddev->thread);
2446         /* reuse conf->wait_for_quiescent in recovery */
2447         wait_event(conf->wait_for_quiescent,
2448                    atomic_read(&conf->active_stripes) == 0);
2449
2450         log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
2451 }
2452
2453 static int r5l_recovery_log(struct r5l_log *log)
2454 {
2455         struct mddev *mddev = log->rdev->mddev;
2456         struct r5l_recovery_ctx *ctx;
2457         int ret;
2458         sector_t pos;
2459
2460         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
2461         if (!ctx)
2462                 return -ENOMEM;
2463
2464         ctx->pos = log->last_checkpoint;
2465         ctx->seq = log->last_cp_seq;
2466         INIT_LIST_HEAD(&ctx->cached_list);
2467         ctx->meta_page = alloc_page(GFP_KERNEL);
2468
2469         if (!ctx->meta_page) {
2470                 ret =  -ENOMEM;
2471                 goto meta_page;
2472         }
2473
2474         if (r5l_recovery_allocate_ra_pool(log, ctx) != 0) {
2475                 ret = -ENOMEM;
2476                 goto ra_pool;
2477         }
2478
2479         ret = r5c_recovery_flush_log(log, ctx);
2480
2481         if (ret)
2482                 goto error;
2483
2484         pos = ctx->pos;
2485         ctx->seq += 10000;
2486
2487         if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0))
2488                 pr_debug("md/raid:%s: starting from clean shutdown\n",
2489                          mdname(mddev));
2490         else
2491                 pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
2492                          mdname(mddev), ctx->data_only_stripes,
2493                          ctx->data_parity_stripes);
2494
2495         if (ctx->data_only_stripes == 0) {
2496                 log->next_checkpoint = ctx->pos;
2497                 r5l_log_write_empty_meta_block(log, ctx->pos, ctx->seq++);
2498                 ctx->pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2499         } else if (r5c_recovery_rewrite_data_only_stripes(log, ctx)) {
2500                 pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
2501                        mdname(mddev));
2502                 ret =  -EIO;
2503                 goto error;
2504         }
2505
2506         log->log_start = ctx->pos;
2507         log->seq = ctx->seq;
2508         log->last_checkpoint = pos;
2509         r5l_write_super(log, pos);
2510
2511         r5c_recovery_flush_data_only_stripes(log, ctx);
2512         ret = 0;
2513 error:
2514         r5l_recovery_free_ra_pool(log, ctx);
2515 ra_pool:
2516         __free_page(ctx->meta_page);
2517 meta_page:
2518         kfree(ctx);
2519         return ret;
2520 }
2521
2522 static void r5l_write_super(struct r5l_log *log, sector_t cp)
2523 {
2524         struct mddev *mddev = log->rdev->mddev;
2525
2526         log->rdev->journal_tail = cp;
2527         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2528 }
2529
2530 static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
2531 {
2532         struct r5conf *conf;
2533         int ret;
2534
2535         ret = mddev_lock(mddev);
2536         if (ret)
2537                 return ret;
2538
2539         conf = mddev->private;
2540         if (!conf || !conf->log) {
2541                 mddev_unlock(mddev);
2542                 return 0;
2543         }
2544
2545         switch (conf->log->r5c_journal_mode) {
2546         case R5C_JOURNAL_MODE_WRITE_THROUGH:
2547                 ret = snprintf(
2548                         page, PAGE_SIZE, "[%s] %s\n",
2549                         r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2550                         r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2551                 break;
2552         case R5C_JOURNAL_MODE_WRITE_BACK:
2553                 ret = snprintf(
2554                         page, PAGE_SIZE, "%s [%s]\n",
2555                         r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2556                         r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2557                 break;
2558         default:
2559                 ret = 0;
2560         }
2561         mddev_unlock(mddev);
2562         return ret;
2563 }
2564
2565 /*
2566  * Set journal cache mode on @mddev (external API initially needed by dm-raid).
2567  *
2568  * @mode as defined in 'enum r5c_journal_mode'.
2569  *
2570  */
2571 int r5c_journal_mode_set(struct mddev *mddev, int mode)
2572 {
2573         struct r5conf *conf;
2574         int err;
2575
2576         if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH ||
2577             mode > R5C_JOURNAL_MODE_WRITE_BACK)
2578                 return -EINVAL;
2579
2580         err = mddev_lock(mddev);
2581         if (err)
2582                 return err;
2583         conf = mddev->private;
2584         if (!conf || !conf->log) {
2585                 mddev_unlock(mddev);
2586                 return -ENODEV;
2587         }
2588
2589         if (raid5_calc_degraded(conf) > 0 &&
2590             mode == R5C_JOURNAL_MODE_WRITE_BACK) {
2591                 mddev_unlock(mddev);
2592                 return -EINVAL;
2593         }
2594
2595         mddev_suspend(mddev);
2596         conf->log->r5c_journal_mode = mode;
2597         mddev_resume(mddev);
2598         mddev_unlock(mddev);
2599
2600         pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
2601                  mdname(mddev), mode, r5c_journal_mode_str[mode]);
2602         return 0;
2603 }
2604 EXPORT_SYMBOL(r5c_journal_mode_set);
2605
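/*
 * sysfs store for journal_mode: accepts "write-through" or "write-back",
 * with or without a trailing newline. If the string matches neither, the
 * countdown leaves mode at -1 and r5c_journal_mode_set() rejects it with
 * -EINVAL; on success the full input length is returned.
 */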
2606 static ssize_t r5c_journal_mode_store(struct mddev *mddev,
2607                                       const char *page, size_t length)
2608 {
2609         int mode = ARRAY_SIZE(r5c_journal_mode_str);
2610         size_t len = length;
2611
2612         if (len < 2)
2613                 return -EINVAL;
2614
2615         if (page[len - 1] == '\n')
2616                 len--;
2617
2618         while (mode--)
2619                 if (strlen(r5c_journal_mode_str[mode]) == len &&
2620                     !strncmp(page, r5c_journal_mode_str[mode], len))
2621                         break;
2622
2623         return r5c_journal_mode_set(mddev, mode) ?: length;
2624 }
2625
2626 struct md_sysfs_entry
2627 r5c_journal_mode = __ATTR(journal_mode, 0644,
2628                           r5c_journal_mode_show, r5c_journal_mode_store);
2629
2630 /*
2631  * Try to handle a write operation in the caching phase. This function should
2632  * only be called in write-back mode.
2633  *
2634  * If all outstanding writes can be handled in the caching phase, returns 0.
2635  * If the writes require the write-out phase, call r5c_make_stripe_write_out()
2636  * and return -EAGAIN.
2637  */
2638 int r5c_try_caching_write(struct r5conf *conf,
2639                           struct stripe_head *sh,
2640                           struct stripe_head_state *s,
2641                           int disks)
2642 {
2643         struct r5l_log *log = conf->log;
2644         int i;
2645         struct r5dev *dev;
2646         int to_cache = 0;
2647         void **pslot;
2648         sector_t tree_index;
2649         int ret;
2650         uintptr_t refcount;
2651
2652         BUG_ON(!r5c_is_writeback(log));
2653
2654         if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
2655                 /*
2656                  * There are two different scenarios here:
2657                  *  1. The stripe has some data cached, and it is sent to
2658                  *     write-out phase for reclaim
2659                  *  2. The stripe is clean, and this is the first write
2660                  *
2661                  * For 1, return -EAGAIN, so we continue with
2662                  * handle_stripe_dirtying().
2663                  *
2664                  * For 2, set STRIPE_R5C_CACHING and continue with caching
2665                  * write.
2666                  */
2667
2668                 /* case 1: anything injournal or anything in written */
2669                 if (s->injournal > 0 || s->written > 0)
2670                         return -EAGAIN;
2671                 /* case 2 */
2672                 set_bit(STRIPE_R5C_CACHING, &sh->state);
2673         }
2674
2675         /*
2676          * When run in degraded mode, the array is set to write-through mode.
2677          * This check helps drain pending writes safely in the transition to
2678          * write-through mode.
2679          *
2680          * When a stripe is syncing, the write is also handled in write
2681          * through mode.
2682          */
2683         if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
2684                 r5c_make_stripe_write_out(sh);
2685                 return -EAGAIN;
2686         }
2687
2688         for (i = disks; i--; ) {
2689                 dev = &sh->dev[i];
2690                 /* if non-overwrite, use writing-out phase */
2691                 if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
2692                     !test_bit(R5_InJournal, &dev->flags)) {
2693                         r5c_make_stripe_write_out(sh);
2694                         return -EAGAIN;
2695                 }
2696         }
2697
2698         /* if the stripe is not counted in big_stripe_tree, add it now */
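        /*
         * The radix tree slot itself carries the reference count, stored
         * shifted left by R5C_RADIX_COUNT_SHIFT (presumably to keep the low
         * bits clear of the tree's internal use): an existing slot is bumped
         * in place, otherwise a new entry is inserted with a count of one.
         */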
2699         if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
2700             !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
2701                 tree_index = r5c_tree_index(conf, sh->sector);
2702                 spin_lock(&log->tree_lock);
2703                 pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
2704                                                tree_index);
2705                 if (pslot) {
2706                         refcount = (uintptr_t)radix_tree_deref_slot_protected(
2707                                 pslot, &log->tree_lock) >>
2708                                 R5C_RADIX_COUNT_SHIFT;
2709                         radix_tree_replace_slot(
2710                                 &log->big_stripe_tree, pslot,
2711                                 (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT));
2712                 } else {
2713                         /*
2714                          * this radix_tree_insert can fail safely, so no
2715                          * need to call radix_tree_preload()
2716                          */
2717                         ret = radix_tree_insert(
2718                                 &log->big_stripe_tree, tree_index,
2719                                 (void *)(1 << R5C_RADIX_COUNT_SHIFT));
2720                         if (ret) {
2721                                 spin_unlock(&log->tree_lock);
2722                                 r5c_make_stripe_write_out(sh);
2723                                 return -EAGAIN;
2724                         }
2725                 }
2726                 spin_unlock(&log->tree_lock);
2727
2728                 /*
2729                  * set STRIPE_R5C_PARTIAL_STRIPE, this shows the stripe is
2730                  * counted in the radix tree
2731                  */
2732                 set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
2733                 atomic_inc(&conf->r5c_cached_partial_stripes);
2734         }
2735
2736         for (i = disks; i--; ) {
2737                 dev = &sh->dev[i];
2738                 if (dev->towrite) {
2739                         set_bit(R5_Wantwrite, &dev->flags);
2740                         set_bit(R5_Wantdrain, &dev->flags);
2741                         set_bit(R5_LOCKED, &dev->flags);
2742                         to_cache++;
2743                 }
2744         }
2745
2746         if (to_cache) {
2747                 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2748                 /*
2749                  * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
2750                  * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
2751                  * r5c_handle_data_cached()
2752                  */
2753                 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
2754         }
2755
2756         return 0;
2757 }
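
/*
 * A note on the big_stripe_tree accounting above: the radix tree slot
 * holds a small reference count rather than a pointer, encoded directly
 * in the slot value shifted left by R5C_RADIX_COUNT_SHIFT. A rough
 * sketch of the encoding as used in this file (not a separate helper):
 *
 *        slot     = (void *)(refcount << R5C_RADIX_COUNT_SHIFT);
 *        refcount = (uintptr_t)slot >> R5C_RADIX_COUNT_SHIFT;
 *
 * Incrementing or decrementing therefore means replacing the slot with
 * a re-encoded value while holding log->tree_lock, as done above and in
 * r5c_finish_stripe_write_out() below.
 */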
2758
2759 /*
2760  * free extra pages (orig_page) we allocated for prexor
2761  */
2762 void r5c_release_extra_page(struct stripe_head *sh)
2763 {
2764         struct r5conf *conf = sh->raid_conf;
2765         int i;
2766         bool using_disk_info_extra_page;
2767
2768         using_disk_info_extra_page =
2769                 sh->dev[0].orig_page == conf->disks[0].extra_page;
2770
2771         for (i = sh->disks; i--; )
2772                 if (sh->dev[i].page != sh->dev[i].orig_page) {
2773                         struct page *p = sh->dev[i].orig_page;
2774
2775                         sh->dev[i].orig_page = sh->dev[i].page;
2776                         clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2777
2778                         if (!using_disk_info_extra_page)
2779                                 put_page(p);
2780                 }
2781
2782         if (using_disk_info_extra_page) {
2783                 clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state);
2784                 md_wakeup_thread(conf->mddev->thread);
2785         }
2786 }
2787
2788 void r5c_use_extra_page(struct stripe_head *sh)
2789 {
2790         struct r5conf *conf = sh->raid_conf;
2791         int i;
2792         struct r5dev *dev;
2793
2794         for (i = sh->disks; i--; ) {
2795                 dev = &sh->dev[i];
2796                 if (dev->orig_page != dev->page)
2797                         put_page(dev->orig_page);
2798                 dev->orig_page = conf->disks[i].extra_page;
2799         }
2800 }
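
/*
 * The two helpers above work as a pair: r5c_use_extra_page() points each
 * dev->orig_page at the shared conf->disks[i].extra_page so the original
 * data stays available for prexor, and r5c_release_extra_page() later
 * restores orig_page and, when the shared extra pages were in use,
 * clears R5C_EXTRA_PAGE_IN_USE and wakes the array thread so another
 * stripe can claim them.
 */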
2801
2802 /*
2803  * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
2804  * stripe is committed to RAID disks.
2805  */
2806 void r5c_finish_stripe_write_out(struct r5conf *conf,
2807                                  struct stripe_head *sh,
2808                                  struct stripe_head_state *s)
2809 {
2810         struct r5l_log *log = conf->log;
2811         int i;
2812         int do_wakeup = 0;
2813         sector_t tree_index;
2814         void **pslot;
2815         uintptr_t refcount;
2816
2817         if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
2818                 return;
2819
2820         WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
2821         clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
2822
2823         if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
2824                 return;
2825
2826         for (i = sh->disks; i--; ) {
2827                 clear_bit(R5_InJournal, &sh->dev[i].flags);
2828                 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2829                         do_wakeup = 1;
2830         }
2831
2832         /*
2833          * analyse_stripe() runs before r5c_finish_stripe_write_out(), so
2834          * now that R5_InJournal is cleared, update s->injournal to match.
2835          */
2836         s->injournal = 0;
2837
2838         if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2839                 if (atomic_dec_and_test(&conf->pending_full_writes))
2840                         md_wakeup_thread(conf->mddev->thread);
2841
2842         if (do_wakeup)
2843                 wake_up(&conf->wait_for_overlap);
2844
2845         spin_lock_irq(&log->stripe_in_journal_lock);
2846         list_del_init(&sh->r5c);
2847         spin_unlock_irq(&log->stripe_in_journal_lock);
2848         sh->log_start = MaxSector;
2849
2850         atomic_dec(&log->stripe_in_journal_count);
2851         r5c_update_log_state(log);
2852
2853         /* stop counting this stripe in big_stripe_tree */
2854         if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
2855             test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
2856                 tree_index = r5c_tree_index(conf, sh->sector);
2857                 spin_lock(&log->tree_lock);
2858                 pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
2859                                                tree_index);
2860                 BUG_ON(pslot == NULL);
2861                 refcount = (uintptr_t)radix_tree_deref_slot_protected(
2862                         pslot, &log->tree_lock) >>
2863                         R5C_RADIX_COUNT_SHIFT;
2864                 if (refcount == 1)
2865                         radix_tree_delete(&log->big_stripe_tree, tree_index);
2866                 else
2867                         radix_tree_replace_slot(
2868                                 &log->big_stripe_tree, pslot,
2869                                 (void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT));
2870                 spin_unlock(&log->tree_lock);
2871         }
2872
2873         if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
2874                 BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
2875                 atomic_dec(&conf->r5c_flushing_partial_stripes);
2876                 atomic_dec(&conf->r5c_cached_partial_stripes);
2877         }
2878
2879         if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
2880                 BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
2881                 atomic_dec(&conf->r5c_flushing_full_stripes);
2882                 atomic_dec(&conf->r5c_cached_full_stripes);
2883         }
2884
2885         r5l_append_flush_payload(log, sh->sector);
2886         /* stripe is flushed to raid disks, so we can do resync now */
2887         if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
2888                 set_bit(STRIPE_HANDLE, &sh->state);
2889 }
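
/*
 * After r5c_finish_stripe_write_out() the stripe no longer counts
 * against the journal: it is removed from stripe_in_journal_list,
 * stripe_in_journal_count is decremented, its chunk's refcount in
 * big_stripe_tree is dropped (the slot is deleted when it reaches zero),
 * the cached partial/full stripe counters are updated, and a flush
 * payload is appended to record that the data has reached the raid
 * disks.
 */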
2890
2891 int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
2892 {
2893         struct r5conf *conf = sh->raid_conf;
2894         int pages = 0;
2895         int reserve;
2896         int i;
2897         int ret = 0;
2898
2899         BUG_ON(!log);
2900
2901         for (i = 0; i < sh->disks; i++) {
2902                 void *addr;
2903
2904                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
2905                         continue;
2906                 addr = kmap_atomic(sh->dev[i].page);
2907                 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
2908                                                     addr, PAGE_SIZE);
2909                 kunmap_atomic(addr);
2910                 pages++;
2911         }
2912         WARN_ON(pages == 0);
2913
2914         /*
2915          * The stripe must enter the state machine again to call endio,
2916          * so don't delay.
2917          */
2918         clear_bit(STRIPE_DELAYED, &sh->state);
2919         atomic_inc(&sh->count);
2920
2921         mutex_lock(&log->io_mutex);
2922         /* meta + data */
2923         reserve = (1 + pages) << (PAGE_SHIFT - 9);
2924
2925         if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
2926             sh->log_start == MaxSector)
2927                 r5l_add_no_space_stripe(log, sh);
2928         else if (!r5l_has_free_space(log, reserve)) {
2929                 if (sh->log_start == log->last_checkpoint)
2930                         BUG();
2931                 else
2932                         r5l_add_no_space_stripe(log, sh);
2933         } else {
2934                 ret = r5l_log_stripe(log, sh, pages, 0);
2935                 if (ret) {
2936                         spin_lock_irq(&log->io_list_lock);
2937                         list_add_tail(&sh->log_list, &log->no_mem_stripes);
2938                         spin_unlock_irq(&log->io_list_lock);
2939                 }
2940         }
2941
2942         mutex_unlock(&log->io_mutex);
2943         return 0;
2944 }
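
/*
 * Space accounting in r5c_cache_data() above is in 512-byte sectors:
 * the reservation covers one metadata block plus one block per data
 * page, i.e. reserve = (1 + pages) << (PAGE_SHIFT - 9). A small worked
 * example, assuming the 4K pages this driver requires (PAGE_SHIFT - 9
 * == 3):
 *
 *        pages = 3  =>  reserve = (1 + 3) << 3 = 32 sectors
 *
 * If the log is critical or lacks free space, the stripe is parked on
 * the no-space list instead of being logged immediately.
 */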
2945
2946 /* check whether this big stripe is in the write-back cache. */
2947 bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
2948 {
2949         struct r5l_log *log = conf->log;
2950         sector_t tree_index;
2951         void *slot;
2952
2953         if (!log)
2954                 return false;
2955
2956         WARN_ON_ONCE(!rcu_read_lock_held());
2957         tree_index = r5c_tree_index(conf, sect);
2958         slot = radix_tree_lookup(&log->big_stripe_tree, tree_index);
2959         return slot != NULL;
2960 }
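
/*
 * r5c_big_stripe_cached() expects the caller to hold the RCU read lock,
 * hence the WARN_ON_ONCE() above. A minimal, purely illustrative caller
 * sketch (not a function in this file):
 *
 *        bool cached;
 *
 *        rcu_read_lock();
 *        cached = r5c_big_stripe_cached(conf, sector);
 *        rcu_read_unlock();
 *
 * The result is only a hint once the lock is dropped; the chunk can be
 * added to or removed from the cache at any time afterwards.
 */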
2961
2962 static int r5l_load_log(struct r5l_log *log)
2963 {
2964         struct md_rdev *rdev = log->rdev;
2965         struct page *page;
2966         struct r5l_meta_block *mb;
2967         sector_t cp = log->rdev->journal_tail;
2968         u32 stored_crc, expected_crc;
2969         bool create_super = false;
2970         int ret = 0;
2971
2972         /* Make sure it's valid */
2973         if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
2974                 cp = 0;
2975         page = alloc_page(GFP_KERNEL);
2976         if (!page)
2977                 return -ENOMEM;
2978
2979         if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
2980                 ret = -EIO;
2981                 goto ioerr;
2982         }
2983         mb = page_address(page);
2984
2985         if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
2986             mb->version != R5LOG_VERSION) {
2987                 create_super = true;
2988                 goto create;
2989         }
2990         stored_crc = le32_to_cpu(mb->checksum);
2991         mb->checksum = 0;
2992         expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
2993         if (stored_crc != expected_crc) {
2994                 create_super = true;
2995                 goto create;
2996         }
2997         if (le64_to_cpu(mb->position) != cp) {
2998                 create_super = true;
2999                 goto create;
3000         }
3001 create:
3002         if (create_super) {
3003                 log->last_cp_seq = prandom_u32();
3004                 cp = 0;
3005                 r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
3006                 /*
3007                  * Make sure the super block points to the correct address;
3008                  * the log might have data very soon. If the super block lacks
3009                  * the correct log tail address, recovery can't find the log.
3010                  */
3011                 r5l_write_super(log, cp);
3012         } else
3013                 log->last_cp_seq = le64_to_cpu(mb->seq);
3014
3015         log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
3016         log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
3017         if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
3018                 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
3019         log->last_checkpoint = cp;
3020
3021         __free_page(page);
3022
3023         if (create_super) {
3024                 log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS);
3025                 log->seq = log->last_cp_seq + 1;
3026                 log->next_checkpoint = cp;
3027         } else
3028                 ret = r5l_recovery_log(log);
3029
3030         r5c_update_log_state(log);
3031         return ret;
3032 ioerr:
3033         __free_page(page);
3034         return ret;
3035 }
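
/*
 * Summary of r5l_load_log() above: the meta block at rdev->journal_tail
 * is read and validated (magic, version, checksum and position). If any
 * check fails, a fresh empty meta block is written at sector 0 and the
 * super block is updated to point at it; otherwise the existing log is
 * replayed via r5l_recovery_log(). Either way, max_free_space ends up as
 * min(device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT, RECLAIM_MAX_FREE_SPACE).
 */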
3036
3037 void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
3038 {
3039         struct r5conf *conf = mddev->private;
3040         struct r5l_log *log = conf->log;
3041
3042         if (!log)
3043                 return;
3044
3045         if ((raid5_calc_degraded(conf) > 0 ||
3046              test_bit(Journal, &rdev->flags)) &&
3047             conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
3048                 schedule_work(&log->disable_writeback_work);
3049 }
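
/*
 * r5c_update_on_rdev_error() above runs when a member device fails: if
 * the array is now degraded, or the failed device is the journal itself,
 * and the journal is in write-back mode, write-back caching is disabled
 * asynchronously through disable_writeback_work rather than in the error
 * path itself.
 */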
3050
3051 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
3052 {
3053         struct request_queue *q = bdev_get_queue(rdev->bdev);
3054         struct r5l_log *log;
3055         char b[BDEVNAME_SIZE];
3056
3057         pr_debug("md/raid:%s: using device %s as journal\n",
3058                  mdname(conf->mddev), bdevname(rdev->bdev, b));
3059
3060         if (PAGE_SIZE != 4096)
3061                 return -EINVAL;
3062
3063         /*
3064          * PAGE_SIZE must be big enough to hold one r5l_meta_block plus
3065          * raid_disks r5l_payload_data_parity entries.
3066          *
3067          * The write journal and cache do not work for very big arrays
3068          * (raid_disks > 203).
3069          */
3070         if (sizeof(struct r5l_meta_block) +
3071             ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
3072              conf->raid_disks) > PAGE_SIZE) {
3073                 pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
3074                        mdname(conf->mddev), conf->raid_disks);
3075                 return -EINVAL;
3076         }
3077
3078         log = kzalloc(sizeof(*log), GFP_KERNEL);
3079         if (!log)
3080                 return -ENOMEM;
3081         log->rdev = rdev;
3082
3083         log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
3084
3085         log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
3086                                        sizeof(rdev->mddev->uuid));
3087
3088         mutex_init(&log->io_mutex);
3089
3090         spin_lock_init(&log->io_list_lock);
3091         INIT_LIST_HEAD(&log->running_ios);
3092         INIT_LIST_HEAD(&log->io_end_ios);
3093         INIT_LIST_HEAD(&log->flushing_ios);
3094         INIT_LIST_HEAD(&log->finished_ios);
3095         bio_init(&log->flush_bio, NULL, 0);
3096
3097         log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
3098         if (!log->io_kc)
3099                 goto io_kc;
3100
3101         log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
3102         if (!log->io_pool)
3103                 goto io_pool;
3104
3105         log->bs = bioset_create(R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS);
3106         if (!log->bs)
3107                 goto io_bs;
3108
3109         log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
3110         if (!log->meta_pool)
3111                 goto out_mempool;
3112
3113         spin_lock_init(&log->tree_lock);
3114         INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
3115
3116         log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
3117                                                  log->rdev->mddev, "reclaim");
3118         if (!log->reclaim_thread)
3119                 goto reclaim_thread;
3120         log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
3121
3122         init_waitqueue_head(&log->iounit_wait);
3123
3124         INIT_LIST_HEAD(&log->no_mem_stripes);
3125
3126         INIT_LIST_HEAD(&log->no_space_stripes);
3127         spin_lock_init(&log->no_space_stripes_lock);
3128
3129         INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
3130         INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);
3131
3132         log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
3133         INIT_LIST_HEAD(&log->stripe_in_journal_list);
3134         spin_lock_init(&log->stripe_in_journal_lock);
3135         atomic_set(&log->stripe_in_journal_count, 0);
3136
3137         rcu_assign_pointer(conf->log, log);
3138
3139         if (r5l_load_log(log))
3140                 goto error;
3141
3142         set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
3143         return 0;
3144
3145 error:
3146         rcu_assign_pointer(conf->log, NULL);
3147         md_unregister_thread(&log->reclaim_thread);
3148 reclaim_thread:
3149         mempool_destroy(log->meta_pool);
3150 out_mempool:
3151         bioset_free(log->bs);
3152 io_bs:
3153         mempool_destroy(log->io_pool);
3154 io_pool:
3155         kmem_cache_destroy(log->io_kc);
3156 io_kc:
3157         kfree(log);
3158         return -EINVAL;
3159 }
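
/*
 * Error handling in r5l_init_log() above: on failure after conf->log has
 * been published it is reset to NULL first, then the goto ladder tears
 * resources down in reverse order of allocation (reclaim thread,
 * meta_pool, bs, io_pool, io_kc) before freeing the log structure.
 */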
3160
3161 void r5l_exit_log(struct r5conf *conf)
3162 {
3163         struct r5l_log *log = conf->log;
3164
3165         conf->log = NULL;
3166         synchronize_rcu();
3167
3168         flush_work(&log->disable_writeback_work);
3169         md_unregister_thread(&log->reclaim_thread);
3170         mempool_destroy(log->meta_pool);
3171         bioset_free(log->bs);
3172         mempool_destroy(log->io_pool);
3173         kmem_cache_destroy(log->io_kc);
3174         kfree(log);
3175 }
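
/*
 * r5l_exit_log() above mirrors the setup: the log pointer is unpublished
 * and synchronize_rcu() waits out in-flight readers, then pending
 * disable_writeback_work is flushed and the reclaim thread, pools and
 * kmem cache are destroyed in reverse order of creation before the log
 * itself is freed.
 */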