drivers/md/dm-integrity.c
1 /*
2  * Copyright (C) 2016-2017 Red Hat, Inc. All rights reserved.
3  * Copyright (C) 2016-2017 Milan Broz
4  * Copyright (C) 2016-2017 Mikulas Patocka
5  *
6  * This file is released under the GPL.
7  */
8
9 #include <linux/compiler.h>
10 #include <linux/module.h>
11 #include <linux/device-mapper.h>
12 #include <linux/dm-io.h>
13 #include <linux/vmalloc.h>
14 #include <linux/sort.h>
15 #include <linux/rbtree.h>
16 #include <linux/delay.h>
17 #include <linux/random.h>
18 #include <linux/reboot.h>
19 #include <crypto/hash.h>
20 #include <crypto/skcipher.h>
21 #include <linux/async_tx.h>
22 #include <linux/dm-bufio.h>
23
24 #define DM_MSG_PREFIX "integrity"
25
26 #define DEFAULT_INTERLEAVE_SECTORS      32768
27 #define DEFAULT_JOURNAL_SIZE_FACTOR     7
28 #define DEFAULT_SECTORS_PER_BITMAP_BIT  32768
29 #define DEFAULT_BUFFER_SECTORS          128
30 #define DEFAULT_JOURNAL_WATERMARK       50
31 #define DEFAULT_SYNC_MSEC               10000
32 #define DEFAULT_MAX_JOURNAL_SECTORS     131072
33 #define MIN_LOG2_INTERLEAVE_SECTORS     3
34 #define MAX_LOG2_INTERLEAVE_SECTORS     31
35 #define METADATA_WORKQUEUE_MAX_ACTIVE   16
36 #define RECALC_SECTORS                  8192
37 #define RECALC_WRITE_SUPER              16
38 #define BITMAP_BLOCK_SIZE               4096    /* don't change it */
39 #define BITMAP_FLUSH_INTERVAL           (10 * HZ)
40
41 /*
42  * Warning - DEBUG_PRINT prints security-sensitive data to the log,
43  * so it should not be enabled in the official kernel
44  */
45 //#define DEBUG_PRINT
46 //#define INTERNAL_VERIFY
47
48 /*
49  * On disk structures
50  */
51
52 #define SB_MAGIC                        "integrt"
53 #define SB_VERSION_1                    1
54 #define SB_VERSION_2                    2
55 #define SB_VERSION_3                    3
56 #define SB_VERSION_4                    4
57 #define SB_SECTORS                      8
58 #define MAX_SECTORS_PER_BLOCK           8
59
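/*
 * On-disk superblock. It occupies the first SB_SECTORS sectors of the
 * metadata device (or of the metadata area when data and metadata share
 * a single device) and is read and written by sync_rw_sb() below.
 */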
60 struct superblock {
61         __u8 magic[8];
62         __u8 version;
63         __u8 log2_interleave_sectors;
64         __u16 integrity_tag_size;
65         __u32 journal_sections;
66         __u64 provided_data_sectors;    /* userspace uses this value */
67         __u32 flags;
68         __u8 log2_sectors_per_block;
69         __u8 log2_blocks_per_bitmap_bit;
70         __u8 pad[2];
71         __u64 recalc_sector;
72 };
73
74 #define SB_FLAG_HAVE_JOURNAL_MAC        0x1
75 #define SB_FLAG_RECALCULATING           0x2
76 #define SB_FLAG_DIRTY_BITMAP            0x4
77 #define SB_FLAG_FIXED_PADDING           0x8
78
79 #define JOURNAL_ENTRY_ROUNDUP           8
80
81 typedef __u64 commit_id_t;
82 #define JOURNAL_MAC_PER_SECTOR          8
83
84 struct journal_entry {
85         union {
86                 struct {
87                         __u32 sector_lo;
88                         __u32 sector_hi;
89                 } s;
90                 __u64 sector;
91         } u;
92         commit_id_t last_bytes[0];
93         /* __u8 tag[0]; */
94 };
95
96 #define journal_entry_tag(ic, je)               ((__u8 *)&(je)->last_bytes[(ic)->sectors_per_block])
97
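/*
 * The upper 32 bits of the sector number double as a state marker:
 * 0xffffffff marks an unused entry and 0xfffffffe an entry that is in
 * progress. On 32-bit machines journal_entry_set_sector() therefore
 * writes sector_lo first and sector_hi last, separated by a write
 * barrier, so the entry leaves the unused/in-progress state only after
 * the complete sector number is visible.
 */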
98 #if BITS_PER_LONG == 64
99 #define journal_entry_set_sector(je, x)         do { smp_wmb(); WRITE_ONCE((je)->u.sector, cpu_to_le64(x)); } while (0)
100 #else
101 #define journal_entry_set_sector(je, x)         do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0)
102 #endif
103 #define journal_entry_get_sector(je)            le64_to_cpu((je)->u.sector)
104 #define journal_entry_is_unused(je)             ((je)->u.s.sector_hi == cpu_to_le32(-1))
105 #define journal_entry_set_unused(je)            do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0)
106 #define journal_entry_is_inprogress(je)         ((je)->u.s.sector_hi == cpu_to_le32(-2))
107 #define journal_entry_set_inprogress(je)        do { ((je)->u.s.sector_hi = cpu_to_le32(-2)); } while (0)
108
109 #define JOURNAL_BLOCK_SECTORS           8
110 #define JOURNAL_SECTOR_DATA             ((1 << SECTOR_SHIFT) - sizeof(commit_id_t))
111 #define JOURNAL_MAC_SIZE                (JOURNAL_MAC_PER_SECTOR * JOURNAL_BLOCK_SECTORS)
112
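/*
 * Overlay for one 512-byte journal sector: entry/data bytes, an 8-byte
 * MAC slot (filled by rw_section_mac() for the entry sectors) and a
 * trailing commit_id (see dm_integrity_commit_id()).
 */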
113 struct journal_sector {
114         __u8 entries[JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR];
115         __u8 mac[JOURNAL_MAC_PER_SECTOR];
116         commit_id_t commit_id;
117 };
118
119 #define MAX_TAG_SIZE                    (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK]))
120
121 #define METADATA_PADDING_SECTORS        8
122
123 #define N_COMMIT_IDS                    4
124
125 static unsigned char prev_commit_seq(unsigned char seq)
126 {
127         return (seq + N_COMMIT_IDS - 1) % N_COMMIT_IDS;
128 }
129
130 static unsigned char next_commit_seq(unsigned char seq)
131 {
132         return (seq + 1) % N_COMMIT_IDS;
133 }
134
135 /*
136  * In-memory structures
137  */
138
139 struct journal_node {
140         struct rb_node node;
141         sector_t sector;
142 };
143
144 struct alg_spec {
145         char *alg_string;
146         char *key_string;
147         __u8 *key;
148         unsigned key_size;
149 };
150
151 struct dm_integrity_c {
152         struct dm_dev *dev;
153         struct dm_dev *meta_dev;
154         unsigned tag_size;
155         __s8 log2_tag_size;
156         sector_t start;
157         mempool_t journal_io_mempool;
158         struct dm_io_client *io;
159         struct dm_bufio_client *bufio;
160         struct workqueue_struct *metadata_wq;
161         struct superblock *sb;
162         unsigned journal_pages;
163         unsigned n_bitmap_blocks;
164
165         struct page_list *journal;
166         struct page_list *journal_io;
167         struct page_list *journal_xor;
168         struct page_list *recalc_bitmap;
169         struct page_list *may_write_bitmap;
170         struct bitmap_block_status *bbs;
171         unsigned bitmap_flush_interval;
172         int synchronous_mode;
173         struct bio_list synchronous_bios;
174         struct delayed_work bitmap_flush_work;
175
176         struct crypto_skcipher *journal_crypt;
177         struct scatterlist **journal_scatterlist;
178         struct scatterlist **journal_io_scatterlist;
179         struct skcipher_request **sk_requests;
180
181         struct crypto_shash *journal_mac;
182
183         struct journal_node *journal_tree;
184         struct rb_root journal_tree_root;
185
186         sector_t provided_data_sectors;
187
188         unsigned short journal_entry_size;
189         unsigned char journal_entries_per_sector;
190         unsigned char journal_section_entries;
191         unsigned short journal_section_sectors;
192         unsigned journal_sections;
193         unsigned journal_entries;
194         sector_t data_device_sectors;
195         sector_t meta_device_sectors;
196         unsigned initial_sectors;
197         unsigned metadata_run;
198         __s8 log2_metadata_run;
199         __u8 log2_buffer_sectors;
200         __u8 sectors_per_block;
201         __u8 log2_blocks_per_bitmap_bit;
202
203         unsigned char mode;
204         int suspending;
205
206         int failed;
207
208         struct crypto_shash *internal_hash;
209
210         /* these variables are locked with endio_wait.lock */
211         struct rb_root in_progress;
212         struct list_head wait_list;
213         wait_queue_head_t endio_wait;
214         struct workqueue_struct *wait_wq;
215         struct workqueue_struct *offload_wq;
216
217         unsigned char commit_seq;
218         commit_id_t commit_ids[N_COMMIT_IDS];
219
220         unsigned committed_section;
221         unsigned n_committed_sections;
222
223         unsigned uncommitted_section;
224         unsigned n_uncommitted_sections;
225
226         unsigned free_section;
227         unsigned char free_section_entry;
228         unsigned free_sectors;
229
230         unsigned free_sectors_threshold;
231
232         struct workqueue_struct *commit_wq;
233         struct work_struct commit_work;
234
235         struct workqueue_struct *writer_wq;
236         struct work_struct writer_work;
237
238         struct workqueue_struct *recalc_wq;
239         struct work_struct recalc_work;
240         u8 *recalc_buffer;
241         u8 *recalc_tags;
242
243         struct bio_list flush_bio_list;
244
245         unsigned long autocommit_jiffies;
246         struct timer_list autocommit_timer;
247         unsigned autocommit_msec;
248
249         wait_queue_head_t copy_to_journal_wait;
250
251         struct completion crypto_backoff;
252
253         bool journal_uptodate;
254         bool just_formatted;
255         bool recalculate_flag;
256         bool fix_padding;
257
258         struct alg_spec internal_hash_alg;
259         struct alg_spec journal_crypt_alg;
260         struct alg_spec journal_mac_alg;
261
262         atomic64_t number_of_mismatches;
263
264         struct notifier_block reboot_notifier;
265 };
266
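/*
 * A range of sectors that is being processed. While the range sits in
 * the in_progress rb-tree the "node" member is used; while it is queued
 * on wait_list the "task"/"wait_entry" members are used instead.
 */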
267 struct dm_integrity_range {
268         sector_t logical_sector;
269         sector_t n_sectors;
270         bool waiting;
271         union {
272                 struct rb_node node;
273                 struct {
274                         struct task_struct *task;
275                         struct list_head wait_entry;
276                 };
277         };
278 };
279
280 struct dm_integrity_io {
281         struct work_struct work;
282
283         struct dm_integrity_c *ic;
284         bool write;
285         bool fua;
286
287         struct dm_integrity_range range;
288
289         sector_t metadata_block;
290         unsigned metadata_offset;
291
292         atomic_t in_flight;
293         blk_status_t bi_status;
294
295         struct completion *completion;
296
297         struct gendisk *orig_bi_disk;
298         u8 orig_bi_partno;
299         bio_end_io_t *orig_bi_end_io;
300         struct bio_integrity_payload *orig_bi_integrity;
301         struct bvec_iter orig_bi_iter;
302 };
303
304 struct journal_completion {
305         struct dm_integrity_c *ic;
306         atomic_t in_flight;
307         struct completion comp;
308 };
309
310 struct journal_io {
311         struct dm_integrity_range range;
312         struct journal_completion *comp;
313 };
314
315 struct bitmap_block_status {
316         struct work_struct work;
317         struct dm_integrity_c *ic;
318         unsigned idx;
319         unsigned long *bitmap;
320         struct bio_list bio_queue;
321         spinlock_t bio_queue_lock;
323 };
324
325 static struct kmem_cache *journal_io_cache;
326
327 #define JOURNAL_IO_MEMPOOL      32
328
329 #ifdef DEBUG_PRINT
330 #define DEBUG_print(x, ...)     printk(KERN_DEBUG x, ##__VA_ARGS__)
331 static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...)
332 {
333         va_list args;
334         va_start(args, msg);
335         vprintk(msg, args);
336         va_end(args);
337         if (len)
338                 pr_cont(":");
339         while (len) {
340                 pr_cont(" %02x", *bytes);
341                 bytes++;
342                 len--;
343         }
344         pr_cont("\n");
345 }
346 #define DEBUG_bytes(bytes, len, msg, ...)       __DEBUG_bytes(bytes, len, KERN_DEBUG msg, ##__VA_ARGS__)
347 #else
348 #define DEBUG_print(x, ...)                     do { } while (0)
349 #define DEBUG_bytes(bytes, len, msg, ...)       do { } while (0)
350 #endif
351
352 static void dm_integrity_prepare(struct request *rq)
353 {
354 }
355
356 static void dm_integrity_complete(struct request *rq, unsigned int nr_bytes)
357 {
358 }
359
360 /*
361  * DM Integrity profile; protection is performed in the layer above (dm-crypt)
362  */
363 static const struct blk_integrity_profile dm_integrity_profile = {
364         .name                   = "DM-DIF-EXT-TAG",
365         .generate_fn            = NULL,
366         .verify_fn              = NULL,
367         .prepare_fn             = dm_integrity_prepare,
368         .complete_fn            = dm_integrity_complete,
369 };
370
371 static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map);
372 static void integrity_bio_wait(struct work_struct *w);
373 static void dm_integrity_dtr(struct dm_target *ti);
374
375 static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, int err)
376 {
377         if (err == -EILSEQ)
378                 atomic64_inc(&ic->number_of_mismatches);
379         if (!cmpxchg(&ic->failed, 0, err))
380                 DMERR("Error on %s: %d", msg, err);
381 }
382
383 static int dm_integrity_failed(struct dm_integrity_c *ic)
384 {
385         return READ_ONCE(ic->failed);
386 }
387
388 static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i,
389                                           unsigned j, unsigned char seq)
390 {
391         /*
392  * XOR the commit ID with the section and entry numbers, so that if a
393  * piece of the journal is written to the wrong place, it is detected.
394          */
395         return ic->commit_ids[seq] ^ cpu_to_le64(((__u64)i << 32) ^ j);
396 }
397
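/*
 * In interleaved mode (no separate metadata device) the data device is
 * split into "areas" of 2^log2_interleave_sectors sectors, each preceded
 * by the metadata run that describes it. get_area_and_offset() splits a
 * data sector into an area number and an offset within that area;
 * get_metadata_sector_and_offset() and get_data_sector() map the pair to
 * metadata and data locations. With a separate metadata device there is
 * only a single area.
 */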
398 static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector,
399                                 sector_t *area, sector_t *offset)
400 {
401         if (!ic->meta_dev) {
402                 __u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors;
403                 *area = data_sector >> log2_interleave_sectors;
404                 *offset = (unsigned)data_sector & ((1U << log2_interleave_sectors) - 1);
405         } else {
406                 *area = 0;
407                 *offset = data_sector;
408         }
409 }
410
411 #define sector_to_block(ic, n)                                          \
412 do {                                                                    \
413         BUG_ON((n) & (unsigned)((ic)->sectors_per_block - 1));          \
414         (n) >>= (ic)->sb->log2_sectors_per_block;                       \
415 } while (0)
416
417 static __u64 get_metadata_sector_and_offset(struct dm_integrity_c *ic, sector_t area,
418                                             sector_t offset, unsigned *metadata_offset)
419 {
420         __u64 ms;
421         unsigned mo;
422
423         ms = area << ic->sb->log2_interleave_sectors;
424         if (likely(ic->log2_metadata_run >= 0))
425                 ms += area << ic->log2_metadata_run;
426         else
427                 ms += area * ic->metadata_run;
428         ms >>= ic->log2_buffer_sectors;
429
430         sector_to_block(ic, offset);
431
432         if (likely(ic->log2_tag_size >= 0)) {
433                 ms += offset >> (SECTOR_SHIFT + ic->log2_buffer_sectors - ic->log2_tag_size);
434                 mo = (offset << ic->log2_tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1);
435         } else {
436                 ms += (__u64)offset * ic->tag_size >> (SECTOR_SHIFT + ic->log2_buffer_sectors);
437                 mo = (offset * ic->tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1);
438         }
439         *metadata_offset = mo;
440         return ms;
441 }
442
443 static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector_t offset)
444 {
445         sector_t result;
446
447         if (ic->meta_dev)
448                 return offset;
449
450         result = area << ic->sb->log2_interleave_sectors;
451         if (likely(ic->log2_metadata_run >= 0))
452                 result += (area + 1) << ic->log2_metadata_run;
453         else
454                 result += (area + 1) * ic->metadata_run;
455
456         result += (sector_t)ic->initial_sectors + offset;
457         result += ic->start;
458
459         return result;
460 }
461
462 static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr)
463 {
464         if (unlikely(*sec_ptr >= ic->journal_sections))
465                 *sec_ptr -= ic->journal_sections;
466 }
467
468 static void sb_set_version(struct dm_integrity_c *ic)
469 {
470         if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING))
471                 ic->sb->version = SB_VERSION_4;
472         else if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP))
473                 ic->sb->version = SB_VERSION_3;
474         else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
475                 ic->sb->version = SB_VERSION_2;
476         else
477                 ic->sb->version = SB_VERSION_1;
478 }
479
480 static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags)
481 {
482         struct dm_io_request io_req;
483         struct dm_io_region io_loc;
484
485         io_req.bi_op = op;
486         io_req.bi_op_flags = op_flags;
487         io_req.mem.type = DM_IO_KMEM;
488         io_req.mem.ptr.addr = ic->sb;
489         io_req.notify.fn = NULL;
490         io_req.client = ic->io;
491         io_loc.bdev = ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev;
492         io_loc.sector = ic->start;
493         io_loc.count = SB_SECTORS;
494
495         if (op == REQ_OP_WRITE)
496                 sb_set_version(ic);
497
498         return dm_io(&io_req, 1, &io_loc, NULL);
499 }
500
501 #define BITMAP_OP_TEST_ALL_SET          0
502 #define BITMAP_OP_TEST_ALL_CLEAR        1
503 #define BITMAP_OP_SET                   2
504 #define BITMAP_OP_CLEAR                 3
505
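/*
 * Perform one of the BITMAP_OP_* operations on the given sector range of
 * an in-memory bitmap. Sectors are converted to bitmap bits using
 * log2_sectors_per_block + log2_blocks_per_bitmap_bit. The TEST_ALL_*
 * modes return whether every bit in the range is set (or clear); SET and
 * CLEAR modify the bits and always return true.
 */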
506 static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap,
507                             sector_t sector, sector_t n_sectors, int mode)
508 {
509         unsigned long bit, end_bit, this_end_bit, page, end_page;
510         unsigned long *data;
511
512         if (unlikely(((sector | n_sectors) & ((1 << ic->sb->log2_sectors_per_block) - 1)) != 0)) {
513                 DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)",
514                         (unsigned long long)sector,
515                         (unsigned long long)n_sectors,
516                         ic->sb->log2_sectors_per_block,
517                         ic->log2_blocks_per_bitmap_bit,
518                         mode);
519                 BUG();
520         }
521
522         if (unlikely(!n_sectors))
523                 return true;
524
525         bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
526         end_bit = (sector + n_sectors - 1) >>
527                 (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
528
529         page = bit / (PAGE_SIZE * 8);
530         bit %= PAGE_SIZE * 8;
531
532         end_page = end_bit / (PAGE_SIZE * 8);
533         end_bit %= PAGE_SIZE * 8;
534
535 repeat:
536         if (page < end_page) {
537                 this_end_bit = PAGE_SIZE * 8 - 1;
538         } else {
539                 this_end_bit = end_bit;
540         }
541
542         data = lowmem_page_address(bitmap[page].page);
543
544         if (mode == BITMAP_OP_TEST_ALL_SET) {
545                 while (bit <= this_end_bit) {
546                         if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
547                                 do {
548                                         if (data[bit / BITS_PER_LONG] != -1)
549                                                 return false;
550                                         bit += BITS_PER_LONG;
551                                 } while (this_end_bit >= bit + BITS_PER_LONG - 1);
552                                 continue;
553                         }
554                         if (!test_bit(bit, data))
555                                 return false;
556                         bit++;
557                 }
558         } else if (mode == BITMAP_OP_TEST_ALL_CLEAR) {
559                 while (bit <= this_end_bit) {
560                         if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
561                                 do {
562                                         if (data[bit / BITS_PER_LONG] != 0)
563                                                 return false;
564                                         bit += BITS_PER_LONG;
565                                 } while (this_end_bit >= bit + BITS_PER_LONG - 1);
566                                 continue;
567                         }
568                         if (test_bit(bit, data))
569                                 return false;
570                         bit++;
571                 }
572         } else if (mode == BITMAP_OP_SET) {
573                 while (bit <= this_end_bit) {
574                         if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
575                                 do {
576                                         data[bit / BITS_PER_LONG] = -1;
577                                         bit += BITS_PER_LONG;
578                                 } while (this_end_bit >= bit + BITS_PER_LONG - 1);
579                                 continue;
580                         }
581                         __set_bit(bit, data);
582                         bit++;
583                 }
584         } else if (mode == BITMAP_OP_CLEAR) {
585                 if (!bit && this_end_bit == PAGE_SIZE * 8 - 1)
586                         clear_page(data);
587                 else while (bit <= this_end_bit) {
588                         if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
589                                 do {
590                                         data[bit / BITS_PER_LONG] = 0;
591                                         bit += BITS_PER_LONG;
592                                 } while (this_end_bit >= bit + BITS_PER_LONG - 1);
593                                 continue;
594                         }
595                         __clear_bit(bit, data);
596                         bit++;
597                 }
598         } else {
599                 BUG();
600         }
601
602         if (unlikely(page < end_page)) {
603                 bit = 0;
604                 page++;
605                 goto repeat;
606         }
607
608         return true;
609 }
610
611 static void block_bitmap_copy(struct dm_integrity_c *ic, struct page_list *dst, struct page_list *src)
612 {
613         unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE);
614         unsigned i;
615
616         for (i = 0; i < n_bitmap_pages; i++) {
617                 unsigned long *dst_data = lowmem_page_address(dst[i].page);
618                 unsigned long *src_data = lowmem_page_address(src[i].page);
619                 copy_page(dst_data, src_data);
620         }
621 }
622
623 static struct bitmap_block_status *sector_to_bitmap_block(struct dm_integrity_c *ic, sector_t sector)
624 {
625         unsigned bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
626         unsigned bitmap_block = bit / (BITMAP_BLOCK_SIZE * 8);
627
628         BUG_ON(bitmap_block >= ic->n_bitmap_blocks);
629         return &ic->bbs[bitmap_block];
630 }
631
632 static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset,
633                                  bool e, const char *function)
634 {
635 #if defined(CONFIG_DM_DEBUG) || defined(INTERNAL_VERIFY)
636         unsigned limit = e ? ic->journal_section_entries : ic->journal_section_sectors;
637
638         if (unlikely(section >= ic->journal_sections) ||
639             unlikely(offset >= limit)) {
640                 DMCRIT("%s: invalid access at (%u,%u), limit (%u,%u)",
641                        function, section, offset, ic->journal_sections, limit);
642                 BUG();
643         }
644 #endif
645 }
646
647 static void page_list_location(struct dm_integrity_c *ic, unsigned section, unsigned offset,
648                                unsigned *pl_index, unsigned *pl_offset)
649 {
650         unsigned sector;
651
652         access_journal_check(ic, section, offset, false, "page_list_location");
653
654         sector = section * ic->journal_section_sectors + offset;
655
656         *pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
657         *pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
658 }
659
660 static struct journal_sector *access_page_list(struct dm_integrity_c *ic, struct page_list *pl,
661                                                unsigned section, unsigned offset, unsigned *n_sectors)
662 {
663         unsigned pl_index, pl_offset;
664         char *va;
665
666         page_list_location(ic, section, offset, &pl_index, &pl_offset);
667
668         if (n_sectors)
669                 *n_sectors = (PAGE_SIZE - pl_offset) >> SECTOR_SHIFT;
670
671         va = lowmem_page_address(pl[pl_index].page);
672
673         return (struct journal_sector *)(va + pl_offset);
674 }
675
676 static struct journal_sector *access_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset)
677 {
678         return access_page_list(ic, ic->journal, section, offset, NULL);
679 }
680
681 static struct journal_entry *access_journal_entry(struct dm_integrity_c *ic, unsigned section, unsigned n)
682 {
683         unsigned rel_sector, offset;
684         struct journal_sector *js;
685
686         access_journal_check(ic, section, n, true, "access_journal_entry");
687
688         rel_sector = n % JOURNAL_BLOCK_SECTORS;
689         offset = n / JOURNAL_BLOCK_SECTORS;
690
691         js = access_journal(ic, section, rel_sector);
692         return (struct journal_entry *)((char *)js + offset * ic->journal_entry_size);
693 }
694
695 static struct journal_sector *access_journal_data(struct dm_integrity_c *ic, unsigned section, unsigned n)
696 {
697         n <<= ic->sb->log2_sectors_per_block;
698
699         n += JOURNAL_BLOCK_SECTORS;
700
701         access_journal_check(ic, section, n, false, "access_journal_data");
702
703         return access_journal(ic, section, n);
704 }
705
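/*
 * Compute the journal MAC (ic->journal_mac) over the sector numbers of
 * all journal entries in one section. The digest is truncated or
 * zero-padded to JOURNAL_MAC_SIZE bytes; rw_section_mac() spreads it
 * across the entry sectors, JOURNAL_MAC_PER_SECTOR bytes per sector.
 */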
706 static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result[JOURNAL_MAC_SIZE])
707 {
708         SHASH_DESC_ON_STACK(desc, ic->journal_mac);
709         int r;
710         unsigned j, size;
711
712         desc->tfm = ic->journal_mac;
713
714         r = crypto_shash_init(desc);
715         if (unlikely(r)) {
716                 dm_integrity_io_error(ic, "crypto_shash_init", r);
717                 goto err;
718         }
719
720         for (j = 0; j < ic->journal_section_entries; j++) {
721                 struct journal_entry *je = access_journal_entry(ic, section, j);
722                 r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof je->u.sector);
723                 if (unlikely(r)) {
724                         dm_integrity_io_error(ic, "crypto_shash_update", r);
725                         goto err;
726                 }
727         }
728
729         size = crypto_shash_digestsize(ic->journal_mac);
730
731         if (likely(size <= JOURNAL_MAC_SIZE)) {
732                 r = crypto_shash_final(desc, result);
733                 if (unlikely(r)) {
734                         dm_integrity_io_error(ic, "crypto_shash_final", r);
735                         goto err;
736                 }
737                 memset(result + size, 0, JOURNAL_MAC_SIZE - size);
738         } else {
739                 __u8 digest[HASH_MAX_DIGESTSIZE];
740
741                 if (WARN_ON(size > sizeof(digest))) {
742                         dm_integrity_io_error(ic, "digest_size", -EINVAL);
743                         goto err;
744                 }
745                 r = crypto_shash_final(desc, digest);
746                 if (unlikely(r)) {
747                         dm_integrity_io_error(ic, "crypto_shash_final", r);
748                         goto err;
749                 }
750                 memcpy(result, digest, JOURNAL_MAC_SIZE);
751         }
752
753         return;
754 err:
755         memset(result, 0, JOURNAL_MAC_SIZE);
756 }
757
758 static void rw_section_mac(struct dm_integrity_c *ic, unsigned section, bool wr)
759 {
760         __u8 result[JOURNAL_MAC_SIZE];
761         unsigned j;
762
763         if (!ic->journal_mac)
764                 return;
765
766         section_mac(ic, section, result);
767
768         for (j = 0; j < JOURNAL_BLOCK_SECTORS; j++) {
769                 struct journal_sector *js = access_journal(ic, section, j);
770
771                 if (likely(wr))
772                         memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR);
773                 else {
774                         if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR))
775                                 dm_integrity_io_error(ic, "journal mac", -EILSEQ);
776                 }
777         }
778 }
779
780 static void complete_journal_op(void *context)
781 {
782         struct journal_completion *comp = context;
783         BUG_ON(!atomic_read(&comp->in_flight));
784         if (likely(atomic_dec_and_test(&comp->in_flight)))
785                 complete(&comp->comp);
786 }
787
788 static void xor_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
789                         unsigned n_sections, struct journal_completion *comp)
790 {
791         struct async_submit_ctl submit;
792         size_t n_bytes = (size_t)(n_sections * ic->journal_section_sectors) << SECTOR_SHIFT;
793         unsigned pl_index, pl_offset, section_index;
794         struct page_list *source_pl, *target_pl;
795
796         if (likely(encrypt)) {
797                 source_pl = ic->journal;
798                 target_pl = ic->journal_io;
799         } else {
800                 source_pl = ic->journal_io;
801                 target_pl = ic->journal;
802         }
803
804         page_list_location(ic, section, 0, &pl_index, &pl_offset);
805
806         atomic_add(roundup(pl_offset + n_bytes, PAGE_SIZE) >> PAGE_SHIFT, &comp->in_flight);
807
808         init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, complete_journal_op, comp, NULL);
809
810         section_index = pl_index;
811
812         do {
813                 size_t this_step;
814                 struct page *src_pages[2];
815                 struct page *dst_page;
816
817                 while (unlikely(pl_index == section_index)) {
818                         unsigned dummy;
819                         if (likely(encrypt))
820                                 rw_section_mac(ic, section, true);
821                         section++;
822                         n_sections--;
823                         if (!n_sections)
824                                 break;
825                         page_list_location(ic, section, 0, &section_index, &dummy);
826                 }
827
828                 this_step = min(n_bytes, (size_t)PAGE_SIZE - pl_offset);
829                 dst_page = target_pl[pl_index].page;
830                 src_pages[0] = source_pl[pl_index].page;
831                 src_pages[1] = ic->journal_xor[pl_index].page;
832
833                 async_xor(dst_page, src_pages, pl_offset, 2, this_step, &submit);
834
835                 pl_index++;
836                 pl_offset = 0;
837                 n_bytes -= this_step;
838         } while (n_bytes);
839
840         BUG_ON(n_sections);
841
842         async_tx_issue_pending_all();
843 }
844
845 static void complete_journal_encrypt(struct crypto_async_request *req, int err)
846 {
847         struct journal_completion *comp = req->data;
848         if (unlikely(err)) {
849                 if (likely(err == -EINPROGRESS)) {
850                         complete(&comp->ic->crypto_backoff);
851                         return;
852                 }
853                 dm_integrity_io_error(comp->ic, "asynchronous encrypt", err);
854         }
855         complete_journal_op(comp);
856 }
857
858 static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp)
859 {
860         int r;
861         skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
862                                       complete_journal_encrypt, comp);
863         if (likely(encrypt))
864                 r = crypto_skcipher_encrypt(req);
865         else
866                 r = crypto_skcipher_decrypt(req);
867         if (likely(!r))
868                 return false;
869         if (likely(r == -EINPROGRESS))
870                 return true;
871         if (likely(r == -EBUSY)) {
872                 wait_for_completion(&comp->ic->crypto_backoff);
873                 reinit_completion(&comp->ic->crypto_backoff);
874                 return true;
875         }
876         dm_integrity_io_error(comp->ic, "encrypt", r);
877         return false;
878 }
879
880 static void crypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
881                           unsigned n_sections, struct journal_completion *comp)
882 {
883         struct scatterlist **source_sg;
884         struct scatterlist **target_sg;
885
886         atomic_add(2, &comp->in_flight);
887
888         if (likely(encrypt)) {
889                 source_sg = ic->journal_scatterlist;
890                 target_sg = ic->journal_io_scatterlist;
891         } else {
892                 source_sg = ic->journal_io_scatterlist;
893                 target_sg = ic->journal_scatterlist;
894         }
895
896         do {
897                 struct skcipher_request *req;
898                 unsigned ivsize;
899                 char *iv;
900
901                 if (likely(encrypt))
902                         rw_section_mac(ic, section, true);
903
904                 req = ic->sk_requests[section];
905                 ivsize = crypto_skcipher_ivsize(ic->journal_crypt);
906                 iv = req->iv;
907
908                 memcpy(iv, iv + ivsize, ivsize);
909
910                 req->src = source_sg[section];
911                 req->dst = target_sg[section];
912
913                 if (unlikely(do_crypt(encrypt, req, comp)))
914                         atomic_inc(&comp->in_flight);
915
916                 section++;
917                 n_sections--;
918         } while (n_sections);
919
920         atomic_dec(&comp->in_flight);
921         complete_journal_op(comp);
922 }
923
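/*
 * Transform journal sections between their in-memory and on-disk forms.
 * If ic->journal_xor is set, the transformation is a simple XOR with
 * those precomputed pages, offloaded through the async_tx API; otherwise
 * a skcipher request is issued per section. When encrypting, the
 * per-section MACs are written first (rw_section_mac()).
 */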
924 static void encrypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
925                             unsigned n_sections, struct journal_completion *comp)
926 {
927         if (ic->journal_xor)
928                 return xor_journal(ic, encrypt, section, n_sections, comp);
929         else
930                 return crypt_journal(ic, encrypt, section, n_sections, comp);
931 }
932
933 static void complete_journal_io(unsigned long error, void *context)
934 {
935         struct journal_completion *comp = context;
936         if (unlikely(error != 0))
937                 dm_integrity_io_error(comp->ic, "writing journal", -EIO);
938         complete_journal_op(comp);
939 }
940
941 static void rw_journal_sectors(struct dm_integrity_c *ic, int op, int op_flags,
942                                unsigned sector, unsigned n_sectors, struct journal_completion *comp)
943 {
944         struct dm_io_request io_req;
945         struct dm_io_region io_loc;
946         unsigned pl_index, pl_offset;
947         int r;
948
949         if (unlikely(dm_integrity_failed(ic))) {
950                 if (comp)
951                         complete_journal_io(-1UL, comp);
952                 return;
953         }
954
955         pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
956         pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
957
958         io_req.bi_op = op;
959         io_req.bi_op_flags = op_flags;
960         io_req.mem.type = DM_IO_PAGE_LIST;
961         if (ic->journal_io)
962                 io_req.mem.ptr.pl = &ic->journal_io[pl_index];
963         else
964                 io_req.mem.ptr.pl = &ic->journal[pl_index];
965         io_req.mem.offset = pl_offset;
966         if (likely(comp != NULL)) {
967                 io_req.notify.fn = complete_journal_io;
968                 io_req.notify.context = comp;
969         } else {
970                 io_req.notify.fn = NULL;
971         }
972         io_req.client = ic->io;
973         io_loc.bdev = ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev;
974         io_loc.sector = ic->start + SB_SECTORS + sector;
975         io_loc.count = n_sectors;
976
977         r = dm_io(&io_req, 1, &io_loc, NULL);
978         if (unlikely(r)) {
979                 dm_integrity_io_error(ic, op == REQ_OP_READ ? "reading journal" : "writing journal", r);
980                 if (comp) {
981                         WARN_ONCE(1, "asynchronous dm_io failed: %d", r);
982                         complete_journal_io(-1UL, comp);
983                 }
984         }
985 }
986
987 static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section,
988                        unsigned n_sections, struct journal_completion *comp)
989 {
990         unsigned sector, n_sectors;
991
992         sector = section * ic->journal_section_sectors;
993         n_sectors = n_sections * ic->journal_section_sectors;
994
995         rw_journal_sectors(ic, op, op_flags, sector, n_sectors, comp);
996 }
997
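/*
 * Write commit_sections journal sections, starting at commit_start, to
 * the on-disk journal with REQ_FUA. If the range wraps around the end of
 * the journal it is split into two writes, and encryption of the second
 * part is overlapped with the I/O of the first part where possible.
 */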
998 static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections)
999 {
1000         struct journal_completion io_comp;
1001         struct journal_completion crypt_comp_1;
1002         struct journal_completion crypt_comp_2;
1003         unsigned i;
1004
1005         io_comp.ic = ic;
1006         init_completion(&io_comp.comp);
1007
1008         if (commit_start + commit_sections <= ic->journal_sections) {
1009                 io_comp.in_flight = (atomic_t)ATOMIC_INIT(1);
1010                 if (ic->journal_io) {
1011                         crypt_comp_1.ic = ic;
1012                         init_completion(&crypt_comp_1.comp);
1013                         crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
1014                         encrypt_journal(ic, true, commit_start, commit_sections, &crypt_comp_1);
1015                         wait_for_completion_io(&crypt_comp_1.comp);
1016                 } else {
1017                         for (i = 0; i < commit_sections; i++)
1018                                 rw_section_mac(ic, commit_start + i, true);
1019                 }
1020                 rw_journal(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, commit_start,
1021                            commit_sections, &io_comp);
1022         } else {
1023                 unsigned to_end;
1024                 io_comp.in_flight = (atomic_t)ATOMIC_INIT(2);
1025                 to_end = ic->journal_sections - commit_start;
1026                 if (ic->journal_io) {
1027                         crypt_comp_1.ic = ic;
1028                         init_completion(&crypt_comp_1.comp);
1029                         crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
1030                         encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1);
1031                         if (try_wait_for_completion(&crypt_comp_1.comp)) {
1032                                 rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
1033                                 reinit_completion(&crypt_comp_1.comp);
1034                                 crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
1035                                 encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1);
1036                                 wait_for_completion_io(&crypt_comp_1.comp);
1037                         } else {
1038                                 crypt_comp_2.ic = ic;
1039                                 init_completion(&crypt_comp_2.comp);
1040                                 crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0);
1041                                 encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2);
1042                                 wait_for_completion_io(&crypt_comp_1.comp);
1043                                 rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
1044                                 wait_for_completion_io(&crypt_comp_2.comp);
1045                         }
1046                 } else {
1047                         for (i = 0; i < to_end; i++)
1048                                 rw_section_mac(ic, commit_start + i, true);
1049                         rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
1050                         for (i = 0; i < commit_sections - to_end; i++)
1051                                 rw_section_mac(ic, i, true);
1052                 }
1053                 rw_journal(ic, REQ_OP_WRITE, REQ_FUA, 0, commit_sections - to_end, &io_comp);
1054         }
1055
1056         wait_for_completion_io(&io_comp.comp);
1057 }
1058
1059 static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset,
1060                               unsigned n_sectors, sector_t target, io_notify_fn fn, void *data)
1061 {
1062         struct dm_io_request io_req;
1063         struct dm_io_region io_loc;
1064         int r;
1065         unsigned sector, pl_index, pl_offset;
1066
1067         BUG_ON((target | n_sectors | offset) & (unsigned)(ic->sectors_per_block - 1));
1068
1069         if (unlikely(dm_integrity_failed(ic))) {
1070                 fn(-1UL, data);
1071                 return;
1072         }
1073
1074         sector = section * ic->journal_section_sectors + JOURNAL_BLOCK_SECTORS + offset;
1075
1076         pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
1077         pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
1078
1079         io_req.bi_op = REQ_OP_WRITE;
1080         io_req.bi_op_flags = 0;
1081         io_req.mem.type = DM_IO_PAGE_LIST;
1082         io_req.mem.ptr.pl = &ic->journal[pl_index];
1083         io_req.mem.offset = pl_offset;
1084         io_req.notify.fn = fn;
1085         io_req.notify.context = data;
1086         io_req.client = ic->io;
1087         io_loc.bdev = ic->dev->bdev;
1088         io_loc.sector = target;
1089         io_loc.count = n_sectors;
1090
1091         r = dm_io(&io_req, 1, &io_loc, NULL);
1092         if (unlikely(r)) {
1093                 WARN_ONCE(1, "asynchronous dm_io failed: %d", r);
1094                 fn(-1UL, data);
1095         }
1096 }
1097
1098 static bool ranges_overlap(struct dm_integrity_range *range1, struct dm_integrity_range *range2)
1099 {
1100         return range1->logical_sector < range2->logical_sector + range2->n_sectors &&
1101                range1->logical_sector + range1->n_sectors > range2->logical_sector;
1102 }
1103
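/*
 * In-flight ranges are kept in the in_progress rb-tree, keyed by logical
 * sector. add_new_range() returns false if the new range overlaps an
 * in-progress range or, when check_waiting is set, a range queued on
 * wait_list. Callers hold endio_wait.lock.
 */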
1104 static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range, bool check_waiting)
1105 {
1106         struct rb_node **n = &ic->in_progress.rb_node;
1107         struct rb_node *parent;
1108
1109         BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned)(ic->sectors_per_block - 1));
1110
1111         if (likely(check_waiting)) {
1112                 struct dm_integrity_range *range;
1113                 list_for_each_entry(range, &ic->wait_list, wait_entry) {
1114                         if (unlikely(ranges_overlap(range, new_range)))
1115                                 return false;
1116                 }
1117         }
1118
1119         parent = NULL;
1120
1121         while (*n) {
1122                 struct dm_integrity_range *range = container_of(*n, struct dm_integrity_range, node);
1123
1124                 parent = *n;
1125                 if (new_range->logical_sector + new_range->n_sectors <= range->logical_sector) {
1126                         n = &range->node.rb_left;
1127                 } else if (new_range->logical_sector >= range->logical_sector + range->n_sectors) {
1128                         n = &range->node.rb_right;
1129                 } else {
1130                         return false;
1131                 }
1132         }
1133
1134         rb_link_node(&new_range->node, parent, n);
1135         rb_insert_color(&new_range->node, &ic->in_progress);
1136
1137         return true;
1138 }
1139
1140 static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range)
1141 {
1142         rb_erase(&range->node, &ic->in_progress);
1143         while (unlikely(!list_empty(&ic->wait_list))) {
1144                 struct dm_integrity_range *last_range =
1145                         list_first_entry(&ic->wait_list, struct dm_integrity_range, wait_entry);
1146                 struct task_struct *last_range_task;
1147                 last_range_task = last_range->task;
1148                 list_del(&last_range->wait_entry);
1149                 if (!add_new_range(ic, last_range, false)) {
1150                         last_range->task = last_range_task;
1151                         list_add(&last_range->wait_entry, &ic->wait_list);
1152                         break;
1153                 }
1154                 last_range->waiting = false;
1155                 wake_up_process(last_range_task);
1156         }
1157 }
1158
1159 static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range)
1160 {
1161         unsigned long flags;
1162
1163         spin_lock_irqsave(&ic->endio_wait.lock, flags);
1164         remove_range_unlocked(ic, range);
1165         spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
1166 }
1167
1168 static void wait_and_add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
1169 {
1170         new_range->waiting = true;
1171         list_add_tail(&new_range->wait_entry, &ic->wait_list);
1172         new_range->task = current;
1173         do {
1174                 __set_current_state(TASK_UNINTERRUPTIBLE);
1175                 spin_unlock_irq(&ic->endio_wait.lock);
1176                 io_schedule();
1177                 spin_lock_irq(&ic->endio_wait.lock);
1178         } while (unlikely(new_range->waiting));
1179 }
1180
1181 static void add_new_range_and_wait(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
1182 {
1183         if (unlikely(!add_new_range(ic, new_range, true)))
1184                 wait_and_add_new_range(ic, new_range);
1185 }
1186
1187 static void init_journal_node(struct journal_node *node)
1188 {
1189         RB_CLEAR_NODE(&node->node);
1190         node->sector = (sector_t)-1;
1191 }
1192
1193 static void add_journal_node(struct dm_integrity_c *ic, struct journal_node *node, sector_t sector)
1194 {
1195         struct rb_node **link;
1196         struct rb_node *parent;
1197
1198         node->sector = sector;
1199         BUG_ON(!RB_EMPTY_NODE(&node->node));
1200
1201         link = &ic->journal_tree_root.rb_node;
1202         parent = NULL;
1203
1204         while (*link) {
1205                 struct journal_node *j;
1206                 parent = *link;
1207                 j = container_of(parent, struct journal_node, node);
1208                 if (sector < j->sector)
1209                         link = &j->node.rb_left;
1210                 else
1211                         link = &j->node.rb_right;
1212         }
1213
1214         rb_link_node(&node->node, parent, link);
1215         rb_insert_color(&node->node, &ic->journal_tree_root);
1216 }
1217
1218 static void remove_journal_node(struct dm_integrity_c *ic, struct journal_node *node)
1219 {
1220         BUG_ON(RB_EMPTY_NODE(&node->node));
1221         rb_erase(&node->node, &ic->journal_tree_root);
1222         init_journal_node(node);
1223 }
1224
1225 #define NOT_FOUND       (-1U)
1226
1227 static unsigned find_journal_node(struct dm_integrity_c *ic, sector_t sector, sector_t *next_sector)
1228 {
1229         struct rb_node *n = ic->journal_tree_root.rb_node;
1230         unsigned found = NOT_FOUND;
1231         *next_sector = (sector_t)-1;
1232         while (n) {
1233                 struct journal_node *j = container_of(n, struct journal_node, node);
1234                 if (sector == j->sector) {
1235                         found = j - ic->journal_tree;
1236                 }
1237                 if (sector < j->sector) {
1238                         *next_sector = j->sector;
1239                         n = j->node.rb_left;
1240                 } else {
1241                         n = j->node.rb_right;
1242                 }
1243         }
1244
1245         return found;
1246 }
1247
1248 static bool test_journal_node(struct dm_integrity_c *ic, unsigned pos, sector_t sector)
1249 {
1250         struct journal_node *node, *next_node;
1251         struct rb_node *next;
1252
1253         if (unlikely(pos >= ic->journal_entries))
1254                 return false;
1255         node = &ic->journal_tree[pos];
1256         if (unlikely(RB_EMPTY_NODE(&node->node)))
1257                 return false;
1258         if (unlikely(node->sector != sector))
1259                 return false;
1260
1261         next = rb_next(&node->node);
1262         if (unlikely(!next))
1263                 return true;
1264
1265         next_node = container_of(next, struct journal_node, node);
1266         return next_node->sector != sector;
1267 }
1268
1269 static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_node *node)
1270 {
1271         struct rb_node *next;
1272         struct journal_node *next_node;
1273         unsigned next_section;
1274
1275         BUG_ON(RB_EMPTY_NODE(&node->node));
1276
1277         next = rb_next(&node->node);
1278         if (unlikely(!next))
1279                 return false;
1280
1281         next_node = container_of(next, struct journal_node, node);
1282
1283         if (next_node->sector != node->sector)
1284                 return false;
1285
1286         next_section = (unsigned)(next_node - ic->journal_tree) / ic->journal_section_entries;
1287         if (next_section >= ic->committed_section &&
1288             next_section < ic->committed_section + ic->n_committed_sections)
1289                 return true;
1290         if (next_section + ic->journal_sections < ic->committed_section + ic->n_committed_sections)
1291                 return true;
1292
1293         return false;
1294 }
1295
1296 #define TAG_READ        0
1297 #define TAG_WRITE       1
1298 #define TAG_CMP         2
1299
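/*
 * Read, write or compare tag bytes in the metadata area, spanning
 * multiple bufio buffers if needed. For TAG_CMP a mismatch returns the
 * number of tag bytes that were not verified (counted from the first
 * differing byte); 0 means everything matched.
 */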
1300 static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block,
1301                                unsigned *metadata_offset, unsigned total_size, int op)
1302 {
1303         do {
1304                 unsigned char *data, *dp;
1305                 struct dm_buffer *b;
1306                 unsigned to_copy;
1307                 int r;
1308
1309                 r = dm_integrity_failed(ic);
1310                 if (unlikely(r))
1311                         return r;
1312
1313                 data = dm_bufio_read(ic->bufio, *metadata_block, &b);
1314                 if (IS_ERR(data))
1315                         return PTR_ERR(data);
1316
1317                 to_copy = min((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - *metadata_offset, total_size);
1318                 dp = data + *metadata_offset;
1319                 if (op == TAG_READ) {
1320                         memcpy(tag, dp, to_copy);
1321                 } else if (op == TAG_WRITE) {
1322                         memcpy(dp, tag, to_copy);
1323                         dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy);
1324                 } else  {
1325                         /* e.g.: op == TAG_CMP */
1326                         if (unlikely(memcmp(dp, tag, to_copy))) {
1327                                 unsigned i;
1328
1329                                 for (i = 0; i < to_copy; i++) {
1330                                         if (dp[i] != tag[i])
1331                                                 break;
1332                                         total_size--;
1333                                 }
1334                                 dm_bufio_release(b);
1335                                 return total_size;
1336                         }
1337                 }
1338                 dm_bufio_release(b);
1339
1340                 tag += to_copy;
1341                 *metadata_offset += to_copy;
1342                 if (unlikely(*metadata_offset == 1U << SECTOR_SHIFT << ic->log2_buffer_sectors)) {
1343                         (*metadata_block)++;
1344                         *metadata_offset = 0;
1345                 }
1346                 total_size -= to_copy;
1347         } while (unlikely(total_size));
1348
1349         return 0;
1350 }
1351
1352 static void dm_integrity_flush_buffers(struct dm_integrity_c *ic)
1353 {
1354         int r;
1355         r = dm_bufio_write_dirty_buffers(ic->bufio);
1356         if (unlikely(r))
1357                 dm_integrity_io_error(ic, "writing tags", r);
1358 }
1359
1360 static void sleep_on_endio_wait(struct dm_integrity_c *ic)
1361 {
1362         DECLARE_WAITQUEUE(wait, current);
1363         __add_wait_queue(&ic->endio_wait, &wait);
1364         __set_current_state(TASK_UNINTERRUPTIBLE);
1365         spin_unlock_irq(&ic->endio_wait.lock);
1366         io_schedule();
1367         spin_lock_irq(&ic->endio_wait.lock);
1368         __remove_wait_queue(&ic->endio_wait, &wait);
1369 }
1370
1371 static void autocommit_fn(struct timer_list *t)
1372 {
1373         struct dm_integrity_c *ic = from_timer(ic, t, autocommit_timer);
1374
1375         if (likely(!dm_integrity_failed(ic)))
1376                 queue_work(ic->commit_wq, &ic->commit_work);
1377 }
1378
1379 static void schedule_autocommit(struct dm_integrity_c *ic)
1380 {
1381         if (!timer_pending(&ic->autocommit_timer))
1382                 mod_timer(&ic->autocommit_timer, jiffies + ic->autocommit_jiffies);
1383 }
1384
1385 static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
1386 {
1387         struct bio *bio;
1388         unsigned long flags;
1389
1390         spin_lock_irqsave(&ic->endio_wait.lock, flags);
1391         bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1392         bio_list_add(&ic->flush_bio_list, bio);
1393         spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
1394
1395         queue_work(ic->commit_wq, &ic->commit_work);
1396 }
1397
1398 static void do_endio(struct dm_integrity_c *ic, struct bio *bio)
1399 {
1400         int r = dm_integrity_failed(ic);
1401         if (unlikely(r) && !bio->bi_status)
1402                 bio->bi_status = errno_to_blk_status(r);
1403         if (unlikely(ic->synchronous_mode) && bio_op(bio) == REQ_OP_WRITE) {
1404                 unsigned long flags;
1405                 spin_lock_irqsave(&ic->endio_wait.lock, flags);
1406                 bio_list_add(&ic->synchronous_bios, bio);
1407                 queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
1408                 spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
1409                 return;
1410         }
1411         bio_endio(bio);
1412 }
1413
1414 static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
1415 {
1416         struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1417
1418         if (unlikely(dio->fua) && likely(!bio->bi_status) && likely(!dm_integrity_failed(ic)))
1419                 submit_flush_bio(ic, dio);
1420         else
1421                 do_endio(ic, bio);
1422 }
1423
1424 static void dec_in_flight(struct dm_integrity_io *dio)
1425 {
1426         if (atomic_dec_and_test(&dio->in_flight)) {
1427                 struct dm_integrity_c *ic = dio->ic;
1428                 struct bio *bio;
1429
1430                 remove_range(ic, &dio->range);
1431
1432                 if (unlikely(dio->write))
1433                         schedule_autocommit(ic);
1434
1435                 bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1436
1437                 if (unlikely(dio->bi_status) && !bio->bi_status)
1438                         bio->bi_status = dio->bi_status;
1439                 if (likely(!bio->bi_status) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) {
1440                         dio->range.logical_sector += dio->range.n_sectors;
1441                         bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT);
1442                         INIT_WORK(&dio->work, integrity_bio_wait);
1443                         queue_work(ic->offload_wq, &dio->work);
1444                         return;
1445                 }
1446                 do_endio_flush(ic, dio);
1447         }
1448 }
1449
1450 static void integrity_end_io(struct bio *bio)
1451 {
1452         struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
1453
1454         bio->bi_iter = dio->orig_bi_iter;
1455         bio->bi_disk = dio->orig_bi_disk;
1456         bio->bi_partno = dio->orig_bi_partno;
1457         if (dio->orig_bi_integrity) {
1458                 bio->bi_integrity = dio->orig_bi_integrity;
1459                 bio->bi_opf |= REQ_INTEGRITY;
1460         }
1461         bio->bi_end_io = dio->orig_bi_end_io;
1462
1463         if (dio->completion)
1464                 complete(dio->completion);
1465
1466         dec_in_flight(dio);
1467 }
1468
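/*
 * Compute the integrity tag for one block with the internal hash:
 * hash(le64 sector number || block data), zero-padded up to tag_size if
 * the digest is shorter. On (unexpected) hash failure the tag is filled
 * with random bytes so that verification cannot accidentally pass.
 */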
1469 static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector,
1470                                       const char *data, char *result)
1471 {
1472         __u64 sector_le = cpu_to_le64(sector);
1473         SHASH_DESC_ON_STACK(req, ic->internal_hash);
1474         int r;
1475         unsigned digest_size;
1476
1477         req->tfm = ic->internal_hash;
1478
1479         r = crypto_shash_init(req);
1480         if (unlikely(r < 0)) {
1481                 dm_integrity_io_error(ic, "crypto_shash_init", r);
1482                 goto failed;
1483         }
1484
1485         r = crypto_shash_update(req, (const __u8 *)&sector_le, sizeof sector_le);
1486         if (unlikely(r < 0)) {
1487                 dm_integrity_io_error(ic, "crypto_shash_update", r);
1488                 goto failed;
1489         }
1490
1491         r = crypto_shash_update(req, data, ic->sectors_per_block << SECTOR_SHIFT);
1492         if (unlikely(r < 0)) {
1493                 dm_integrity_io_error(ic, "crypto_shash_update", r);
1494                 goto failed;
1495         }
1496
1497         r = crypto_shash_final(req, result);
1498         if (unlikely(r < 0)) {
1499                 dm_integrity_io_error(ic, "crypto_shash_final", r);
1500                 goto failed;
1501         }
1502
1503         digest_size = crypto_shash_digestsize(ic->internal_hash);
1504         if (unlikely(digest_size < ic->tag_size))
1505                 memset(result + digest_size, 0, ic->tag_size - digest_size);
1506
1507         return;
1508
1509 failed:
1510         /* this shouldn't happen anyway; the hash functions have no reason to fail */
1511         get_random_bytes(result, ic->tag_size);
1512 }
1513
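/*
 * Workqueue function that processes the metadata for one request.
 * With an internal hash, checksums are computed over the bio's data and
 * either compared against the stored tags (reads) or written out (writes).
 * Without an internal hash, tags are copied between the bio integrity
 * payload and the metadata area.
 */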
1514 static void integrity_metadata(struct work_struct *w)
1515 {
1516         struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
1517         struct dm_integrity_c *ic = dio->ic;
1518
1519         int r;
1520
1521         if (ic->internal_hash) {
1522                 struct bvec_iter iter;
1523                 struct bio_vec bv;
1524                 unsigned digest_size = crypto_shash_digestsize(ic->internal_hash);
1525                 struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1526                 char *checksums;
1527                 unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
1528                 char checksums_onstack[HASH_MAX_DIGESTSIZE];
1529                 unsigned sectors_to_process = dio->range.n_sectors;
1530                 sector_t sector = dio->range.logical_sector;
1531
1532                 if (unlikely(ic->mode == 'R'))
1533                         goto skip_io;
1534
1535                 checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
1536                                     GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
1537                 if (!checksums) {
1538                         checksums = checksums_onstack;
1539                         if (WARN_ON(extra_space &&
1540                                     digest_size > sizeof(checksums_onstack))) {
1541                                 r = -EINVAL;
1542                                 goto error;
1543                         }
1544                 }
1545
1546                 __bio_for_each_segment(bv, bio, iter, dio->orig_bi_iter) {
1547                         unsigned pos;
1548                         char *mem, *checksums_ptr;
1549
1550 again:
1551                         mem = (char *)kmap_atomic(bv.bv_page) + bv.bv_offset;
1552                         pos = 0;
1553                         checksums_ptr = checksums;
1554                         do {
1555                                 integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr);
1556                                 checksums_ptr += ic->tag_size;
1557                                 sectors_to_process -= ic->sectors_per_block;
1558                                 pos += ic->sectors_per_block << SECTOR_SHIFT;
1559                                 sector += ic->sectors_per_block;
1560                         } while (pos < bv.bv_len && sectors_to_process && checksums != checksums_onstack);
1561                         kunmap_atomic(mem);
1562
1563                         r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
1564                                                 checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE);
1565                         if (unlikely(r)) {
1566                                 if (r > 0) {
1567                                         DMERR_LIMIT("Checksum failed at sector 0x%llx",
1568                                                     (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size)));
1569                                         r = -EILSEQ;
1570                                         atomic64_inc(&ic->number_of_mismatches);
1571                                 }
1572                                 if (likely(checksums != checksums_onstack))
1573                                         kfree(checksums);
1574                                 goto error;
1575                         }
1576
1577                         if (!sectors_to_process)
1578                                 break;
1579
1580                         if (unlikely(pos < bv.bv_len)) {
1581                                 bv.bv_offset += pos;
1582                                 bv.bv_len -= pos;
1583                                 goto again;
1584                         }
1585                 }
1586
1587                 if (likely(checksums != checksums_onstack))
1588                         kfree(checksums);
1589         } else {
1590                 struct bio_integrity_payload *bip = dio->orig_bi_integrity;
1591
1592                 if (bip) {
1593                         struct bio_vec biv;
1594                         struct bvec_iter iter;
1595                         unsigned data_to_process = dio->range.n_sectors;
1596                         sector_to_block(ic, data_to_process);
1597                         data_to_process *= ic->tag_size;
1598
1599                         bip_for_each_vec(biv, bip, iter) {
1600                                 unsigned char *tag;
1601                                 unsigned this_len;
1602
1603                                 BUG_ON(PageHighMem(biv.bv_page));
1604                                 tag = lowmem_page_address(biv.bv_page) + biv.bv_offset;
1605                                 this_len = min(biv.bv_len, data_to_process);
1606                                 r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset,
1607                                                         this_len, !dio->write ? TAG_READ : TAG_WRITE);
1608                                 if (unlikely(r))
1609                                         goto error;
1610                                 data_to_process -= this_len;
1611                                 if (!data_to_process)
1612                                         break;
1613                         }
1614                 }
1615         }
1616 skip_io:
1617         dec_in_flight(dio);
1618         return;
1619 error:
1620         dio->bi_status = errno_to_blk_status(r);
1621         dec_in_flight(dio);
1622 }
1623
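/*
 * The target's map method: handle flushes separately, check bounds and
 * block alignment, validate the size of an attached integrity payload,
 * clear REQ_FUA (FUA semantics are provided by flushing after the journal
 * commit), remap the sector to the data area and continue processing.
 */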
1624 static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
1625 {
1626         struct dm_integrity_c *ic = ti->private;
1627         struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
1628         struct bio_integrity_payload *bip;
1629
1630         sector_t area, offset;
1631
1632         dio->ic = ic;
1633         dio->bi_status = 0;
1634
1635         if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1636                 submit_flush_bio(ic, dio);
1637                 return DM_MAPIO_SUBMITTED;
1638         }
1639
1640         dio->range.logical_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1641         dio->write = bio_op(bio) == REQ_OP_WRITE;
1642         dio->fua = dio->write && bio->bi_opf & REQ_FUA;
1643         if (unlikely(dio->fua)) {
1644                 /*
1645                  * Don't pass down the FUA flag because we have to flush
1646                  * the disk cache anyway.
1647                  */
1648                 bio->bi_opf &= ~REQ_FUA;
1649         }
1650         if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) {
1651                 DMERR("Sector number too big: 0x%llx + 0x%x > 0x%llx",
1652                       (unsigned long long)dio->range.logical_sector, bio_sectors(bio),
1653                       (unsigned long long)ic->provided_data_sectors);
1654                 return DM_MAPIO_KILL;
1655         }
1656         if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned)(ic->sectors_per_block - 1))) {
1657                 DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x",
1658                       ic->sectors_per_block,
1659                       (unsigned long long)dio->range.logical_sector, bio_sectors(bio));
1660                 return DM_MAPIO_KILL;
1661         }
1662
1663         if (ic->sectors_per_block > 1) {
1664                 struct bvec_iter iter;
1665                 struct bio_vec bv;
1666                 bio_for_each_segment(bv, bio, iter) {
1667                         if (unlikely(bv.bv_len & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) {
1668                                 DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary",
1669                                         bv.bv_offset, bv.bv_len, ic->sectors_per_block);
1670                                 return DM_MAPIO_KILL;
1671                         }
1672                 }
1673         }
1674
1675         bip = bio_integrity(bio);
1676         if (!ic->internal_hash) {
1677                 if (bip) {
1678                         unsigned wanted_tag_size = bio_sectors(bio) >> ic->sb->log2_sectors_per_block;
1679                         if (ic->log2_tag_size >= 0)
1680                                 wanted_tag_size <<= ic->log2_tag_size;
1681                         else
1682                                 wanted_tag_size *= ic->tag_size;
1683                         if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) {
1684                                 DMERR("Invalid integrity data size %u, expected %u",
1685                                       bip->bip_iter.bi_size, wanted_tag_size);
1686                                 return DM_MAPIO_KILL;
1687                         }
1688                 }
1689         } else {
1690                 if (unlikely(bip != NULL)) {
1691                         DMERR("Unexpected integrity data when using internal hash");
1692                         return DM_MAPIO_KILL;
1693                 }
1694         }
1695
1696         if (unlikely(ic->mode == 'R') && unlikely(dio->write))
1697                 return DM_MAPIO_KILL;
1698
1699         get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
1700         dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset);
1701         bio->bi_iter.bi_sector = get_data_sector(ic, area, offset);
1702
1703         dm_integrity_map_continue(dio, true);
1704         return DM_MAPIO_SUBMITTED;
1705 }
1706
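/*
 * Copy data between the bio and the in-memory journal.
 * For writes, the bio's data and tags are copied into the allocated journal
 * entries, checksums are computed if an internal hash is used and the entry
 * is published by setting its sector number. For reads, in-progress entries
 * are waited for and the journalled data and tags are copied back to the
 * bio. Returns true if the bio was only partially processed and the caller
 * must retake the lock and continue with the remaining sectors.
 */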
1707 static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
1708                                  unsigned journal_section, unsigned journal_entry)
1709 {
1710         struct dm_integrity_c *ic = dio->ic;
1711         sector_t logical_sector;
1712         unsigned n_sectors;
1713
1714         logical_sector = dio->range.logical_sector;
1715         n_sectors = dio->range.n_sectors;
1716         do {
1717                 struct bio_vec bv = bio_iovec(bio);
1718                 char *mem;
1719
1720                 if (unlikely(bv.bv_len >> SECTOR_SHIFT > n_sectors))
1721                         bv.bv_len = n_sectors << SECTOR_SHIFT;
1722                 n_sectors -= bv.bv_len >> SECTOR_SHIFT;
1723                 bio_advance_iter(bio, &bio->bi_iter, bv.bv_len);
1724 retry_kmap:
1725                 mem = kmap_atomic(bv.bv_page);
1726                 if (likely(dio->write))
1727                         flush_dcache_page(bv.bv_page);
1728
1729                 do {
1730                         struct journal_entry *je = access_journal_entry(ic, journal_section, journal_entry);
1731
1732                         if (unlikely(!dio->write)) {
1733                                 struct journal_sector *js;
1734                                 char *mem_ptr;
1735                                 unsigned s;
1736
1737                                 if (unlikely(journal_entry_is_inprogress(je))) {
1738                                         flush_dcache_page(bv.bv_page);
1739                                         kunmap_atomic(mem);
1740
1741                                         __io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je));
1742                                         goto retry_kmap;
1743                                 }
1744                                 smp_rmb();
1745                                 BUG_ON(journal_entry_get_sector(je) != logical_sector);
1746                                 js = access_journal_data(ic, journal_section, journal_entry);
1747                                 mem_ptr = mem + bv.bv_offset;
1748                                 s = 0;
1749                                 do {
1750                                         memcpy(mem_ptr, js, JOURNAL_SECTOR_DATA);
1751                                         *(commit_id_t *)(mem_ptr + JOURNAL_SECTOR_DATA) = je->last_bytes[s];
1752                                         js++;
1753                                         mem_ptr += 1 << SECTOR_SHIFT;
1754                                 } while (++s < ic->sectors_per_block);
1755 #ifdef INTERNAL_VERIFY
1756                                 if (ic->internal_hash) {
1757                                         char checksums_onstack[max(HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
1758
1759                                         integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
1760                                         if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
1761                                                 DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx",
1762                                                             (unsigned long long)logical_sector);
1763                                         }
1764                                 }
1765 #endif
1766                         }
1767
1768                         if (!ic->internal_hash) {
1769                                 struct bio_integrity_payload *bip = bio_integrity(bio);
1770                                 unsigned tag_todo = ic->tag_size;
1771                                 char *tag_ptr = journal_entry_tag(ic, je);
1772
1773                                 if (bip) do {
1774                                         struct bio_vec biv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
1775                                         unsigned tag_now = min(biv.bv_len, tag_todo);
1776                                         char *tag_addr;
1777                                         BUG_ON(PageHighMem(biv.bv_page));
1778                                         tag_addr = lowmem_page_address(biv.bv_page) + biv.bv_offset;
1779                                         if (likely(dio->write))
1780                                                 memcpy(tag_ptr, tag_addr, tag_now);
1781                                         else
1782                                                 memcpy(tag_addr, tag_ptr, tag_now);
1783                                         bvec_iter_advance(bip->bip_vec, &bip->bip_iter, tag_now);
1784                                         tag_ptr += tag_now;
1785                                         tag_todo -= tag_now;
1786                                 } while (unlikely(tag_todo)); else {
1787                                         if (likely(dio->write))
1788                                                 memset(tag_ptr, 0, tag_todo);
1789                                 }
1790                         }
1791
1792                         if (likely(dio->write)) {
1793                                 struct journal_sector *js;
1794                                 unsigned s;
1795
1796                                 js = access_journal_data(ic, journal_section, journal_entry);
1797                                 memcpy(js, mem + bv.bv_offset, ic->sectors_per_block << SECTOR_SHIFT);
1798
1799                                 s = 0;
1800                                 do {
1801                                         je->last_bytes[s] = js[s].commit_id;
1802                                 } while (++s < ic->sectors_per_block);
1803
1804                                 if (ic->internal_hash) {
1805                                         unsigned digest_size = crypto_shash_digestsize(ic->internal_hash);
1806                                         if (unlikely(digest_size > ic->tag_size)) {
1807                                                 char checksums_onstack[HASH_MAX_DIGESTSIZE];
1808                                                 integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack);
1809                                                 memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size);
1810                                         } else
1811                                                 integrity_sector_checksum(ic, logical_sector, (char *)js, journal_entry_tag(ic, je));
1812                                 }
1813
1814                                 journal_entry_set_sector(je, logical_sector);
1815                         }
1816                         logical_sector += ic->sectors_per_block;
1817
1818                         journal_entry++;
1819                         if (unlikely(journal_entry == ic->journal_section_entries)) {
1820                                 journal_entry = 0;
1821                                 journal_section++;
1822                                 wraparound_section(ic, &journal_section);
1823                         }
1824
1825                         bv.bv_offset += ic->sectors_per_block << SECTOR_SHIFT;
1826                 } while (bv.bv_len -= ic->sectors_per_block << SECTOR_SHIFT);
1827
1828                 if (unlikely(!dio->write))
1829                         flush_dcache_page(bv.bv_page);
1830                 kunmap_atomic(mem);
1831         } while (n_sectors);
1832
1833         if (likely(dio->write)) {
1834                 smp_mb();
1835                 if (unlikely(waitqueue_active(&ic->copy_to_journal_wait)))
1836                         wake_up(&ic->copy_to_journal_wait);
1837                 if (READ_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) {
1838                         queue_work(ic->commit_wq, &ic->commit_work);
1839                 } else {
1840                         schedule_autocommit(ic);
1841                 }
1842         } else {
1843                 remove_range(ic, &dio->range);
1844         }
1845
1846         if (unlikely(bio->bi_iter.bi_size)) {
1847                 sector_t area, offset;
1848
1849                 dio->range.logical_sector = logical_sector;
1850                 get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
1851                 dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset);
1852                 return true;
1853         }
1854
1855         return false;
1856 }
1857
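/*
 * The main I/O path. In journal mode, writes are allocated space in the
 * journal and reads are looked up in the journal tree; a range lock over
 * the affected sectors is taken before touching the data area. Reads with
 * an internal hash are handled synchronously from a workqueue so the
 * checksums can be verified when the device I/O completes; otherwise
 * metadata processing is offloaded to metadata_wq.
 */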
1858 static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map)
1859 {
1860         struct dm_integrity_c *ic = dio->ic;
1861         struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1862         unsigned journal_section, journal_entry;
1863         unsigned journal_read_pos;
1864         struct completion read_comp;
1865         bool need_sync_io = ic->internal_hash && !dio->write;
1866
1867         if (need_sync_io && from_map) {
1868                 INIT_WORK(&dio->work, integrity_bio_wait);
1869                 queue_work(ic->offload_wq, &dio->work);
1870                 return;
1871         }
1872
1873 lock_retry:
1874         spin_lock_irq(&ic->endio_wait.lock);
1875 retry:
1876         if (unlikely(dm_integrity_failed(ic))) {
1877                 spin_unlock_irq(&ic->endio_wait.lock);
1878                 do_endio(ic, bio);
1879                 return;
1880         }
1881         dio->range.n_sectors = bio_sectors(bio);
1882         journal_read_pos = NOT_FOUND;
1883         if (likely(ic->mode == 'J')) {
1884                 if (dio->write) {
1885                         unsigned next_entry, i, pos;
1886                         unsigned ws, we, range_sectors;
1887
1888                         dio->range.n_sectors = min(dio->range.n_sectors,
1889                                                    (sector_t)ic->free_sectors << ic->sb->log2_sectors_per_block);
1890                         if (unlikely(!dio->range.n_sectors)) {
1891                                 if (from_map)
1892                                         goto offload_to_thread;
1893                                 sleep_on_endio_wait(ic);
1894                                 goto retry;
1895                         }
1896                         range_sectors = dio->range.n_sectors >> ic->sb->log2_sectors_per_block;
1897                         ic->free_sectors -= range_sectors;
1898                         journal_section = ic->free_section;
1899                         journal_entry = ic->free_section_entry;
1900
1901                         next_entry = ic->free_section_entry + range_sectors;
1902                         ic->free_section_entry = next_entry % ic->journal_section_entries;
1903                         ic->free_section += next_entry / ic->journal_section_entries;
1904                         ic->n_uncommitted_sections += next_entry / ic->journal_section_entries;
1905                         wraparound_section(ic, &ic->free_section);
1906
1907                         pos = journal_section * ic->journal_section_entries + journal_entry;
1908                         ws = journal_section;
1909                         we = journal_entry;
1910                         i = 0;
1911                         do {
1912                                 struct journal_entry *je;
1913
1914                                 add_journal_node(ic, &ic->journal_tree[pos], dio->range.logical_sector + i);
1915                                 pos++;
1916                                 if (unlikely(pos >= ic->journal_entries))
1917                                         pos = 0;
1918
1919                                 je = access_journal_entry(ic, ws, we);
1920                                 BUG_ON(!journal_entry_is_unused(je));
1921                                 journal_entry_set_inprogress(je);
1922                                 we++;
1923                                 if (unlikely(we == ic->journal_section_entries)) {
1924                                         we = 0;
1925                                         ws++;
1926                                         wraparound_section(ic, &ws);
1927                                 }
1928                         } while ((i += ic->sectors_per_block) < dio->range.n_sectors);
1929
1930                         spin_unlock_irq(&ic->endio_wait.lock);
1931                         goto journal_read_write;
1932                 } else {
1933                         sector_t next_sector;
1934                         journal_read_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector);
1935                         if (likely(journal_read_pos == NOT_FOUND)) {
1936                                 if (unlikely(dio->range.n_sectors > next_sector - dio->range.logical_sector))
1937                                         dio->range.n_sectors = next_sector - dio->range.logical_sector;
1938                         } else {
1939                                 unsigned i;
1940                                 unsigned jp = journal_read_pos + 1;
1941                                 for (i = ic->sectors_per_block; i < dio->range.n_sectors; i += ic->sectors_per_block, jp++) {
1942                                         if (!test_journal_node(ic, jp, dio->range.logical_sector + i))
1943                                                 break;
1944                                 }
1945                                 dio->range.n_sectors = i;
1946                         }
1947                 }
1948         }
1949         if (unlikely(!add_new_range(ic, &dio->range, true))) {
1950                 /*
1951                  * We must not sleep in the request routine because it could
1952                  * stall bios on current->bio_list.
1953                  * So, we offload the bio to a workqueue if we have to sleep.
1954                  */
1955                 if (from_map) {
1956 offload_to_thread:
1957                         spin_unlock_irq(&ic->endio_wait.lock);
1958                         INIT_WORK(&dio->work, integrity_bio_wait);
1959                         queue_work(ic->wait_wq, &dio->work);
1960                         return;
1961                 }
1962                 if (journal_read_pos != NOT_FOUND)
1963                         dio->range.n_sectors = ic->sectors_per_block;
1964                 wait_and_add_new_range(ic, &dio->range);
1965                 /*
1966                  * wait_and_add_new_range drops the spinlock, so the journal
1967                  * may have been changed arbitrarily. We need to recheck.
1968                  * To simplify the code, we restrict I/O size to just one block.
1969                  */
1970                 if (journal_read_pos != NOT_FOUND) {
1971                         sector_t next_sector;
1972                         unsigned new_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector);
1973                         if (unlikely(new_pos != journal_read_pos)) {
1974                                 remove_range_unlocked(ic, &dio->range);
1975                                 goto retry;
1976                         }
1977                 }
1978         }
1979         spin_unlock_irq(&ic->endio_wait.lock);
1980
1981         if (unlikely(journal_read_pos != NOT_FOUND)) {
1982                 journal_section = journal_read_pos / ic->journal_section_entries;
1983                 journal_entry = journal_read_pos % ic->journal_section_entries;
1984                 goto journal_read_write;
1985         }
1986
1987         if (ic->mode == 'B' && dio->write) {
1988                 if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
1989                                      dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
1990                         struct bitmap_block_status *bbs;
1991
1992                         bbs = sector_to_bitmap_block(ic, dio->range.logical_sector);
1993                         spin_lock(&bbs->bio_queue_lock);
1994                         bio_list_add(&bbs->bio_queue, bio);
1995                         spin_unlock(&bbs->bio_queue_lock);
1996                         queue_work(ic->writer_wq, &bbs->work);
1997                         return;
1998                 }
1999         }
2000
2001         dio->in_flight = (atomic_t)ATOMIC_INIT(2);
2002
2003         if (need_sync_io) {
2004                 init_completion(&read_comp);
2005                 dio->completion = &read_comp;
2006         } else
2007                 dio->completion = NULL;
2008
2009         dio->orig_bi_iter = bio->bi_iter;
2010
2011         dio->orig_bi_disk = bio->bi_disk;
2012         dio->orig_bi_partno = bio->bi_partno;
2013         bio_set_dev(bio, ic->dev->bdev);
2014
2015         dio->orig_bi_integrity = bio_integrity(bio);
2016         bio->bi_integrity = NULL;
2017         bio->bi_opf &= ~REQ_INTEGRITY;
2018
2019         dio->orig_bi_end_io = bio->bi_end_io;
2020         bio->bi_end_io = integrity_end_io;
2021
2022         bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT;
2023         generic_make_request(bio);
2024
2025         if (need_sync_io) {
2026                 wait_for_completion_io(&read_comp);
2027                 if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
2028                     dio->range.logical_sector + dio->range.n_sectors > le64_to_cpu(ic->sb->recalc_sector))
2029                         goto skip_check;
2030                 if (ic->mode == 'B') {
2031                         if (!block_bitmap_op(ic, ic->recalc_bitmap, dio->range.logical_sector,
2032                                              dio->range.n_sectors, BITMAP_OP_TEST_ALL_CLEAR))
2033                                 goto skip_check;
2034                 }
2035
2036                 if (likely(!bio->bi_status))
2037                         integrity_metadata(&dio->work);
2038                 else
2039 skip_check:
2040                         dec_in_flight(dio);
2041
2042         } else {
2043                 INIT_WORK(&dio->work, integrity_metadata);
2044                 queue_work(ic->metadata_wq, &dio->work);
2045         }
2046
2047         return;
2048
2049 journal_read_write:
2050         if (unlikely(__journal_read_write(dio, bio, journal_section, journal_entry)))
2051                 goto lock_retry;
2052
2053         do_endio_flush(ic, dio);
2054 }
2055
2056
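/* Workqueue wrapper: continue an offloaded bio outside the map path. */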
2057 static void integrity_bio_wait(struct work_struct *w)
2058 {
2059         struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
2060
2061         dm_integrity_map_continue(dio, false);
2062 }
2063
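/*
 * If the current journal section is only partially filled, skip its
 * remaining entries so a commit always covers whole sections, and
 * sanity-check the journal space accounting.
 */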
2064 static void pad_uncommitted(struct dm_integrity_c *ic)
2065 {
2066         if (ic->free_section_entry) {
2067                 ic->free_sectors -= ic->journal_section_entries - ic->free_section_entry;
2068                 ic->free_section_entry = 0;
2069                 ic->free_section++;
2070                 wraparound_section(ic, &ic->free_section);
2071                 ic->n_uncommitted_sections++;
2072         }
2073         if (WARN_ON(ic->journal_sections * ic->journal_section_entries !=
2074                     (ic->n_uncommitted_sections + ic->n_committed_sections) *
2075                     ic->journal_section_entries + ic->free_sectors)) {
2076                 DMCRIT("journal_sections %u, journal_section_entries %u, "
2077                        "n_uncommitted_sections %u, n_committed_sections %u, "
2078                        "journal_section_entries %u, free_sectors %u",
2079                        ic->journal_sections, ic->journal_section_entries,
2080                        ic->n_uncommitted_sections, ic->n_committed_sections,
2081                        ic->journal_section_entries, ic->free_sectors);
2082         }
2083 }
2084
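/*
 * Commit work: in journal mode, wait until copying into the uncommitted
 * sections has finished, stamp each journal sector with the commit id,
 * write the journal and then end any queued flush bios; the writer work
 * is kicked when free journal space runs low. In other modes only the
 * metadata buffers are flushed.
 */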
2085 static void integrity_commit(struct work_struct *w)
2086 {
2087         struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, commit_work);
2088         unsigned commit_start, commit_sections;
2089         unsigned i, j, n;
2090         struct bio *flushes;
2091
2092         del_timer(&ic->autocommit_timer);
2093
2094         spin_lock_irq(&ic->endio_wait.lock);
2095         flushes = bio_list_get(&ic->flush_bio_list);
2096         if (unlikely(ic->mode != 'J')) {
2097                 spin_unlock_irq(&ic->endio_wait.lock);
2098                 dm_integrity_flush_buffers(ic);
2099                 goto release_flush_bios;
2100         }
2101
2102         pad_uncommitted(ic);
2103         commit_start = ic->uncommitted_section;
2104         commit_sections = ic->n_uncommitted_sections;
2105         spin_unlock_irq(&ic->endio_wait.lock);
2106
2107         if (!commit_sections)
2108                 goto release_flush_bios;
2109
2110         i = commit_start;
2111         for (n = 0; n < commit_sections; n++) {
2112                 for (j = 0; j < ic->journal_section_entries; j++) {
2113                         struct journal_entry *je;
2114                         je = access_journal_entry(ic, i, j);
2115                         io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je));
2116                 }
2117                 for (j = 0; j < ic->journal_section_sectors; j++) {
2118                         struct journal_sector *js;
2119                         js = access_journal(ic, i, j);
2120                         js->commit_id = dm_integrity_commit_id(ic, i, j, ic->commit_seq);
2121                 }
2122                 i++;
2123                 if (unlikely(i >= ic->journal_sections))
2124                         ic->commit_seq = next_commit_seq(ic->commit_seq);
2125                 wraparound_section(ic, &i);
2126         }
2127         smp_rmb();
2128
2129         write_journal(ic, commit_start, commit_sections);
2130
2131         spin_lock_irq(&ic->endio_wait.lock);
2132         ic->uncommitted_section += commit_sections;
2133         wraparound_section(ic, &ic->uncommitted_section);
2134         ic->n_uncommitted_sections -= commit_sections;
2135         ic->n_committed_sections += commit_sections;
2136         spin_unlock_irq(&ic->endio_wait.lock);
2137
2138         if (READ_ONCE(ic->free_sectors) <= ic->free_sectors_threshold)
2139                 queue_work(ic->writer_wq, &ic->writer_work);
2140
2141 release_flush_bios:
2142         while (flushes) {
2143                 struct bio *next = flushes->bi_next;
2144                 flushes->bi_next = NULL;
2145                 do_endio(ic, flushes);
2146                 flushes = next;
2147         }
2148 }
2149
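/*
 * dm-io callback for one journal-to-data copy: release the range, free the
 * journal_io, record errors and drop a reference on the whole operation.
 */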
2150 static void complete_copy_from_journal(unsigned long error, void *context)
2151 {
2152         struct journal_io *io = context;
2153         struct journal_completion *comp = io->comp;
2154         struct dm_integrity_c *ic = comp->ic;
2155         remove_range(ic, &io->range);
2156         mempool_free(io, &ic->journal_io_mempool);
2157         if (unlikely(error != 0))
2158                 dm_integrity_io_error(ic, "copying from journal", -EIO);
2159         complete_journal_op(comp);
2160 }
2161
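/* Put the original data bytes (saved in je->last_bytes) back over the per-sector commit ids. */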
2162 static void restore_last_bytes(struct dm_integrity_c *ic, struct journal_sector *js,
2163                                struct journal_entry *je)
2164 {
2165         unsigned s = 0;
2166         do {
2167                 js->commit_id = je->last_bytes[s];
2168                 js++;
2169         } while (++s < ic->sectors_per_block);
2170 }
2171
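/*
 * Write committed journal sections to their final location on the data
 * device. Runs of entries describing consecutive sectors are merged into a
 * single copy, entries already superseded by newer committed data are
 * dropped (when not replaying), tags are written to the metadata area and
 * the data is copied from the journal. During replay the section MACs,
 * and optionally the checksums, are verified first.
 */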
2172 static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
2173                              unsigned write_sections, bool from_replay)
2174 {
2175         unsigned i, j, n;
2176         struct journal_completion comp;
2177         struct blk_plug plug;
2178
2179         blk_start_plug(&plug);
2180
2181         comp.ic = ic;
2182         comp.in_flight = (atomic_t)ATOMIC_INIT(1);
2183         init_completion(&comp.comp);
2184
2185         i = write_start;
2186         for (n = 0; n < write_sections; n++, i++, wraparound_section(ic, &i)) {
2187 #ifndef INTERNAL_VERIFY
2188                 if (unlikely(from_replay))
2189 #endif
2190                         rw_section_mac(ic, i, false);
2191                 for (j = 0; j < ic->journal_section_entries; j++) {
2192                         struct journal_entry *je = access_journal_entry(ic, i, j);
2193                         sector_t sec, area, offset;
2194                         unsigned k, l, next_loop;
2195                         sector_t metadata_block;
2196                         unsigned metadata_offset;
2197                         struct journal_io *io;
2198
2199                         if (journal_entry_is_unused(je))
2200                                 continue;
2201                         BUG_ON(unlikely(journal_entry_is_inprogress(je)) && !from_replay);
2202                         sec = journal_entry_get_sector(je);
2203                         if (unlikely(from_replay)) {
2204                                 if (unlikely(sec & (unsigned)(ic->sectors_per_block - 1))) {
2205                                         dm_integrity_io_error(ic, "invalid sector in journal", -EIO);
2206                                         sec &= ~(sector_t)(ic->sectors_per_block - 1);
2207                                 }
2208                         }
2209                         get_area_and_offset(ic, sec, &area, &offset);
2210                         restore_last_bytes(ic, access_journal_data(ic, i, j), je);
2211                         for (k = j + 1; k < ic->journal_section_entries; k++) {
2212                                 struct journal_entry *je2 = access_journal_entry(ic, i, k);
2213                                 sector_t sec2, area2, offset2;
2214                                 if (journal_entry_is_unused(je2))
2215                                         break;
2216                                 BUG_ON(unlikely(journal_entry_is_inprogress(je2)) && !from_replay);
2217                                 sec2 = journal_entry_get_sector(je2);
2218                                 get_area_and_offset(ic, sec2, &area2, &offset2);
2219                                 if (area2 != area || offset2 != offset + ((k - j) << ic->sb->log2_sectors_per_block))
2220                                         break;
2221                                 restore_last_bytes(ic, access_journal_data(ic, i, k), je2);
2222                         }
2223                         next_loop = k - 1;
2224
2225                         io = mempool_alloc(&ic->journal_io_mempool, GFP_NOIO);
2226                         io->comp = &comp;
2227                         io->range.logical_sector = sec;
2228                         io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block;
2229
2230                         spin_lock_irq(&ic->endio_wait.lock);
2231                         add_new_range_and_wait(ic, &io->range);
2232
2233                         if (likely(!from_replay)) {
2234                                 struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries];
2235
2236                                 /* don't write if there is a newer committed sector */
2237                                 while (j < k && find_newer_committed_node(ic, &section_node[j])) {
2238                                         struct journal_entry *je2 = access_journal_entry(ic, i, j);
2239
2240                                         journal_entry_set_unused(je2);
2241                                         remove_journal_node(ic, &section_node[j]);
2242                                         j++;
2243                                         sec += ic->sectors_per_block;
2244                                         offset += ic->sectors_per_block;
2245                                 }
2246                                 while (j < k && find_newer_committed_node(ic, &section_node[k - 1])) {
2247                                         struct journal_entry *je2 = access_journal_entry(ic, i, k - 1);
2248
2249                                         journal_entry_set_unused(je2);
2250                                         remove_journal_node(ic, &section_node[k - 1]);
2251                                         k--;
2252                                 }
2253                                 if (j == k) {
2254                                         remove_range_unlocked(ic, &io->range);
2255                                         spin_unlock_irq(&ic->endio_wait.lock);
2256                                         mempool_free(io, &ic->journal_io_mempool);
2257                                         goto skip_io;
2258                                 }
2259                                 for (l = j; l < k; l++) {
2260                                         remove_journal_node(ic, &section_node[l]);
2261                                 }
2262                         }
2263                         spin_unlock_irq(&ic->endio_wait.lock);
2264
2265                         metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset);
2266                         for (l = j; l < k; l++) {
2267                                 int r;
2268                                 struct journal_entry *je2 = access_journal_entry(ic, i, l);
2269
2270                                 if (
2271 #ifndef INTERNAL_VERIFY
2272                                     unlikely(from_replay) &&
2273 #endif
2274                                     ic->internal_hash) {
2275                                         char test_tag[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
2276
2277                                         integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block),
2278                                                                   (char *)access_journal_data(ic, i, l), test_tag);
2279                                         if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size)))
2280                                                 dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ);
2281                                 }
2282
2283                                 journal_entry_set_unused(je2);
2284                                 r = dm_integrity_rw_tag(ic, journal_entry_tag(ic, je2), &metadata_block, &metadata_offset,
2285                                                         ic->tag_size, TAG_WRITE);
2286                                 if (unlikely(r)) {
2287                                         dm_integrity_io_error(ic, "reading tags", r);
2288                                 }
2289                         }
2290
2291                         atomic_inc(&comp.in_flight);
2292                         copy_from_journal(ic, i, j << ic->sb->log2_sectors_per_block,
2293                                           (k - j) << ic->sb->log2_sectors_per_block,
2294                                           get_data_sector(ic, area, offset),
2295                                           complete_copy_from_journal, io);
2296 skip_io:
2297                         j = next_loop;
2298                 }
2299         }
2300
2301         dm_bufio_write_dirty_buffers_async(ic->bufio);
2302
2303         blk_finish_plug(&plug);
2304
2305         complete_journal_op(&comp);
2306         wait_for_completion_io(&comp.comp);
2307
2308         dm_integrity_flush_buffers(ic);
2309 }
2310
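/*
 * Writer work: snapshot the committed sections, write them out to the data
 * area and then return the journal space to the free pool, waking up
 * writers that were waiting for free sectors.
 */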
2311 static void integrity_writer(struct work_struct *w)
2312 {
2313         struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, writer_work);
2314         unsigned write_start, write_sections;
2315
2316         unsigned prev_free_sectors;
2317
2318         /* the following test is not needed, but it tests the replay code */
2319         if (READ_ONCE(ic->suspending) && !ic->meta_dev)
2320                 return;
2321
2322         spin_lock_irq(&ic->endio_wait.lock);
2323         write_start = ic->committed_section;
2324         write_sections = ic->n_committed_sections;
2325         spin_unlock_irq(&ic->endio_wait.lock);
2326
2327         if (!write_sections)
2328                 return;
2329
2330         do_journal_write(ic, write_start, write_sections, false);
2331
2332         spin_lock_irq(&ic->endio_wait.lock);
2333
2334         ic->committed_section += write_sections;
2335         wraparound_section(ic, &ic->committed_section);
2336         ic->n_committed_sections -= write_sections;
2337
2338         prev_free_sectors = ic->free_sectors;
2339         ic->free_sectors += write_sections * ic->journal_section_entries;
2340         if (unlikely(!prev_free_sectors))
2341                 wake_up_locked(&ic->endio_wait);
2342
2343         spin_unlock_irq(&ic->endio_wait.lock);
2344 }
2345
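/* Persist recalculation progress: flush dirty metadata buffers and rewrite the superblock. */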
2346 static void recalc_write_super(struct dm_integrity_c *ic)
2347 {
2348         int r;
2349
2350         dm_integrity_flush_buffers(ic);
2351         if (dm_integrity_failed(ic))
2352                 return;
2353
2354         r = sync_rw_sb(ic, REQ_OP_WRITE, 0);
2355         if (unlikely(r))
2356                 dm_integrity_io_error(ic, "writing superblock", r);
2357 }
2358
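/*
 * Background recalculation of integrity tags. Walk the device in
 * RECALC_SECTORS chunks starting at sb->recalc_sector, read the data,
 * compute the tags, write them to the metadata area and periodically
 * persist the progress in the superblock. In bitmap mode, blocks whose
 * bitmap bits are clear already have valid tags and are skipped.
 */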
2359 static void integrity_recalc(struct work_struct *w)
2360 {
2361         struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, recalc_work);
2362         struct dm_integrity_range range;
2363         struct dm_io_request io_req;
2364         struct dm_io_region io_loc;
2365         sector_t area, offset;
2366         sector_t metadata_block;
2367         unsigned metadata_offset;
2368         sector_t logical_sector, n_sectors;
2369         __u8 *t;
2370         unsigned i;
2371         int r;
2372         unsigned super_counter = 0;
2373
2374         DEBUG_print("start recalculation... (position %llx)\n", le64_to_cpu(ic->sb->recalc_sector));
2375
2376         spin_lock_irq(&ic->endio_wait.lock);
2377
2378 next_chunk:
2379
2380         if (unlikely(READ_ONCE(ic->suspending)))
2381                 goto unlock_ret;
2382
2383         range.logical_sector = le64_to_cpu(ic->sb->recalc_sector);
2384         if (unlikely(range.logical_sector >= ic->provided_data_sectors)) {
2385                 if (ic->mode == 'B') {
2386                         DEBUG_print("queue_delayed_work: bitmap_flush_work\n");
2387                         queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
2388                 }
2389                 goto unlock_ret;
2390         }
2391
2392         get_area_and_offset(ic, range.logical_sector, &area, &offset);
2393         range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector);
2394         if (!ic->meta_dev)
2395                 range.n_sectors = min(range.n_sectors, ((sector_t)1U << ic->sb->log2_interleave_sectors) - (unsigned)offset);
2396
2397         add_new_range_and_wait(ic, &range);
2398         spin_unlock_irq(&ic->endio_wait.lock);
2399         logical_sector = range.logical_sector;
2400         n_sectors = range.n_sectors;
2401
2402         if (ic->mode == 'B') {
2403                 if (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) {
2404                         goto advance_and_next;
2405                 }
2406                 while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector,
2407                                        ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) {
2408                         logical_sector += ic->sectors_per_block;
2409                         n_sectors -= ic->sectors_per_block;
2410                         cond_resched();
2411                 }
2412                 while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector + n_sectors - ic->sectors_per_block,
2413                                        ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) {
2414                         n_sectors -= ic->sectors_per_block;
2415                         cond_resched();
2416                 }
2417                 get_area_and_offset(ic, logical_sector, &area, &offset);
2418         }
2419
2420         DEBUG_print("recalculating: %lx, %lx\n", logical_sector, n_sectors);
2421
2422         if (unlikely(++super_counter == RECALC_WRITE_SUPER)) {
2423                 recalc_write_super(ic);
2424                 if (ic->mode == 'B') {
2425                         queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval);
2426                 }
2427                 super_counter = 0;
2428         }
2429
2430         if (unlikely(dm_integrity_failed(ic)))
2431                 goto err;
2432
2433         io_req.bi_op = REQ_OP_READ;
2434         io_req.bi_op_flags = 0;
2435         io_req.mem.type = DM_IO_VMA;
2436         io_req.mem.ptr.addr = ic->recalc_buffer;
2437         io_req.notify.fn = NULL;
2438         io_req.client = ic->io;
2439         io_loc.bdev = ic->dev->bdev;
2440         io_loc.sector = get_data_sector(ic, area, offset);
2441         io_loc.count = n_sectors;
2442
2443         r = dm_io(&io_req, 1, &io_loc, NULL);
2444         if (unlikely(r)) {
2445                 dm_integrity_io_error(ic, "reading data", r);
2446                 goto err;
2447         }
2448
2449         t = ic->recalc_tags;
2450         for (i = 0; i < n_sectors; i += ic->sectors_per_block) {
2451                 integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t);
2452                 t += ic->tag_size;
2453         }
2454
2455         metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset);
2456
2457         r = dm_integrity_rw_tag(ic, ic->recalc_tags, &metadata_block, &metadata_offset, t - ic->recalc_tags, TAG_WRITE);
2458         if (unlikely(r)) {
2459                 dm_integrity_io_error(ic, "writing tags", r);
2460                 goto err;
2461         }
2462
2463 advance_and_next:
2464         cond_resched();
2465
2466         spin_lock_irq(&ic->endio_wait.lock);
2467         remove_range_unlocked(ic, &range);
2468         ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors);
2469         goto next_chunk;
2470
2471 err:
2472         remove_range(ic, &range);
2473         return;
2474
2475 unlock_ret:
2476         spin_unlock_irq(&ic->endio_wait.lock);
2477
2478         recalc_write_super(ic);
2479 }
2480
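/*
 * Per-bitmap-block worker for writes in bitmap mode. Bios whose region is
 * already marked in the may_write bitmap are dispatched immediately; for
 * the rest, the bits are set in the in-memory bitmap, the bitmap block is
 * written out with FUA and only then are the bios allowed to proceed.
 */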
2481 static void bitmap_block_work(struct work_struct *w)
2482 {
2483         struct bitmap_block_status *bbs = container_of(w, struct bitmap_block_status, work);
2484         struct dm_integrity_c *ic = bbs->ic;
2485         struct bio *bio;
2486         struct bio_list bio_queue;
2487         struct bio_list waiting;
2488
2489         bio_list_init(&waiting);
2490
2491         spin_lock(&bbs->bio_queue_lock);
2492         bio_queue = bbs->bio_queue;
2493         bio_list_init(&bbs->bio_queue);
2494         spin_unlock(&bbs->bio_queue_lock);
2495
2496         while ((bio = bio_list_pop(&bio_queue))) {
2497                 struct dm_integrity_io *dio;
2498
2499                 dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
2500
2501                 if (block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
2502                                     dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
2503                         remove_range(ic, &dio->range);
2504                         INIT_WORK(&dio->work, integrity_bio_wait);
2505                         queue_work(ic->offload_wq, &dio->work);
2506                 } else {
2507                         block_bitmap_op(ic, ic->journal, dio->range.logical_sector,
2508                                         dio->range.n_sectors, BITMAP_OP_SET);
2509                         bio_list_add(&waiting, bio);
2510                 }
2511         }
2512
2513         if (bio_list_empty(&waiting))
2514                 return;
2515
2516         rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC,
2517                            bbs->idx * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT),
2518                            BITMAP_BLOCK_SIZE >> SECTOR_SHIFT, NULL);
2519
2520         while ((bio = bio_list_pop(&waiting))) {
2521                 struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
2522
2523                 block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
2524                                 dio->range.n_sectors, BITMAP_OP_SET);
2525
2526                 remove_range(ic, &dio->range);
2527                 INIT_WORK(&dio->work, integrity_bio_wait);
2528                 queue_work(ic->offload_wq, &dio->work);
2529         }
2530
2531         queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval);
2532 }
2533
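/*
 * Periodic (and on-demand) bitmap flush: lock the whole device range to
 * drain outstanding I/O, flush dirty metadata, clear the bitmap for the
 * part of the device that already has valid tags, write the bitmap back
 * with FUA and finally end writes that were held back in synchronous mode.
 */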
2534 static void bitmap_flush_work(struct work_struct *work)
2535 {
2536         struct dm_integrity_c *ic = container_of(work, struct dm_integrity_c, bitmap_flush_work.work);
2537         struct dm_integrity_range range;
2538         unsigned long limit;
2539         struct bio *bio;
2540
2541         dm_integrity_flush_buffers(ic);
2542
2543         range.logical_sector = 0;
2544         range.n_sectors = ic->provided_data_sectors;
2545
2546         spin_lock_irq(&ic->endio_wait.lock);
2547         add_new_range_and_wait(ic, &range);
2548         spin_unlock_irq(&ic->endio_wait.lock);
2549
2550         dm_integrity_flush_buffers(ic);
2551         if (ic->meta_dev)
2552                 blkdev_issue_flush(ic->dev->bdev, GFP_NOIO, NULL);
2553
2554         limit = ic->provided_data_sectors;
2555         if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
2556                 limit = le64_to_cpu(ic->sb->recalc_sector)
2557                         >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit)
2558                         << (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
2559         }
2560         /*DEBUG_print("zeroing journal\n");*/
2561         block_bitmap_op(ic, ic->journal, 0, limit, BITMAP_OP_CLEAR);
2562         block_bitmap_op(ic, ic->may_write_bitmap, 0, limit, BITMAP_OP_CLEAR);
2563
2564         rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
2565                            ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
2566
2567         spin_lock_irq(&ic->endio_wait.lock);
2568         remove_range_unlocked(ic, &range);
2569         while (unlikely((bio = bio_list_pop(&ic->synchronous_bios)) != NULL)) {
2570                 bio_endio(bio);
2571                 spin_unlock_irq(&ic->endio_wait.lock);
2572                 spin_lock_irq(&ic->endio_wait.lock);
2573         }
2574         spin_unlock_irq(&ic->endio_wait.lock);
2575 }
2576
2577
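/*
 * Initialize a range of journal sections: zero the sector data, stamp the
 * commit ids for the given sequence number, mark all entries unused and
 * write the sections to disk.
 */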
2578 static void init_journal(struct dm_integrity_c *ic, unsigned start_section,
2579                          unsigned n_sections, unsigned char commit_seq)
2580 {
2581         unsigned i, j, n;
2582
2583         if (!n_sections)
2584                 return;
2585
2586         for (n = 0; n < n_sections; n++) {
2587                 i = start_section + n;
2588                 wraparound_section(ic, &i);
2589                 for (j = 0; j < ic->journal_section_sectors; j++) {
2590                         struct journal_sector *js = access_journal(ic, i, j);
2591                         memset(&js->entries, 0, JOURNAL_SECTOR_DATA);
2592                         js->commit_id = dm_integrity_commit_id(ic, i, j, commit_seq);
2593                 }
2594                 for (j = 0; j < ic->journal_section_entries; j++) {
2595                         struct journal_entry *je = access_journal_entry(ic, i, j);
2596                         journal_entry_set_unused(je);
2597                 }
2598         }
2599
2600         write_journal(ic, start_section, n_sections);
2601 }
2602
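/* Map an on-disk commit id back to its sequence number; -EIO if it matches none. */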
2603 static int find_commit_seq(struct dm_integrity_c *ic, unsigned i, unsigned j, commit_id_t id)
2604 {
2605         unsigned char k;
2606         for (k = 0; k < N_COMMIT_IDS; k++) {
2607                 if (dm_integrity_commit_id(ic, i, j, k) == id)
2608                         return k;
2609         }
2610         dm_integrity_io_error(ic, "journal commit id", -EIO);
2611         return -EIO;
2612 }
2613
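/*
 * Journal replay on device activation: read (and, if encrypted, decrypt)
 * the journal, derive the most recently committed sequence number from the
 * per-sector commit ids and, unless the journal is empty, write back any
 * sections that may not have reached the data area.
 */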
2614 static void replay_journal(struct dm_integrity_c *ic)
2615 {
2616         unsigned i, j;
2617         bool used_commit_ids[N_COMMIT_IDS];
2618         unsigned max_commit_id_sections[N_COMMIT_IDS];
2619         unsigned write_start, write_sections;
2620         unsigned continue_section;
2621         bool journal_empty;
2622         unsigned char unused, last_used, want_commit_seq;
2623
2624         if (ic->mode == 'R')
2625                 return;
2626
2627         if (ic->journal_uptodate)
2628                 return;
2629
2630         last_used = 0;
2631         write_start = 0;
2632
2633         if (!ic->just_formatted) {
2634                 DEBUG_print("reading journal\n");
2635                 rw_journal(ic, REQ_OP_READ, 0, 0, ic->journal_sections, NULL);
2636                 if (ic->journal_io)
2637                         DEBUG_bytes(lowmem_page_address(ic->journal_io[0].page), 64, "read journal");
2638                 if (ic->journal_io) {
2639                         struct journal_completion crypt_comp;
2640                         crypt_comp.ic = ic;
2641                         init_completion(&crypt_comp.comp);
2642                         crypt_comp.in_flight = (atomic_t)ATOMIC_INIT(0);
2643                         encrypt_journal(ic, false, 0, ic->journal_sections, &crypt_comp);
2644                         wait_for_completion(&crypt_comp.comp);
2645                 }
2646                 DEBUG_bytes(lowmem_page_address(ic->journal[0].page), 64, "decrypted journal");
2647         }
2648
2649         if (dm_integrity_failed(ic))
2650                 goto clear_journal;
2651
2652         journal_empty = true;
2653         memset(used_commit_ids, 0, sizeof used_commit_ids);
2654         memset(max_commit_id_sections, 0, sizeof max_commit_id_sections);
2655         for (i = 0; i < ic->journal_sections; i++) {
2656                 for (j = 0; j < ic->journal_section_sectors; j++) {
2657                         int k;
2658                         struct journal_sector *js = access_journal(ic, i, j);
2659                         k = find_commit_seq(ic, i, j, js->commit_id);
2660                         if (k < 0)
2661                                 goto clear_journal;
2662                         used_commit_ids[k] = true;
2663                         max_commit_id_sections[k] = i;
2664                 }
2665                 if (journal_empty) {
2666                         for (j = 0; j < ic->journal_section_entries; j++) {
2667                                 struct journal_entry *je = access_journal_entry(ic, i, j);
2668                                 if (!journal_entry_is_unused(je)) {
2669                                         journal_empty = false;
2670                                         break;
2671                                 }
2672                         }
2673                 }
2674         }
2675
2676         if (!used_commit_ids[N_COMMIT_IDS - 1]) {
2677                 unused = N_COMMIT_IDS - 1;
2678                 while (unused && !used_commit_ids[unused - 1])
2679                         unused--;
2680         } else {
2681                 for (unused = 0; unused < N_COMMIT_IDS; unused++)
2682                         if (!used_commit_ids[unused])
2683                                 break;
2684                 if (unused == N_COMMIT_IDS) {
2685                         dm_integrity_io_error(ic, "journal commit ids", -EIO);
2686                         goto clear_journal;
2687                 }
2688         }
2689         DEBUG_print("first unused commit seq %d [%d,%d,%d,%d]\n",
2690                     unused, used_commit_ids[0], used_commit_ids[1],
2691                     used_commit_ids[2], used_commit_ids[3]);
2692
2693         last_used = prev_commit_seq(unused);
2694         want_commit_seq = prev_commit_seq(last_used);
2695
2696         if (!used_commit_ids[want_commit_seq] && used_commit_ids[prev_commit_seq(want_commit_seq)])
2697                 journal_empty = true;
2698
2699         write_start = max_commit_id_sections[last_used] + 1;
2700         if (unlikely(write_start >= ic->journal_sections))
2701                 want_commit_seq = next_commit_seq(want_commit_seq);
2702         wraparound_section(ic, &write_start);
2703
2704         i = write_start;
2705         for (write_sections = 0; write_sections < ic->journal_sections; write_sections++) {
2706                 for (j = 0; j < ic->journal_section_sectors; j++) {
2707                         struct journal_sector *js = access_journal(ic, i, j);
2708
2709                         if (js->commit_id != dm_integrity_commit_id(ic, i, j, want_commit_seq)) {
2710                                 /*
2711                                  * This could be caused by a crash during writing.
2712                                  * We won't replay the inconsistent part of the
2713                                  * journal.
2714                                  */
2715                                 DEBUG_print("commit id mismatch at position (%u, %u): %d != %d\n",
2716                                             i, j, find_commit_seq(ic, i, j, js->commit_id), want_commit_seq);
2717                                 goto brk;
2718                         }
2719                 }
2720                 i++;
2721                 if (unlikely(i >= ic->journal_sections))
2722                         want_commit_seq = next_commit_seq(want_commit_seq);
2723                 wraparound_section(ic, &i);
2724         }
2725 brk:
2726
2727         if (!journal_empty) {
2728                 DEBUG_print("replaying %u sections, starting at %u, commit seq %d\n",
2729                             write_sections, write_start, want_commit_seq);
2730                 do_journal_write(ic, write_start, write_sections, true);
2731         }
2732
2733         if (write_sections == ic->journal_sections && (ic->mode == 'J' || journal_empty)) {
2734                 continue_section = write_start;
2735                 ic->commit_seq = want_commit_seq;
2736                 DEBUG_print("continuing from section %u, commit seq %d\n", write_start, ic->commit_seq);
2737         } else {
2738                 unsigned s;
2739                 unsigned char erase_seq;
2740 clear_journal:
2741                 DEBUG_print("clearing journal\n");
2742
2743                 erase_seq = prev_commit_seq(prev_commit_seq(last_used));
2744                 s = write_start;
2745                 init_journal(ic, s, 1, erase_seq);
2746                 s++;
2747                 wraparound_section(ic, &s);
2748                 if (ic->journal_sections >= 2) {
2749                         init_journal(ic, s, ic->journal_sections - 2, erase_seq);
2750                         s += ic->journal_sections - 2;
2751                         wraparound_section(ic, &s);
2752                         init_journal(ic, s, 1, erase_seq);
2753                 }
2754
2755                 continue_section = 0;
2756                 ic->commit_seq = next_commit_seq(erase_seq);
2757         }
2758
2759         ic->committed_section = continue_section;
2760         ic->n_committed_sections = 0;
2761
2762         ic->uncommitted_section = continue_section;
2763         ic->n_uncommitted_sections = 0;
2764
2765         ic->free_section = continue_section;
2766         ic->free_section_entry = 0;
2767         ic->free_sectors = ic->journal_entries;
2768
2769         ic->journal_tree_root = RB_ROOT;
2770         for (i = 0; i < ic->journal_entries; i++)
2771                 init_journal_node(&ic->journal_tree[i]);
2772 }
2773
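/*
 * In bitmap mode, switch to synchronous operation before a reboot:
 * shorten the bitmap flush interval and flush the bitmap immediately.
 */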
2774 static void dm_integrity_enter_synchronous_mode(struct dm_integrity_c *ic)
2775 {
2776         DEBUG_print("dm_integrity_enter_synchronous_mode\n");
2777
2778         if (ic->mode == 'B') {
2779                 ic->bitmap_flush_interval = msecs_to_jiffies(10) + 1;
2780                 ic->synchronous_mode = 1;
2781
2782                 cancel_delayed_work_sync(&ic->bitmap_flush_work);
2783                 queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
2784                 flush_workqueue(ic->commit_wq);
2785         }
2786 }
2787
2788 static int dm_integrity_reboot(struct notifier_block *n, unsigned long code, void *x)
2789 {
2790         struct dm_integrity_c *ic = container_of(n, struct dm_integrity_c, reboot_notifier);
2791
2792         DEBUG_print("dm_integrity_reboot\n");
2793
2794         dm_integrity_enter_synchronous_mode(ic);
2795
2796         return NOTIFY_DONE;
2797 }
2798
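/*
 * Quiesce the target on suspend: unregister the reboot notifier, stop the
 * autocommit timer, drain the recalculation, commit and writer workqueues,
 * flush buffered metadata and, in bitmap mode, clear the dirty-bitmap flag
 * in the superblock.
 */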
2799 static void dm_integrity_postsuspend(struct dm_target *ti)
2800 {
2801         struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
2802         int r;
2803
2804         WARN_ON(unregister_reboot_notifier(&ic->reboot_notifier));
2805
2806         del_timer_sync(&ic->autocommit_timer);
2807
2808         WRITE_ONCE(ic->suspending, 1);
2809
2810         if (ic->recalc_wq)
2811                 drain_workqueue(ic->recalc_wq);
2812
2813         if (ic->mode == 'B')
2814                 cancel_delayed_work_sync(&ic->bitmap_flush_work);
2815
2816         queue_work(ic->commit_wq, &ic->commit_work);
2817         drain_workqueue(ic->commit_wq);
2818
2819         if (ic->mode == 'J') {
2820                 if (ic->meta_dev)
2821                         queue_work(ic->writer_wq, &ic->writer_work);
2822                 drain_workqueue(ic->writer_wq);
2823                 dm_integrity_flush_buffers(ic);
2824         }
2825
2826         if (ic->mode == 'B') {
2827                 dm_integrity_flush_buffers(ic);
2828 #if 1
2829                 /* set to 0 to test bitmap replay code */
2830                 init_journal(ic, 0, ic->journal_sections, 0);
2831                 ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
2832                 r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
2833                 if (unlikely(r))
2834                         dm_integrity_io_error(ic, "writing superblock", r);
2835 #endif
2836         }
2837
2838         WRITE_ONCE(ic->suspending, 0);
2839
2840         BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
2841
2842         ic->journal_uptodate = true;
2843 }
2844
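/*
 * Bring the target back up on resume: if the superblock has the dirty-bitmap
 * flag set, recover from the on-disk bitmap (possibly scheduling
 * recalculation); otherwise replay the journal. In bitmap mode the bitmaps
 * and the dirty-bitmap flag are then re-established. A pending
 * recalculation is restarted if needed and the reboot notifier is
 * registered.
 */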
2845 static void dm_integrity_resume(struct dm_target *ti)
2846 {
2847         struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
2848         int r;
2849         DEBUG_print("resume\n");
2850
2851         if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) {
2852                 DEBUG_print("resume dirty_bitmap\n");
2853                 rw_journal_sectors(ic, REQ_OP_READ, 0, 0,
2854                                    ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
2855                 if (ic->mode == 'B') {
2856                         if (ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) {
2857                                 block_bitmap_copy(ic, ic->recalc_bitmap, ic->journal);
2858                                 block_bitmap_copy(ic, ic->may_write_bitmap, ic->journal);
2859                                 if (!block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors,
2860                                                      BITMAP_OP_TEST_ALL_CLEAR)) {
2861                                         ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
2862                                         ic->sb->recalc_sector = cpu_to_le64(0);
2863                                 }
2864                         } else {
2865                                 DEBUG_print("non-matching blocks_per_bitmap_bit: %u, %u\n",
2866                                             ic->sb->log2_blocks_per_bitmap_bit, ic->log2_blocks_per_bitmap_bit);
2867                                 ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit;
2868                                 block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET);
2869                                 block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET);
2870                                 block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_SET);
2871                                 rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
2872                                                    ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
2873                                 ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
2874                                 ic->sb->recalc_sector = cpu_to_le64(0);
2875                         }
2876                 } else {
2877                         if (!(ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit &&
2878                               block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR))) {
2879                                 ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
2880                                 ic->sb->recalc_sector = cpu_to_le64(0);
2881                         }
2882                         init_journal(ic, 0, ic->journal_sections, 0);
2883                         replay_journal(ic);
2884                         ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
2885                 }
2886                 r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
2887                 if (unlikely(r))
2888                         dm_integrity_io_error(ic, "writing superblock", r);
2889         } else {
2890                 replay_journal(ic);
2891                 if (ic->mode == 'B') {
2892                         ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
2893                         ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit;
2894                         r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
2895                         if (unlikely(r))
2896                                 dm_integrity_io_error(ic, "writing superblock", r);
2897
2898                         block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR);
2899                         block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR);
2900                         block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR);
2901                         if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
2902                             le64_to_cpu(ic->sb->recalc_sector) < ic->provided_data_sectors) {
2903                                 block_bitmap_op(ic, ic->journal, le64_to_cpu(ic->sb->recalc_sector),
2904                                                 ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET);
2905                                 block_bitmap_op(ic, ic->recalc_bitmap, le64_to_cpu(ic->sb->recalc_sector),
2906                                                 ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET);
2907                                 block_bitmap_op(ic, ic->may_write_bitmap, le64_to_cpu(ic->sb->recalc_sector),
2908                                                 ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET);
2909                         }
2910                         rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
2911                                            ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
2912                 }
2913         }
2914
2915         DEBUG_print("testing recalc: %x\n", ic->sb->flags);
2916         if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
2917                 __u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector);
2918                 DEBUG_print("recalc pos: %lx / %lx\n", (long)recalc_pos, (long)ic->provided_data_sectors);
2919                 if (recalc_pos < ic->provided_data_sectors) {
2920                         queue_work(ic->recalc_wq, &ic->recalc_work);
2921                 } else if (recalc_pos > ic->provided_data_sectors) {
2922                         ic->sb->recalc_sector = cpu_to_le64(ic->provided_data_sectors);
2923                         recalc_write_super(ic);
2924                 }
2925         }
2926
2927         ic->reboot_notifier.notifier_call = dm_integrity_reboot;
2928         ic->reboot_notifier.next = NULL;
2929         ic->reboot_notifier.priority = INT_MAX - 1;     /* be notified after md and before hardware drivers */
2930         WARN_ON(register_reboot_notifier(&ic->reboot_notifier));
2931
2932 #if 0
2933         /* set to 1 to stress test synchronous mode */
2934         dm_integrity_enter_synchronous_mode(ic);
2935 #endif
2936 }
2937
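/*
 * Report target status: STATUSTYPE_INFO emits the mismatch count, the
 * number of provided data sectors and the recalculation position;
 * STATUSTYPE_TABLE reconstructs the table line including all optional
 * arguments.
 */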
2938 static void dm_integrity_status(struct dm_target *ti, status_type_t type,
2939                                 unsigned status_flags, char *result, unsigned maxlen)
2940 {
2941         struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
2942         unsigned arg_count;
2943         size_t sz = 0;
2944
2945         switch (type) {
2946         case STATUSTYPE_INFO:
2947                 DMEMIT("%llu %llu",
2948                         (unsigned long long)atomic64_read(&ic->number_of_mismatches),
2949                         (unsigned long long)ic->provided_data_sectors);
2950                 if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
2951                         DMEMIT(" %llu", (unsigned long long)le64_to_cpu(ic->sb->recalc_sector));
2952                 else
2953                         DMEMIT(" -");
2954                 break;
2955
2956         case STATUSTYPE_TABLE: {
2957                 __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100;
2958                 watermark_percentage += ic->journal_entries / 2;
2959                 do_div(watermark_percentage, ic->journal_entries);
2960                 arg_count = 3;
2961                 arg_count += !!ic->meta_dev;
2962                 arg_count += ic->sectors_per_block != 1;
2963                 arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING));
2964                 arg_count += ic->mode == 'J';
2965                 arg_count += ic->mode == 'J';
2966                 arg_count += ic->mode == 'B';
2967                 arg_count += ic->mode == 'B';
2968                 arg_count += !!ic->internal_hash_alg.alg_string;
2969                 arg_count += !!ic->journal_crypt_alg.alg_string;
2970                 arg_count += !!ic->journal_mac_alg.alg_string;
2971                 arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0;
2972                 DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start,
2973                        ic->tag_size, ic->mode, arg_count);
2974                 if (ic->meta_dev)
2975                         DMEMIT(" meta_device:%s", ic->meta_dev->name);
2976                 if (ic->sectors_per_block != 1)
2977                         DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
2978                 if (ic->recalculate_flag)
2979                         DMEMIT(" recalculate");
2980                 DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
2981                 DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
2982                 DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
2983                 if (ic->mode == 'J') {
2984                         DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage);
2985                         DMEMIT(" commit_time:%u", ic->autocommit_msec);
2986                 }
2987                 if (ic->mode == 'B') {
2988                         DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit);
2989                         DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval));
2990                 }
2991                 if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0)
2992                         DMEMIT(" fix_padding");
2993
2994 #define EMIT_ALG(a, n)                                                  \
2995                 do {                                                    \
2996                         if (ic->a.alg_string) {                         \
2997                                 DMEMIT(" %s:%s", n, ic->a.alg_string);  \
2998                                 if (ic->a.key_string)                   \
2999                                         DMEMIT(":%s", ic->a.key_string);\
3000                         }                                               \
3001                 } while (0)
3002                 EMIT_ALG(internal_hash_alg, "internal_hash");
3003                 EMIT_ALG(journal_crypt_alg, "journal_crypt");
3004                 EMIT_ALG(journal_mac_alg, "journal_mac");
3005                 break;
3006         }
3007         }
3008 }
3009
3010 static int dm_integrity_iterate_devices(struct dm_target *ti,
3011                                         iterate_devices_callout_fn fn, void *data)
3012 {
3013         struct dm_integrity_c *ic = ti->private;
3014
3015         if (!ic->meta_dev)
3016                 return fn(ti, ic->dev, ic->start + ic->initial_sectors + ic->metadata_run, ti->len, data);
3017         else
3018                 return fn(ti, ic->dev, 0, ti->len, data);
3019 }
3020
3021 static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *limits)
3022 {
3023         struct dm_integrity_c *ic = ti->private;
3024
3025         if (ic->sectors_per_block > 1) {
3026                 limits->logical_block_size = ic->sectors_per_block << SECTOR_SHIFT;
3027                 limits->physical_block_size = ic->sectors_per_block << SECTOR_SHIFT;
3028                 blk_limits_io_min(limits, ic->sectors_per_block << SECTOR_SHIFT);
3029         }
3030 }
3031
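/*
 * Derive the journal geometry from the superblock: the size of one journal
 * entry, how many entries fit in a sector and in a section, how many
 * sectors a section occupies and the total number of journal entries.
 */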
3032 static void calculate_journal_section_size(struct dm_integrity_c *ic)
3033 {
3034         unsigned sector_space = JOURNAL_SECTOR_DATA;
3035
3036         ic->journal_sections = le32_to_cpu(ic->sb->journal_sections);
3037         ic->journal_entry_size = roundup(offsetof(struct journal_entry, last_bytes[ic->sectors_per_block]) + ic->tag_size,
3038                                          JOURNAL_ENTRY_ROUNDUP);
3039
3040         if (ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC))
3041                 sector_space -= JOURNAL_MAC_PER_SECTOR;
3042         ic->journal_entries_per_sector = sector_space / ic->journal_entry_size;
3043         ic->journal_section_entries = ic->journal_entries_per_sector * JOURNAL_BLOCK_SECTORS;
3044         ic->journal_section_sectors = (ic->journal_section_entries << ic->sb->log2_sectors_per_block) + JOURNAL_BLOCK_SECTORS;
3045         ic->journal_entries = ic->journal_section_entries * ic->journal_sections;
3046 }
3047
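/*
 * Compute the on-device layout (sectors occupied by the superblock and the
 * journal, and the metadata run size) and verify that data and metadata fit
 * on the underlying device(s); returns -EINVAL if they do not.
 */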
3048 static int calculate_device_limits(struct dm_integrity_c *ic)
3049 {
3050         __u64 initial_sectors;
3051
3052         calculate_journal_section_size(ic);
3053         initial_sectors = SB_SECTORS + (__u64)ic->journal_section_sectors * ic->journal_sections;
3054         if (initial_sectors + METADATA_PADDING_SECTORS >= ic->meta_device_sectors || initial_sectors > UINT_MAX)
3055                 return -EINVAL;
3056         ic->initial_sectors = initial_sectors;
3057
3058         if (!ic->meta_dev) {
3059                 sector_t last_sector, last_area, last_offset;
3060
3061                 /* we have to maintain excessive padding for compatibility with existing volumes */
3062                 __u64 metadata_run_padding =
3063                         ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING) ?
3064                         (__u64)(METADATA_PADDING_SECTORS << SECTOR_SHIFT) :
3065                         (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS);
3066
3067                 ic->metadata_run = round_up((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block),
3068                                             metadata_run_padding) >> SECTOR_SHIFT;
3069                 if (!(ic->metadata_run & (ic->metadata_run - 1)))
3070                         ic->log2_metadata_run = __ffs(ic->metadata_run);
3071                 else
3072                         ic->log2_metadata_run = -1;
3073
3074                 get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset);
3075                 last_sector = get_data_sector(ic, last_area, last_offset);
3076                 if (last_sector < ic->start || last_sector >= ic->meta_device_sectors)
3077                         return -EINVAL;
3078         } else {
3079                 __u64 meta_size = (ic->provided_data_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size;
3080                 meta_size = (meta_size + ((1U << (ic->log2_buffer_sectors + SECTOR_SHIFT)) - 1))
3081                                 >> (ic->log2_buffer_sectors + SECTOR_SHIFT);
3082                 meta_size <<= ic->log2_buffer_sectors;
3083                 if (ic->initial_sectors + meta_size < ic->initial_sectors ||
3084                     ic->initial_sectors + meta_size > ic->meta_device_sectors)
3085                         return -EINVAL;
3086                 ic->metadata_run = 1;
3087                 ic->log2_metadata_run = 0;
3088         }
3089
3090         return 0;
3091 }
3092
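/*
 * Fill in a freshly formatted superblock: magic, tag size, block size,
 * journal geometry and interleave settings. Without a separate metadata
 * device, find the largest provided_data_sectors that still passes
 * calculate_device_limits(); with one, find the largest journal that fits.
 */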
3093 static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sectors, unsigned interleave_sectors)
3094 {
3095         unsigned journal_sections;
3096         int test_bit;
3097
3098         memset(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT);
3099         memcpy(ic->sb->magic, SB_MAGIC, 8);
3100         ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size);
3101         ic->sb->log2_sectors_per_block = __ffs(ic->sectors_per_block);
3102         if (ic->journal_mac_alg.alg_string)
3103                 ic->sb->flags |= cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC);
3104
3105         calculate_journal_section_size(ic);
3106         journal_sections = journal_sectors / ic->journal_section_sectors;
3107         if (!journal_sections)
3108                 journal_sections = 1;
3109
3110         if (!ic->meta_dev) {
3111                 if (ic->fix_padding)
3112                         ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_PADDING);
3113                 ic->sb->journal_sections = cpu_to_le32(journal_sections);
3114                 if (!interleave_sectors)
3115                         interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
3116                 ic->sb->log2_interleave_sectors = __fls(interleave_sectors);
3117                 ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
3118                 ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
3119
3120                 ic->provided_data_sectors = 0;
3121                 for (test_bit = fls64(ic->meta_device_sectors) - 1; test_bit >= 3; test_bit--) {
3122                         __u64 prev_data_sectors = ic->provided_data_sectors;
3123
3124                         ic->provided_data_sectors |= (sector_t)1 << test_bit;
3125                         if (calculate_device_limits(ic))
3126                                 ic->provided_data_sectors = prev_data_sectors;
3127                 }
3128                 if (!ic->provided_data_sectors)
3129                         return -EINVAL;
3130         } else {
3131                 ic->sb->log2_interleave_sectors = 0;
3132                 ic->provided_data_sectors = ic->data_device_sectors;
3133                 ic->provided_data_sectors &= ~(sector_t)(ic->sectors_per_block - 1);
3134
3135 try_smaller_buffer:
3136                 ic->sb->journal_sections = cpu_to_le32(0);
3137                 for (test_bit = fls(journal_sections) - 1; test_bit >= 0; test_bit--) {
3138                         __u32 prev_journal_sections = le32_to_cpu(ic->sb->journal_sections);
3139                         __u32 test_journal_sections = prev_journal_sections | (1U << test_bit);
3140                         if (test_journal_sections > journal_sections)
3141                                 continue;
3142                         ic->sb->journal_sections = cpu_to_le32(test_journal_sections);
3143                         if (calculate_device_limits(ic))
3144                                 ic->sb->journal_sections = cpu_to_le32(prev_journal_sections);
3145
3146                 }
3147                 if (!le32_to_cpu(ic->sb->journal_sections)) {
3148                         if (ic->log2_buffer_sectors > 3) {
3149                                 ic->log2_buffer_sectors--;
3150                                 goto try_smaller_buffer;
3151                         }
3152                         return -EINVAL;
3153                 }
3154         }
3155
3156         ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors);
3157
3158         sb_set_version(ic);
3159
3160         return 0;
3161 }
3162
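/*
 * Register a blk_integrity profile on the mapped device so that the block
 * layer allocates integrity payloads matching our tag size and block size.
 */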
3163 static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic)
3164 {
3165         struct gendisk *disk = dm_disk(dm_table_get_md(ti->table));
3166         struct blk_integrity bi;
3167
3168         memset(&bi, 0, sizeof(bi));
3169         bi.profile = &dm_integrity_profile;
3170         bi.tuple_size = ic->tag_size;
3171         bi.tag_size = bi.tuple_size;
3172         bi.interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT;
3173
3174         blk_integrity_register(disk, &bi);
3175         blk_queue_max_integrity_segments(disk->queue, UINT_MAX);
3176 }
3177
3178 static void dm_integrity_free_page_list(struct page_list *pl)
3179 {
3180         unsigned i;
3181
3182         if (!pl)
3183                 return;
3184         for (i = 0; pl[i].page; i++)
3185                 __free_page(pl[i].page);
3186         kvfree(pl);
3187 }
3188
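/*
 * Allocate a NULL-terminated page_list of n_pages pages linked through the
 * ->next pointers; on failure, free any partial allocation and return NULL.
 */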
3189 static struct page_list *dm_integrity_alloc_page_list(unsigned n_pages)
3190 {
3191         struct page_list *pl;
3192         unsigned i;
3193
3194         pl = kvmalloc_array(n_pages + 1, sizeof(struct page_list), GFP_KERNEL | __GFP_ZERO);
3195         if (!pl)
3196                 return NULL;
3197
3198         for (i = 0; i < n_pages; i++) {
3199                 pl[i].page = alloc_page(GFP_KERNEL);
3200                 if (!pl[i].page) {
3201                         dm_integrity_free_page_list(pl);
3202                         return NULL;
3203                 }
3204                 if (i)
3205                         pl[i - 1].next = &pl[i];
3206         }
3207         pl[i].page = NULL;
3208         pl[i].next = NULL;
3209
3210         return pl;
3211 }
3212
3213 static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, struct scatterlist **sl)
3214 {
3215         unsigned i;
3216         for (i = 0; i < ic->journal_sections; i++)
3217                 kvfree(sl[i]);
3218         kvfree(sl);
3219 }
3220
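/*
 * Build one scatterlist per journal section, describing where that section
 * lives within the page list pl.
 */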
3221 static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic,
3222                                                                    struct page_list *pl)
3223 {
3224         struct scatterlist **sl;
3225         unsigned i;
3226
3227         sl = kvmalloc_array(ic->journal_sections,
3228                             sizeof(struct scatterlist *),
3229                             GFP_KERNEL | __GFP_ZERO);
3230         if (!sl)
3231                 return NULL;
3232
3233         for (i = 0; i < ic->journal_sections; i++) {
3234                 struct scatterlist *s;
3235                 unsigned start_index, start_offset;
3236                 unsigned end_index, end_offset;
3237                 unsigned n_pages;
3238                 unsigned idx;
3239
3240                 page_list_location(ic, i, 0, &start_index, &start_offset);
3241                 page_list_location(ic, i, ic->journal_section_sectors - 1,
3242                                    &end_index, &end_offset);
3243
3244                 n_pages = (end_index - start_index + 1);
3245
3246                 s = kvmalloc_array(n_pages, sizeof(struct scatterlist),
3247                                    GFP_KERNEL);
3248                 if (!s) {
3249                         dm_integrity_free_journal_scatterlist(ic, sl);
3250                         return NULL;
3251                 }
3252
3253                 sg_init_table(s, n_pages);
3254                 for (idx = start_index; idx <= end_index; idx++) {
3255                         char *va = lowmem_page_address(pl[idx].page);
3256                         unsigned start = 0, end = PAGE_SIZE;
3257                         if (idx == start_index)
3258                                 start = start_offset;
3259                         if (idx == end_index)
3260                                 end = end_offset + (1 << SECTOR_SHIFT);
3261                         sg_set_buf(&s[idx - start_index], va + start, end - start);
3262                 }
3263
3264                 sl[i] = s;
3265         }
3266
3267         return sl;
3268 }
3269
3270 static void free_alg(struct alg_spec *a)
3271 {
3272         kzfree(a->alg_string);
3273         kzfree(a->key);
3274         memset(a, 0, sizeof *a);
3275 }
3276
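/*
 * Parse an "option:algorithm[:key]" argument: duplicate the algorithm name
 * and, if a key is present, convert it from hex to binary.
 */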
3277 static int get_alg_and_key(const char *arg, struct alg_spec *a, char **error, char *error_inval)
3278 {
3279         char *k;
3280
3281         free_alg(a);
3282
3283         a->alg_string = kstrdup(strchr(arg, ':') + 1, GFP_KERNEL);
3284         if (!a->alg_string)
3285                 goto nomem;
3286
3287         k = strchr(a->alg_string, ':');
3288         if (k) {
3289                 *k = 0;
3290                 a->key_string = k + 1;
3291                 if (strlen(a->key_string) & 1)
3292                         goto inval;
3293
3294                 a->key_size = strlen(a->key_string) / 2;
3295                 a->key = kmalloc(a->key_size, GFP_KERNEL);
3296                 if (!a->key)
3297                         goto nomem;
3298                 if (hex2bin(a->key, a->key_string, a->key_size))
3299                         goto inval;
3300         }
3301
3302         return 0;
3303 inval:
3304         *error = error_inval;
3305         return -EINVAL;
3306 nomem:
3307         *error = "Out of memory for an argument";
3308         return -ENOMEM;
3309 }
3310
3311 static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error,
3312                    char *error_alg, char *error_key)
3313 {
3314         int r;
3315
3316         if (a->alg_string) {
3317                 *hash = crypto_alloc_shash(a->alg_string, 0, 0);
3318                 if (IS_ERR(*hash)) {
3319                         *error = error_alg;
3320                         r = PTR_ERR(*hash);
3321                         *hash = NULL;
3322                         return r;
3323                 }
3324
3325                 if (a->key) {
3326                         r = crypto_shash_setkey(*hash, a->key, a->key_size);
3327                         if (r) {
3328                                 *error = error_key;
3329                                 return r;
3330                         }
3331                 } else if (crypto_shash_get_flags(*hash) & CRYPTO_TFM_NEED_KEY) {
3332                         *error = error_key;
3333                         return -ENOKEY;
3334                 }
3335         }
3336
3337         return 0;
3338 }
3339
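/*
 * Allocate the in-memory journal and, if journal encryption is requested,
 * set up the cipher: with a cipher block size of 1 a keystream is
 * precomputed in journal_xor and the cipher is freed, otherwise per-section
 * scatterlists, IVs and skcipher requests are allocated. The four commit
 * ids are made distinct and the journal tree is allocated.
 */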
3340 static int create_journal(struct dm_integrity_c *ic, char **error)
3341 {
3342         int r = 0;
3343         unsigned i;
3344         __u64 journal_pages, journal_desc_size, journal_tree_size;
3345         unsigned char *crypt_data = NULL, *crypt_iv = NULL;
3346         struct skcipher_request *req = NULL;
3347
3348         ic->commit_ids[0] = cpu_to_le64(0x1111111111111111ULL);
3349         ic->commit_ids[1] = cpu_to_le64(0x2222222222222222ULL);
3350         ic->commit_ids[2] = cpu_to_le64(0x3333333333333333ULL);
3351         ic->commit_ids[3] = cpu_to_le64(0x4444444444444444ULL);
3352
3353         journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors,
3354                                 PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT);
3355         journal_desc_size = journal_pages * sizeof(struct page_list);
3356         if (journal_pages >= totalram_pages() - totalhigh_pages() || journal_desc_size > ULONG_MAX) {
3357                 *error = "Journal doesn't fit into memory";
3358                 r = -ENOMEM;
3359                 goto bad;
3360         }
3361         ic->journal_pages = journal_pages;
3362
3363         ic->journal = dm_integrity_alloc_page_list(ic->journal_pages);
3364         if (!ic->journal) {
3365                 *error = "Could not allocate memory for journal";
3366                 r = -ENOMEM;
3367                 goto bad;
3368         }
3369         if (ic->journal_crypt_alg.alg_string) {
3370                 unsigned ivsize, blocksize;
3371                 struct journal_completion comp;
3372
3373                 comp.ic = ic;
3374                 ic->journal_crypt = crypto_alloc_skcipher(ic->journal_crypt_alg.alg_string, 0, 0);
3375                 if (IS_ERR(ic->journal_crypt)) {
3376                         *error = "Invalid journal cipher";
3377                         r = PTR_ERR(ic->journal_crypt);
3378                         ic->journal_crypt = NULL;
3379                         goto bad;
3380                 }
3381                 ivsize = crypto_skcipher_ivsize(ic->journal_crypt);
3382                 blocksize = crypto_skcipher_blocksize(ic->journal_crypt);
3383
3384                 if (ic->journal_crypt_alg.key) {
3385                         r = crypto_skcipher_setkey(ic->journal_crypt, ic->journal_crypt_alg.key,
3386                                                    ic->journal_crypt_alg.key_size);
3387                         if (r) {
3388                                 *error = "Error setting encryption key";
3389                                 goto bad;
3390                         }
3391                 }
3392                 DEBUG_print("cipher %s, block size %u iv size %u\n",
3393                             ic->journal_crypt_alg.alg_string, blocksize, ivsize);
3394
3395                 ic->journal_io = dm_integrity_alloc_page_list(ic->journal_pages);
3396                 if (!ic->journal_io) {
3397                         *error = "Could not allocate memory for journal io";
3398                         r = -ENOMEM;
3399                         goto bad;
3400                 }
3401
3402                 if (blocksize == 1) {
3403                         struct scatterlist *sg;
3404
3405                         req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
3406                         if (!req) {
3407                                 *error = "Could not allocate crypt request";
3408                                 r = -ENOMEM;
3409                                 goto bad;
3410                         }
3411
3412                         crypt_iv = kzalloc(ivsize, GFP_KERNEL);
3413                         if (!crypt_iv) {
3414                                 *error = "Could not allocate iv";
3415                                 r = -ENOMEM;
3416                                 goto bad;
3417                         }
3418
3419                         ic->journal_xor = dm_integrity_alloc_page_list(ic->journal_pages);
3420                         if (!ic->journal_xor) {
3421                                 *error = "Could not allocate memory for journal xor";
3422                                 r = -ENOMEM;
3423                                 goto bad;
3424                         }
3425
3426                         sg = kvmalloc_array(ic->journal_pages + 1,
3427                                             sizeof(struct scatterlist),
3428                                             GFP_KERNEL);
3429                         if (!sg) {
3430                                 *error = "Unable to allocate sg list";
3431                                 r = -ENOMEM;
3432                                 goto bad;
3433                         }
3434                         sg_init_table(sg, ic->journal_pages + 1);
3435                         for (i = 0; i < ic->journal_pages; i++) {
3436                                 char *va = lowmem_page_address(ic->journal_xor[i].page);
3437                                 clear_page(va);
3438                                 sg_set_buf(&sg[i], va, PAGE_SIZE);
3439                         }
3440                         sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids);
3441
3442                         skcipher_request_set_crypt(req, sg, sg,
3443                                                    PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv);
3444                         init_completion(&comp.comp);
3445                         comp.in_flight = (atomic_t)ATOMIC_INIT(1);
3446                         if (do_crypt(true, req, &comp))
3447                                 wait_for_completion(&comp.comp);
3448                         kvfree(sg);
3449                         r = dm_integrity_failed(ic);
3450                         if (r) {
3451                                 *error = "Unable to encrypt journal";
3452                                 goto bad;
3453                         }
3454                         DEBUG_bytes(lowmem_page_address(ic->journal_xor[0].page), 64, "xor data");
3455
3456                         crypto_free_skcipher(ic->journal_crypt);
3457                         ic->journal_crypt = NULL;
3458                 } else {
3459                         unsigned crypt_len = roundup(ivsize, blocksize);
3460
3461                         req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
3462                         if (!req) {
3463                                 *error = "Could not allocate crypt request";
3464                                 r = -ENOMEM;
3465                                 goto bad;
3466                         }
3467
3468                         crypt_iv = kmalloc(ivsize, GFP_KERNEL);
3469                         if (!crypt_iv) {
3470                                 *error = "Could not allocate iv";
3471                                 r = -ENOMEM;
3472                                 goto bad;
3473                         }
3474
3475                         crypt_data = kmalloc(crypt_len, GFP_KERNEL);
3476                         if (!crypt_data) {
3477                                 *error = "Unable to allocate crypt data";
3478                                 r = -ENOMEM;
3479                                 goto bad;
3480                         }
3481
3482                         ic->journal_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal);
3483                         if (!ic->journal_scatterlist) {
3484                                 *error = "Unable to allocate sg list";
3485                                 r = -ENOMEM;
3486                                 goto bad;
3487                         }
3488                         ic->journal_io_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal_io);
3489                         if (!ic->journal_io_scatterlist) {
3490                                 *error = "Unable to allocate sg list";
3491                                 r = -ENOMEM;
3492                                 goto bad;
3493                         }
3494                         ic->sk_requests = kvmalloc_array(ic->journal_sections,
3495                                                          sizeof(struct skcipher_request *),
3496                                                          GFP_KERNEL | __GFP_ZERO);
3497                         if (!ic->sk_requests) {
3498                                 *error = "Unable to allocate sk requests";
3499                                 r = -ENOMEM;
3500                                 goto bad;
3501                         }
3502                         for (i = 0; i < ic->journal_sections; i++) {
3503                                 struct scatterlist sg;
3504                                 struct skcipher_request *section_req;
3505                                 __u32 section_le = cpu_to_le32(i);
3506
3507                                 memset(crypt_iv, 0x00, ivsize);
3508                                 memset(crypt_data, 0x00, crypt_len);
3509                                 memcpy(crypt_data, &section_le, min((size_t)crypt_len, sizeof(section_le)));
3510
3511                                 sg_init_one(&sg, crypt_data, crypt_len);
3512                                 skcipher_request_set_crypt(req, &sg, &sg, crypt_len, crypt_iv);
3513                                 init_completion(&comp.comp);
3514                                 comp.in_flight = (atomic_t)ATOMIC_INIT(1);
3515                                 if (do_crypt(true, req, &comp))
3516                                         wait_for_completion(&comp.comp);
3517
3518                                 r = dm_integrity_failed(ic);
3519                                 if (r) {
3520                                         *error = "Unable to generate iv";
3521                                         goto bad;
3522                                 }
3523
3524                                 section_req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
3525                                 if (!section_req) {
3526                                         *error = "Unable to allocate crypt request";
3527                                         r = -ENOMEM;
3528                                         goto bad;
3529                                 }
3530                                 section_req->iv = kmalloc_array(ivsize, 2,
3531                                                                 GFP_KERNEL);
3532                                 if (!section_req->iv) {
3533                                         skcipher_request_free(section_req);
3534                                         *error = "Unable to allocate iv";
3535                                         r = -ENOMEM;
3536                                         goto bad;
3537                                 }
3538                                 memcpy(section_req->iv + ivsize, crypt_data, ivsize);
3539                                 section_req->cryptlen = (size_t)ic->journal_section_sectors << SECTOR_SHIFT;
3540                                 ic->sk_requests[i] = section_req;
3541                                 DEBUG_bytes(crypt_data, ivsize, "iv(%u)", i);
3542                         }
3543                 }
3544         }
3545
3546         for (i = 0; i < N_COMMIT_IDS; i++) {
3547                 unsigned j;
3548 retest_commit_id:
3549                 for (j = 0; j < i; j++) {
3550                         if (ic->commit_ids[j] == ic->commit_ids[i]) {
3551                                 ic->commit_ids[i] = cpu_to_le64(le64_to_cpu(ic->commit_ids[i]) + 1);
3552                                 goto retest_commit_id;
3553                         }
3554                 }
3555                 DEBUG_print("commit id %u: %016llx\n", i, ic->commit_ids[i]);
3556         }
3557
3558         journal_tree_size = (__u64)ic->journal_entries * sizeof(struct journal_node);
3559         if (journal_tree_size > ULONG_MAX) {
3560                 *error = "Journal doesn't fit into memory";
3561                 r = -ENOMEM;
3562                 goto bad;
3563         }
3564         ic->journal_tree = kvmalloc(journal_tree_size, GFP_KERNEL);
3565         if (!ic->journal_tree) {
3566                 *error = "Could not allocate memory for journal tree";
3567                 r = -ENOMEM;
3568         }
3569 bad:
3570         kfree(crypt_data);
3571         kfree(crypt_iv);
3572         skcipher_request_free(req);
3573
3574         return r;
3575 }
3576
3577 /*
3578  * Construct a integrity mapping
3579  * Construct an integrity mapping
3580  * Arguments:
3581  *      device
3582  *      offset from the start of the device
3583  *      tag size
3584  *      D - direct writes, J - journal writes, B - bitmap mode, R - recovery mode
3585  *      number of optional arguments
3586  *      optional arguments:
3587  *              journal_sectors
3588  *              interleave_sectors
3589  *              buffer_sectors
3590  *              journal_watermark
3591  *              commit_time
3592  *              meta_device
3593  *              block_size
3594  *              sectors_per_bit
3595  *              bitmap_flush_interval
3596  *              internal_hash
3597  *              journal_crypt
3598  *              journal_mac
3599  *              recalculate
 *              fix_padding
3600  */
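/*
 * Illustrative example of a table line built from the arguments above
 * (the device path, length and algorithm are placeholders only):
 *
 *	0 1000000 integrity /dev/sdb 0 - J 2 journal_sectors:16384 internal_hash:crc32c
 */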
3601 static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
3602 {
3603         struct dm_integrity_c *ic;
3604         char dummy;
3605         int r;
3606         unsigned extra_args;
3607         struct dm_arg_set as;
3608         static const struct dm_arg _args[] = {
3609                 {0, 9, "Invalid number of feature args"},
3610         };
3611         unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec;
3612         bool should_write_sb;
3613         __u64 threshold;
3614         unsigned long long start;
3615         __s8 log2_sectors_per_bitmap_bit = -1;
3616         __s8 log2_blocks_per_bitmap_bit;
3617         __u64 bits_in_journal;
3618         __u64 n_bitmap_bits;
3619
3620 #define DIRECT_ARGUMENTS        4
3621
3622         if (argc <= DIRECT_ARGUMENTS) {
3623                 ti->error = "Invalid argument count";
3624                 return -EINVAL;
3625         }
3626
3627         ic = kzalloc(sizeof(struct dm_integrity_c), GFP_KERNEL);
3628         if (!ic) {
3629                 ti->error = "Cannot allocate integrity context";
3630                 return -ENOMEM;
3631         }
3632         ti->private = ic;
3633         ti->per_io_data_size = sizeof(struct dm_integrity_io);
3634
3635         ic->in_progress = RB_ROOT;
3636         INIT_LIST_HEAD(&ic->wait_list);
3637         init_waitqueue_head(&ic->endio_wait);
3638         bio_list_init(&ic->flush_bio_list);
3639         init_waitqueue_head(&ic->copy_to_journal_wait);
3640         init_completion(&ic->crypto_backoff);
3641         atomic64_set(&ic->number_of_mismatches, 0);
3642         ic->bitmap_flush_interval = BITMAP_FLUSH_INTERVAL;
3643
3644         r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev);
3645         if (r) {
3646                 ti->error = "Device lookup failed";
3647                 goto bad;
3648         }
3649
3650         if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1 || start != (sector_t)start) {
3651                 ti->error = "Invalid starting offset";
3652                 r = -EINVAL;
3653                 goto bad;
3654         }
3655         ic->start = start;
3656
3657         if (strcmp(argv[2], "-")) {
3658                 if (sscanf(argv[2], "%u%c", &ic->tag_size, &dummy) != 1 || !ic->tag_size) {
3659                         ti->error = "Invalid tag size";
3660                         r = -EINVAL;
3661                         goto bad;
3662                 }
3663         }
3664
3665         if (!strcmp(argv[3], "J") || !strcmp(argv[3], "B") ||
3666             !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) {
3667                 ic->mode = argv[3][0];
3668         } else {
3669                 ti->error = "Invalid mode (expecting J, B, D, R)";
3670                 r = -EINVAL;
3671                 goto bad;
3672         }
3673
3674         journal_sectors = 0;
3675         interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
3676         buffer_sectors = DEFAULT_BUFFER_SECTORS;
3677         journal_watermark = DEFAULT_JOURNAL_WATERMARK;
3678         sync_msec = DEFAULT_SYNC_MSEC;
3679         ic->sectors_per_block = 1;
3680
3681         as.argc = argc - DIRECT_ARGUMENTS;
3682         as.argv = argv + DIRECT_ARGUMENTS;
3683         r = dm_read_arg_group(_args, &as, &extra_args, &ti->error);
3684         if (r)
3685                 goto bad;
3686
3687         while (extra_args--) {
3688                 const char *opt_string;
3689                 unsigned val;
3690                 unsigned long long llval;
3691                 opt_string = dm_shift_arg(&as);
3692                 if (!opt_string) {
3693                         r = -EINVAL;
3694                         ti->error = "Not enough feature arguments";
3695                         goto bad;
3696                 }
3697                 if (sscanf(opt_string, "journal_sectors:%u%c", &val, &dummy) == 1)
3698                         journal_sectors = val ? val : 1;
3699                 else if (sscanf(opt_string, "interleave_sectors:%u%c", &val, &dummy) == 1)
3700                         interleave_sectors = val;
3701                 else if (sscanf(opt_string, "buffer_sectors:%u%c", &val, &dummy) == 1)
3702                         buffer_sectors = val;
3703                 else if (sscanf(opt_string, "journal_watermark:%u%c", &val, &dummy) == 1 && val <= 100)
3704                         journal_watermark = val;
3705                 else if (sscanf(opt_string, "commit_time:%u%c", &val, &dummy) == 1)
3706                         sync_msec = val;
3707                 else if (!strncmp(opt_string, "meta_device:", strlen("meta_device:"))) {
3708                         if (ic->meta_dev) {
3709                                 dm_put_device(ti, ic->meta_dev);
3710                                 ic->meta_dev = NULL;
3711                         }
3712                         r = dm_get_device(ti, strchr(opt_string, ':') + 1,
3713                                           dm_table_get_mode(ti->table), &ic->meta_dev);
3714                         if (r) {
3715                                 ti->error = "Device lookup failed";
3716                                 goto bad;
3717                         }
3718                 } else if (sscanf(opt_string, "block_size:%u%c", &val, &dummy) == 1) {
3719                         if (val < 1 << SECTOR_SHIFT ||
3720                             val > MAX_SECTORS_PER_BLOCK << SECTOR_SHIFT ||
3721                             (val & (val - 1))) {
3722                                 r = -EINVAL;
3723                                 ti->error = "Invalid block_size argument";
3724                                 goto bad;
3725                         }
3726                         ic->sectors_per_block = val >> SECTOR_SHIFT;
3727                 } else if (sscanf(opt_string, "sectors_per_bit:%llu%c", &llval, &dummy) == 1) {
3728                         log2_sectors_per_bitmap_bit = !llval ? 0 : __ilog2_u64(llval);
3729                 } else if (sscanf(opt_string, "bitmap_flush_interval:%u%c", &val, &dummy) == 1) {
3730                         if (val >= (uint64_t)UINT_MAX * 1000 / HZ) {
3731                                 r = -EINVAL;
3732                                 ti->error = "Invalid bitmap_flush_interval argument";
                                goto bad;
3733                         }
3734                         ic->bitmap_flush_interval = msecs_to_jiffies(val);
3735                 } else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) {
3736                         r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error,
3737                                             "Invalid internal_hash argument");
3738                         if (r)
3739                                 goto bad;
3740                 } else if (!strncmp(opt_string, "journal_crypt:", strlen("journal_crypt:"))) {
3741                         r = get_alg_and_key(opt_string, &ic->journal_crypt_alg, &ti->error,
3742                                             "Invalid journal_crypt argument");
3743                         if (r)
3744                                 goto bad;
3745                 } else if (!strncmp(opt_string, "journal_mac:", strlen("journal_mac:"))) {
3746                         r = get_alg_and_key(opt_string, &ic->journal_mac_alg,  &ti->error,
3747                                             "Invalid journal_mac argument");
3748                         if (r)
3749                                 goto bad;
3750                 } else if (!strcmp(opt_string, "recalculate")) {
3751                         ic->recalculate_flag = true;
3752                 } else if (!strcmp(opt_string, "fix_padding")) {
3753                         ic->fix_padding = true;
3754                 } else {
3755                         r = -EINVAL;
3756                         ti->error = "Invalid argument";
3757                         goto bad;
3758                 }
3759         }
3760
3761         ic->data_device_sectors = i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT;
3762         if (!ic->meta_dev)
3763                 ic->meta_device_sectors = ic->data_device_sectors;
3764         else
3765                 ic->meta_device_sectors = i_size_read(ic->meta_dev->bdev->bd_inode) >> SECTOR_SHIFT;
3766
3767         if (!journal_sectors) {
3768                 journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS,
3769                                       ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR);
3770         }
3771
3772         if (!buffer_sectors)
3773                 buffer_sectors = 1;
3774         ic->log2_buffer_sectors = min((int)__fls(buffer_sectors), 31 - SECTOR_SHIFT);
3775
3776         r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error,
3777                     "Invalid internal hash", "Error setting internal hash key");
3778         if (r)
3779                 goto bad;
3780
3781         r = get_mac(&ic->journal_mac, &ic->journal_mac_alg, &ti->error,
3782                     "Invalid journal mac", "Error setting journal mac key");
3783         if (r)
3784                 goto bad;
3785
3786         if (!ic->tag_size) {
3787                 if (!ic->internal_hash) {
3788                         ti->error = "Unknown tag size";
3789                         r = -EINVAL;
3790                         goto bad;
3791                 }
3792                 ic->tag_size = crypto_shash_digestsize(ic->internal_hash);
3793         }
3794         if (ic->tag_size > MAX_TAG_SIZE) {
3797                 ti->error = "Tag size is too large";
3796                 r = -EINVAL;
3797                 goto bad;
3798         }
3799         if (!(ic->tag_size & (ic->tag_size - 1)))
3800                 ic->log2_tag_size = __ffs(ic->tag_size);
3801         else
3802                 ic->log2_tag_size = -1;
3803
3804         if (ic->mode == 'B' && !ic->internal_hash) {
3805                 r = -EINVAL;
3808                 ti->error = "Bitmap mode can only be used with internal hash";
3807                 goto bad;
3808         }
3809
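             /*
              * Set up the autocommit timer that forces a journal commit after
              * the configured commit interval has elapsed.
              */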
3810         ic->autocommit_jiffies = msecs_to_jiffies(sync_msec);
3811         ic->autocommit_msec = sync_msec;
3812         timer_setup(&ic->autocommit_timer, autocommit_fn, 0);
3813
3814         ic->io = dm_io_client_create();
3815         if (IS_ERR(ic->io)) {
3816                 r = PTR_ERR(ic->io);
3817                 ic->io = NULL;
3818                 ti->error = "Cannot allocate dm io";
3819                 goto bad;
3820         }
3821
3822         r = mempool_init_slab_pool(&ic->journal_io_mempool, JOURNAL_IO_MEMPOOL, journal_io_cache);
3823         if (r) {
3824                 ti->error = "Cannot allocate mempool";
3825                 goto bad;
3826         }
3827
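             /*
              * The target uses several dedicated workqueues: commit_wq runs
              * integrity_commit(), writer_wq (journal/bitmap modes only) runs
              * integrity_writer(), and wait_wq is deliberately single-threaded
              * and unbound (see the comment below) so that waiting bios are not
              * reordered.
              */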
3828         ic->metadata_wq = alloc_workqueue("dm-integrity-metadata",
3829                                           WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE);
3830         if (!ic->metadata_wq) {
3831                 ti->error = "Cannot allocate workqueue";
3832                 r = -ENOMEM;
3833                 goto bad;
3834         }
3835
3836         /*
3837          * If this workqueue were percpu, it would cause bio reordering
3838          * and reduced performance.
3839          */
3840         ic->wait_wq = alloc_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3841         if (!ic->wait_wq) {
3842                 ti->error = "Cannot allocate workqueue";
3843                 r = -ENOMEM;
3844                 goto bad;
3845         }
3846
3847         ic->offload_wq = alloc_workqueue("dm-integrity-offload", WQ_MEM_RECLAIM,
3848                                           METADATA_WORKQUEUE_MAX_ACTIVE);
3849         if (!ic->offload_wq) {
3850                 ti->error = "Cannot allocate workqueue";
3851                 r = -ENOMEM;
3852                 goto bad;
3853         }
3854
3855         ic->commit_wq = alloc_workqueue("dm-integrity-commit", WQ_MEM_RECLAIM, 1);
3856         if (!ic->commit_wq) {
3857                 ti->error = "Cannot allocate workqueue";
3858                 r = -ENOMEM;
3859                 goto bad;
3860         }
3861         INIT_WORK(&ic->commit_work, integrity_commit);
3862
3863         if (ic->mode == 'J' || ic->mode == 'B') {
3864                 ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1);
3865                 if (!ic->writer_wq) {
3866                         ti->error = "Cannot allocate workqueue";
3867                         r = -ENOMEM;
3868                         goto bad;
3869                 }
3870                 INIT_WORK(&ic->writer_work, integrity_writer);
3871         }
3872
3873         ic->sb = alloc_pages_exact(SB_SECTORS << SECTOR_SHIFT, GFP_KERNEL);
3874         if (!ic->sb) {
3875                 r = -ENOMEM;
3876                 ti->error = "Cannot allocate superblock area";
3877                 goto bad;
3878         }
3879
3880         r = sync_rw_sb(ic, REQ_OP_READ, 0);
3881         if (r) {
3882                 ti->error = "Error reading superblock";
3883                 goto bad;
3884         }
3885         should_write_sb = false;
3886         if (memcmp(ic->sb->magic, SB_MAGIC, 8)) {
3887                 if (ic->mode != 'R') {
3888                         if (memchr_inv(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT)) {
3889                                 r = -EINVAL;
3890                                 ti->error = "The device is not initialized";
3891                                 goto bad;
3892                         }
3893                 }
3894
3895                 r = initialize_superblock(ic, journal_sectors, interleave_sectors);
3896                 if (r) {
3897                         ti->error = "Could not initialize superblock";
3898                         goto bad;
3899                 }
3900                 if (ic->mode != 'R')
3901                         should_write_sb = true;
3902         }
3903
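             /*
              * Validate the (possibly just-initialized) superblock against the
              * constructor parameters: version, tag size, block size, journal
              * size, interleave layout and journal MAC presence must all agree.
              */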
3904         if (!ic->sb->version || ic->sb->version > SB_VERSION_4) {
3905                 r = -EINVAL;
3906                 ti->error = "Unknown version";
3907                 goto bad;
3908         }
3909         if (le16_to_cpu(ic->sb->integrity_tag_size) != ic->tag_size) {
3910                 r = -EINVAL;
3911                 ti->error = "Tag size doesn't match the information in superblock";
3912                 goto bad;
3913         }
3914         if (ic->sb->log2_sectors_per_block != __ffs(ic->sectors_per_block)) {
3915                 r = -EINVAL;
3916                 ti->error = "Block size doesn't match the information in superblock";
3917                 goto bad;
3918         }
3919         if (!le32_to_cpu(ic->sb->journal_sections)) {
3920                 r = -EINVAL;
3921                 ti->error = "Corrupted superblock, journal_sections is 0";
3922                 goto bad;
3923         }
3924         /* make sure that ti->max_io_len doesn't overflow */
3925         if (!ic->meta_dev) {
3926                 if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS ||
3927                     ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) {
3928                         r = -EINVAL;
3929                         ti->error = "Invalid interleave_sectors in the superblock";
3930                         goto bad;
3931                 }
3932         } else {
3933                 if (ic->sb->log2_interleave_sectors) {
3934                         r = -EINVAL;
3935                         ti->error = "Invalid interleave_sectors in the superblock";
3936                         goto bad;
3937                 }
3938         }
3939         ic->provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors);
3940         if (ic->provided_data_sectors != le64_to_cpu(ic->sb->provided_data_sectors)) {
3941                 /* test for overflow */
3942                 r = -EINVAL;
3943                 ti->error = "The superblock has 64-bit device size, but the kernel was compiled with 32-bit sectors";
3944                 goto bad;
3945         }
3946         if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) {
3947                 r = -EINVAL;
3948                 ti->error = "Journal mac mismatch";
3949                 goto bad;
3950         }
3951
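             /*
              * calculate_device_limits() checks that the metadata and data fit
              * on the device(s).  With a separate metadata device, retry with
              * progressively smaller buffers (down to 8 sectors) before
              * declaring the device too small.
              */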
3952 try_smaller_buffer:
3953         r = calculate_device_limits(ic);
3954         if (r) {
3955                 if (ic->meta_dev) {
3956                         if (ic->log2_buffer_sectors > 3) {
3957                                 ic->log2_buffer_sectors--;
3958                                 goto try_smaller_buffer;
3959                         }
3960                 }
3961                 ti->error = "The device is too small";
3962                 goto bad;
3963         }
3964
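             /*
              * Bitmap mode granularity: start from the requested (or default)
              * number of sectors per bitmap bit, never finer than one block,
              * and coarsen it until the whole bitmap fits into the journal
              * area.
              */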
3965         if (log2_sectors_per_bitmap_bit < 0)
3966                 log2_sectors_per_bitmap_bit = __fls(DEFAULT_SECTORS_PER_BITMAP_BIT);
3967         if (log2_sectors_per_bitmap_bit < ic->sb->log2_sectors_per_block)
3968                 log2_sectors_per_bitmap_bit = ic->sb->log2_sectors_per_block;
3969
3970         bits_in_journal = ((__u64)ic->journal_section_sectors * ic->journal_sections) << (SECTOR_SHIFT + 3);
3971         if (bits_in_journal > UINT_MAX)
3972                 bits_in_journal = UINT_MAX;
3973         while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit)
3974                 log2_sectors_per_bitmap_bit++;
3975
3976         log2_blocks_per_bitmap_bit = log2_sectors_per_bitmap_bit - ic->sb->log2_sectors_per_block;
3977         ic->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit;
3978         if (should_write_sb) {
3979                 ic->sb->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit;
3980         }
3981         n_bitmap_bits = ((ic->provided_data_sectors >> ic->sb->log2_sectors_per_block)
3982                                 + (((sector_t)1 << log2_blocks_per_bitmap_bit) - 1)) >> log2_blocks_per_bitmap_bit;
3983         ic->n_bitmap_blocks = DIV_ROUND_UP(n_bitmap_bits, BITMAP_BLOCK_SIZE * 8);
3984
3985         if (!ic->meta_dev)
3986                 ic->log2_buffer_sectors = min(ic->log2_buffer_sectors, (__u8)__ffs(ic->metadata_run));
3987
3988         if (ti->len > ic->provided_data_sectors) {
3989                 r = -EINVAL;
3990                 ti->error = "Not enough provided sectors for requested mapping size";
3991                 goto bad;
3992         }
3993
3994
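             /*
              * Convert journal_watermark (a percentage) into the absolute
              * free-space threshold below which an early journal commit is
              * triggered; adding 50 before the division rounds to nearest.
              */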
3995         threshold = (__u64)ic->journal_entries * (100 - journal_watermark);
3996         threshold += 50;
3997         do_div(threshold, 100);
3998         ic->free_sectors_threshold = threshold;
3999
4000         DEBUG_print("initialized:\n");
4001         DEBUG_print("   integrity_tag_size %u\n", le16_to_cpu(ic->sb->integrity_tag_size));
4002         DEBUG_print("   journal_entry_size %u\n", ic->journal_entry_size);
4003         DEBUG_print("   journal_entries_per_sector %u\n", ic->journal_entries_per_sector);
4004         DEBUG_print("   journal_section_entries %u\n", ic->journal_section_entries);
4005         DEBUG_print("   journal_section_sectors %u\n", ic->journal_section_sectors);
4006         DEBUG_print("   journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections));
4007         DEBUG_print("   journal_entries %u\n", ic->journal_entries);
4008         DEBUG_print("   log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors);
4009         DEBUG_print("   data_device_sectors 0x%llx\n", i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT);
4010         DEBUG_print("   initial_sectors 0x%x\n", ic->initial_sectors);
4011         DEBUG_print("   metadata_run 0x%x\n", ic->metadata_run);
4012         DEBUG_print("   log2_metadata_run %d\n", ic->log2_metadata_run);
4013         DEBUG_print("   provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors,
4014                     (unsigned long long)ic->provided_data_sectors);
4015         DEBUG_print("   log2_buffer_sectors %u\n", ic->log2_buffer_sectors);
4016         DEBUG_print("   bits_in_journal %llu\n", (unsigned long long)bits_in_journal);
4017
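             /*
              * If "recalculate" was requested and a recalculation is not
              * already in progress, mark the superblock so integrity_recalc()
              * starts over from sector 0.
              */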
4018         if (ic->recalculate_flag && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) {
4019                 ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
4020                 ic->sb->recalc_sector = cpu_to_le64(0);
4021         }
4022
4023         if (ic->internal_hash) {
4024                 ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1);
4027                 if (!ic->recalc_wq) {
4026                         ti->error = "Cannot allocate workqueue";
4027                         r = -ENOMEM;
4028                         goto bad;
4029                 }
4030                 INIT_WORK(&ic->recalc_work, integrity_recalc);
4031                 ic->recalc_buffer = vmalloc(RECALC_SECTORS << SECTOR_SHIFT);
4032                 if (!ic->recalc_buffer) {
4033                         ti->error = "Cannot allocate buffer for recalculating";
4034                         r = -ENOMEM;
4035                         goto bad;
4036                 }
4037                 ic->recalc_tags = kvmalloc_array(RECALC_SECTORS >> ic->sb->log2_sectors_per_block,
4038                                                  ic->tag_size, GFP_KERNEL);
4039                 if (!ic->recalc_tags) {
4040                         ti->error = "Cannot allocate tags for recalculating";
4041                         r = -ENOMEM;
4042                         goto bad;
4043                 }
4044         }
4045
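             /*
              * Create the dm-bufio client used for tag-area I/O; the buffer
              * size follows log2_buffer_sectors and the sector offset skips
              * the superblock and journal at the start of the metadata area.
              */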
4046         ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev,
4047                         1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL);
4048         if (IS_ERR(ic->bufio)) {
4049                 r = PTR_ERR(ic->bufio);
4050                 ti->error = "Cannot initialize dm-bufio";
4051                 ic->bufio = NULL;
4052                 goto bad;
4053         }
4054         dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors);
4055
4056         if (ic->mode != 'R') {
4057                 r = create_journal(ic, &ti->error);
4058                 if (r)
4059                         goto bad;
4060
4061         }
4062
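             /*
              * Bitmap mode: allocate the in-memory recalc and may-write
              * bitmaps and one bitmap_block_status per BITMAP_BLOCK_SIZE
              * block, each with its own bio queue, work item and a pointer
              * into the corresponding journal page (the journal pages hold
              * the in-memory copy of the on-disk bitmap in this mode).
              */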
4063         if (ic->mode == 'B') {
4064                 unsigned i;
4065                 unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE);
4066
4067                 ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
4068                 if (!ic->recalc_bitmap) {
4069                         r = -ENOMEM;
4070                         goto bad;
4071                 }
4072                 ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
4073                 if (!ic->may_write_bitmap) {
4074                         r = -ENOMEM;
4075                         goto bad;
4076                 }
4077                 ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL);
4078                 if (!ic->bbs) {
4079                         r = -ENOMEM;
4080                         goto bad;
4081                 }
4082                 INIT_DELAYED_WORK(&ic->bitmap_flush_work, bitmap_flush_work);
4083                 for (i = 0; i < ic->n_bitmap_blocks; i++) {
4084                         struct bitmap_block_status *bbs = &ic->bbs[i];
4085                         unsigned sector, pl_index, pl_offset;
4086
4087                         INIT_WORK(&bbs->work, bitmap_block_work);
4088                         bbs->ic = ic;
4089                         bbs->idx = i;
4090                         bio_list_init(&bbs->bio_queue);
4091                         spin_lock_init(&bbs->bio_queue_lock);
4092
4093                         sector = i * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT);
4094                         pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
4095                         pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
4096
4097                         bbs->bitmap = lowmem_page_address(ic->journal[pl_index].page) + pl_offset;
4098                 }
4099         }
4100
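             /*
              * Freshly formatted device: initialize the on-disk journal and
              * persist the new superblock with FUA before the target goes
              * live.
              */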
4101         if (should_write_sb) {
4104                 init_journal(ic, 0, ic->journal_sections, 0);
4105                 r = dm_integrity_failed(ic);
4106                 if (unlikely(r)) {
4107                         ti->error = "Error initializing journal";
4108                         goto bad;
4109                 }
4110                 r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
4111                 if (r) {
4112                         ti->error = "Error initializing superblock";
4113                         goto bad;
4114                 }
4115                 ic->just_formatted = true;
4116         }
4117
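             /*
              * Bound ti->max_io_len so that a single bio never crosses an
              * interleave unit and, in bitmap mode, never spans more sectors
              * than a single bitmap block describes.
              */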
4118         if (!ic->meta_dev) {
4119                 r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors);
4120                 if (r)
4121                         goto bad;
4122         }
4123         if (ic->mode == 'B') {
4124                 unsigned max_io_len = ((sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit) * (BITMAP_BLOCK_SIZE * 8);
4125                 if (!max_io_len)
4126                         max_io_len = 1U << 31;
4127                 DEBUG_print("max_io_len: old %u, new %u\n", ti->max_io_len, max_io_len);
4128                 if (!ti->max_io_len || ti->max_io_len > max_io_len) {
4129                         r = dm_set_target_max_io_len(ti, max_io_len);
4130                         if (r)
4131                                 goto bad;
4132                 }
4133         }
4134
4135         if (!ic->internal_hash)
4136                 dm_integrity_set(ti, ic);
4137
4138         ti->num_flush_bios = 1;
4139         ti->flush_supported = true;
4140
4141         return 0;
4142
4143 bad:
4144         dm_integrity_dtr(ti);
4145         return r;
4146 }
4147
4148 static void dm_integrity_dtr(struct dm_target *ti)
4149 {
4150         struct dm_integrity_c *ic = ti->private;
4151
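             /*
              * By the time the destructor runs, all in-flight ranges must have
              * completed and no bios may be waiting.
              */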
4152         BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
4153         BUG_ON(!list_empty(&ic->wait_list));
4154
4155         if (ic->metadata_wq)
4156                 destroy_workqueue(ic->metadata_wq);
4157         if (ic->wait_wq)
4158                 destroy_workqueue(ic->wait_wq);
4159         if (ic->offload_wq)
4160                 destroy_workqueue(ic->offload_wq);
4161         if (ic->commit_wq)
4162                 destroy_workqueue(ic->commit_wq);
4163         if (ic->writer_wq)
4164                 destroy_workqueue(ic->writer_wq);
4165         if (ic->recalc_wq)
4166                 destroy_workqueue(ic->recalc_wq);
4167         vfree(ic->recalc_buffer);
4168         kvfree(ic->recalc_tags);
4169         kvfree(ic->bbs);
4170         if (ic->bufio)
4171                 dm_bufio_client_destroy(ic->bufio);
4172         mempool_exit(&ic->journal_io_mempool);
4173         if (ic->io)
4174                 dm_io_client_destroy(ic->io);
4175         if (ic->dev)
4176                 dm_put_device(ti, ic->dev);
4177         if (ic->meta_dev)
4178                 dm_put_device(ti, ic->meta_dev);
4179         dm_integrity_free_page_list(ic->journal);
4180         dm_integrity_free_page_list(ic->journal_io);
4181         dm_integrity_free_page_list(ic->journal_xor);
4182         dm_integrity_free_page_list(ic->recalc_bitmap);
4183         dm_integrity_free_page_list(ic->may_write_bitmap);
4184         if (ic->journal_scatterlist)
4185                 dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist);
4186         if (ic->journal_io_scatterlist)
4187                 dm_integrity_free_journal_scatterlist(ic, ic->journal_io_scatterlist);
4188         if (ic->sk_requests) {
4189                 unsigned i;
4190
4191                 for (i = 0; i < ic->journal_sections; i++) {
4192                         struct skcipher_request *req = ic->sk_requests[i];
4193                         if (req) {
4194                                 kzfree(req->iv);
4195                                 skcipher_request_free(req);
4196                         }
4197                 }
4198                 kvfree(ic->sk_requests);
4199         }
4200         kvfree(ic->journal_tree);
4201         if (ic->sb)
4202                 free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT);
4203
4204         if (ic->internal_hash)
4205                 crypto_free_shash(ic->internal_hash);
4206         free_alg(&ic->internal_hash_alg);
4207
4208         if (ic->journal_crypt)
4209                 crypto_free_skcipher(ic->journal_crypt);
4210         free_alg(&ic->journal_crypt_alg);
4211
4212         if (ic->journal_mac)
4213                 crypto_free_shash(ic->journal_mac);
4214         free_alg(&ic->journal_mac_alg);
4215
4216         kfree(ic);
4217 }
4218
4219 static struct target_type integrity_target = {
4220         .name                   = "integrity",
4221         .version                = {1, 4, 0},
4222         .module                 = THIS_MODULE,
4223         .features               = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
4224         .ctr                    = dm_integrity_ctr,
4225         .dtr                    = dm_integrity_dtr,
4226         .map                    = dm_integrity_map,
4227         .postsuspend            = dm_integrity_postsuspend,
4228         .resume                 = dm_integrity_resume,
4229         .status                 = dm_integrity_status,
4230         .iterate_devices        = dm_integrity_iterate_devices,
4231         .io_hints               = dm_integrity_io_hints,
4232 };
4233
4234 static int __init dm_integrity_init(void)
4235 {
4236         int r;
4237
4238         journal_io_cache = kmem_cache_create("integrity_journal_io",
4239                                              sizeof(struct journal_io), 0, 0, NULL);
4240         if (!journal_io_cache) {
4241                 DMERR("can't allocate journal io cache");
4242                 return -ENOMEM;
4243         }
4244
4245         r = dm_register_target(&integrity_target);
4246
4247         if (r < 0)
4248                 DMERR("register failed %d", r);
4249
4250         return r;
4251 }
4252
4253 static void __exit dm_integrity_exit(void)
4254 {
4255         dm_unregister_target(&integrity_target);
4256         kmem_cache_destroy(journal_io_cache);
4257 }
4258
4259 module_init(dm_integrity_init);
4260 module_exit(dm_integrity_exit);
4261
4262 MODULE_AUTHOR("Milan Broz");
4263 MODULE_AUTHOR("Mikulas Patocka");
4264 MODULE_DESCRIPTION(DM_NAME " target for integrity tags extension");
4265 MODULE_LICENSE("GPL");