dm integrity: report provided data sectors in the status
[linux-2.6-microblaze.git] drivers/md/dm-integrity.c
1 /*
2  * Copyright (C) 2016-2017 Red Hat, Inc. All rights reserved.
3  * Copyright (C) 2016-2017 Milan Broz
4  * Copyright (C) 2016-2017 Mikulas Patocka
5  *
6  * This file is released under the GPL.
7  */
8
9 #include <linux/compiler.h>
10 #include <linux/module.h>
11 #include <linux/device-mapper.h>
12 #include <linux/dm-io.h>
13 #include <linux/vmalloc.h>
14 #include <linux/sort.h>
15 #include <linux/rbtree.h>
16 #include <linux/delay.h>
17 #include <linux/random.h>
18 #include <crypto/hash.h>
19 #include <crypto/skcipher.h>
20 #include <linux/async_tx.h>
21 #include <linux/dm-bufio.h>
22
23 #define DM_MSG_PREFIX "integrity"
24
25 #define DEFAULT_INTERLEAVE_SECTORS      32768
26 #define DEFAULT_JOURNAL_SIZE_FACTOR     7
27 #define DEFAULT_BUFFER_SECTORS          128
28 #define DEFAULT_JOURNAL_WATERMARK       50
29 #define DEFAULT_SYNC_MSEC               10000
30 #define DEFAULT_MAX_JOURNAL_SECTORS     131072
31 #define MIN_LOG2_INTERLEAVE_SECTORS     3
32 #define MAX_LOG2_INTERLEAVE_SECTORS     31
33 #define METADATA_WORKQUEUE_MAX_ACTIVE   16
34
35 /*
36  * Warning - DEBUG_PRINT prints security-sensitive data to the log,
37  * so it should not be enabled in the official kernel
38  */
39 //#define DEBUG_PRINT
40 //#define INTERNAL_VERIFY
41
42 /*
43  * On disk structures
44  */
45
46 #define SB_MAGIC                        "integrt"
47 #define SB_VERSION                      1
48 #define SB_SECTORS                      8
49 #define MAX_SECTORS_PER_BLOCK           8
50
51 struct superblock {
52         __u8 magic[8];
53         __u8 version;
54         __u8 log2_interleave_sectors;
55         __u16 integrity_tag_size;
56         __u32 journal_sections;
57         __u64 provided_data_sectors;    /* userspace uses this value */
58         __u32 flags;
59         __u8 log2_sectors_per_block;
60 };
61
62 #define SB_FLAG_HAVE_JOURNAL_MAC        0x1
63
64 #define JOURNAL_ENTRY_ROUNDUP           8
65
66 typedef __u64 commit_id_t;
67 #define JOURNAL_MAC_PER_SECTOR          8
68
69 struct journal_entry {
70         union {
71                 struct {
72                         __u32 sector_lo;
73                         __u32 sector_hi;
74                 } s;
75                 __u64 sector;
76         } u;
77         commit_id_t last_bytes[0];
78         /* __u8 tag[0]; */
79 };
80
81 #define journal_entry_tag(ic, je)               ((__u8 *)&(je)->last_bytes[(ic)->sectors_per_block])
82
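/*
 * On 32-bit builds the 64-bit sector number is stored as two 32-bit halves.
 * journal_entry_set_sector() writes sector_lo first, issues smp_wmb() and
 * only then stores sector_hi; on 64-bit builds the whole value is stored
 * with WRITE_ONCE() after the barrier.  sector_hi doubles as the entry
 * state (see the "unused"/"inprogress" macros below), so publishing it last
 * ensures a reader that sees a valid state also sees the rest of the entry.
 */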
83 #if BITS_PER_LONG == 64
84 #define journal_entry_set_sector(je, x)         do { smp_wmb(); WRITE_ONCE((je)->u.sector, cpu_to_le64(x)); } while (0)
85 #define journal_entry_get_sector(je)            le64_to_cpu((je)->u.sector)
86 #elif defined(CONFIG_LBDAF)
87 #define journal_entry_set_sector(je, x)         do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0)
88 #define journal_entry_get_sector(je)            le64_to_cpu((je)->u.sector)
89 #else
90 #define journal_entry_set_sector(je, x)         do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32(0)); } while (0)
91 #define journal_entry_get_sector(je)            le32_to_cpu((je)->u.s.sector_lo)
92 #endif
93 #define journal_entry_is_unused(je)             ((je)->u.s.sector_hi == cpu_to_le32(-1))
94 #define journal_entry_set_unused(je)            do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0)
95 #define journal_entry_is_inprogress(je)         ((je)->u.s.sector_hi == cpu_to_le32(-2))
96 #define journal_entry_set_inprogress(je)        do { ((je)->u.s.sector_hi = cpu_to_le32(-2)); } while (0)
97
98 #define JOURNAL_BLOCK_SECTORS           8
99 #define JOURNAL_SECTOR_DATA             ((1 << SECTOR_SHIFT) - sizeof(commit_id_t))
100 #define JOURNAL_MAC_SIZE                (JOURNAL_MAC_PER_SECTOR * JOURNAL_BLOCK_SECTORS)
101
102 struct journal_sector {
103         __u8 entries[JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR];
104         __u8 mac[JOURNAL_MAC_PER_SECTOR];
105         commit_id_t commit_id;
106 };
107
108 #define MAX_TAG_SIZE                    (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK]))
109
110 #define METADATA_PADDING_SECTORS        8
111
112 #define N_COMMIT_IDS                    4
113
114 static unsigned char prev_commit_seq(unsigned char seq)
115 {
116         return (seq + N_COMMIT_IDS - 1) % N_COMMIT_IDS;
117 }
118
119 static unsigned char next_commit_seq(unsigned char seq)
120 {
121         return (seq + 1) % N_COMMIT_IDS;
122 }
123
124 /*
125  * In-memory structures
126  */
127
128 struct journal_node {
129         struct rb_node node;
130         sector_t sector;
131 };
132
133 struct alg_spec {
134         char *alg_string;
135         char *key_string;
136         __u8 *key;
137         unsigned key_size;
138 };
139
140 struct dm_integrity_c {
141         struct dm_dev *dev;
142         unsigned tag_size;
143         __s8 log2_tag_size;
144         sector_t start;
145         mempool_t journal_io_mempool;
146         struct dm_io_client *io;
147         struct dm_bufio_client *bufio;
148         struct workqueue_struct *metadata_wq;
149         struct superblock *sb;
150         unsigned journal_pages;
151         struct page_list *journal;
152         struct page_list *journal_io;
153         struct page_list *journal_xor;
154
155         struct crypto_skcipher *journal_crypt;
156         struct scatterlist **journal_scatterlist;
157         struct scatterlist **journal_io_scatterlist;
158         struct skcipher_request **sk_requests;
159
160         struct crypto_shash *journal_mac;
161
162         struct journal_node *journal_tree;
163         struct rb_root journal_tree_root;
164
165         sector_t provided_data_sectors;
166
167         unsigned short journal_entry_size;
168         unsigned char journal_entries_per_sector;
169         unsigned char journal_section_entries;
170         unsigned short journal_section_sectors;
171         unsigned journal_sections;
172         unsigned journal_entries;
173         sector_t device_sectors;
174         unsigned initial_sectors;
175         unsigned metadata_run;
176         __s8 log2_metadata_run;
177         __u8 log2_buffer_sectors;
178         __u8 sectors_per_block;
179
180         unsigned char mode;
181         int suspending;
182
183         int failed;
184
185         struct crypto_shash *internal_hash;
186
187         /* these variables are locked with endio_wait.lock */
188         struct rb_root in_progress;
189         struct list_head wait_list;
190         wait_queue_head_t endio_wait;
191         struct workqueue_struct *wait_wq;
192
193         unsigned char commit_seq;
194         commit_id_t commit_ids[N_COMMIT_IDS];
195
196         unsigned committed_section;
197         unsigned n_committed_sections;
198
199         unsigned uncommitted_section;
200         unsigned n_uncommitted_sections;
201
202         unsigned free_section;
203         unsigned char free_section_entry;
204         unsigned free_sectors;
205
206         unsigned free_sectors_threshold;
207
208         struct workqueue_struct *commit_wq;
209         struct work_struct commit_work;
210
211         struct workqueue_struct *writer_wq;
212         struct work_struct writer_work;
213
214         struct bio_list flush_bio_list;
215
216         unsigned long autocommit_jiffies;
217         struct timer_list autocommit_timer;
218         unsigned autocommit_msec;
219
220         wait_queue_head_t copy_to_journal_wait;
221
222         struct completion crypto_backoff;
223
224         bool journal_uptodate;
225         bool just_formatted;
226
227         struct alg_spec internal_hash_alg;
228         struct alg_spec journal_crypt_alg;
229         struct alg_spec journal_mac_alg;
230
231         atomic64_t number_of_mismatches;
232 };
233
234 struct dm_integrity_range {
235         sector_t logical_sector;
236         unsigned n_sectors;
237         bool waiting;
238         union {
239                 struct rb_node node;
240                 struct {
241                         struct task_struct *task;
242                         struct list_head wait_entry;
243                 };
244         };
245 };
246
247 struct dm_integrity_io {
248         struct work_struct work;
249
250         struct dm_integrity_c *ic;
251         bool write;
252         bool fua;
253
254         struct dm_integrity_range range;
255
256         sector_t metadata_block;
257         unsigned metadata_offset;
258
259         atomic_t in_flight;
260         blk_status_t bi_status;
261
262         struct completion *completion;
263
264         struct gendisk *orig_bi_disk;
265         u8 orig_bi_partno;
266         bio_end_io_t *orig_bi_end_io;
267         struct bio_integrity_payload *orig_bi_integrity;
268         struct bvec_iter orig_bi_iter;
269 };
270
271 struct journal_completion {
272         struct dm_integrity_c *ic;
273         atomic_t in_flight;
274         struct completion comp;
275 };
276
277 struct journal_io {
278         struct dm_integrity_range range;
279         struct journal_completion *comp;
280 };
281
282 static struct kmem_cache *journal_io_cache;
283
284 #define JOURNAL_IO_MEMPOOL      32
285
286 #ifdef DEBUG_PRINT
287 #define DEBUG_print(x, ...)     printk(KERN_DEBUG x, ##__VA_ARGS__)
288 static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...)
289 {
290         va_list args;
291         va_start(args, msg);
292         vprintk(msg, args);
293         va_end(args);
294         if (len)
295                 pr_cont(":");
296         while (len) {
297                 pr_cont(" %02x", *bytes);
298                 bytes++;
299                 len--;
300         }
301         pr_cont("\n");
302 }
303 #define DEBUG_bytes(bytes, len, msg, ...)       __DEBUG_bytes(bytes, len, KERN_DEBUG msg, ##__VA_ARGS__)
304 #else
305 #define DEBUG_print(x, ...)                     do { } while (0)
306 #define DEBUG_bytes(bytes, len, msg, ...)       do { } while (0)
307 #endif
308
309 /*
310  * DM Integrity profile; protection is performed by the layer above (dm-crypt)
311  */
312 static const struct blk_integrity_profile dm_integrity_profile = {
313         .name                   = "DM-DIF-EXT-TAG",
314         .generate_fn            = NULL,
315         .verify_fn              = NULL,
316 };
317
318 static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map);
319 static void integrity_bio_wait(struct work_struct *w);
320 static void dm_integrity_dtr(struct dm_target *ti);
321
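/*
 * dm_integrity_io_error() records only the first error in ic->failed (the
 * cmpxchg() means later errors neither overwrite it nor get logged again);
 * once set, dm_integrity_failed() makes subsequent I/O fail.  -EILSEQ also
 * increments ic->number_of_mismatches.
 */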
322 static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, int err)
323 {
324         if (err == -EILSEQ)
325                 atomic64_inc(&ic->number_of_mismatches);
326         if (!cmpxchg(&ic->failed, 0, err))
327                 DMERR("Error on %s: %d", msg, err);
328 }
329
330 static int dm_integrity_failed(struct dm_integrity_c *ic)
331 {
332         return READ_ONCE(ic->failed);
333 }
334
335 static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i,
336                                           unsigned j, unsigned char seq)
337 {
338         /*
339          * XOR the commit ID with the section and sector numbers, so that
340          * writing a piece of the journal to the wrong place is detected.
341          */
342         return ic->commit_ids[seq] ^ cpu_to_le64(((__u64)i << 32) ^ j);
343 }
344
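/*
 * Data and metadata are interleaved on the underlying device: it is divided
 * into "areas" of 2^log2_interleave_sectors data sectors, each preceded by a
 * run of metadata (tag) sectors.  get_area_and_offset() splits a logical
 * data sector into an area number and an offset within that area;
 * get_metadata_sector_and_offset() and get_data_sector() map the pair to the
 * physical positions of the tag bytes and of the data.
 *
 * A worked example with the default 2^15 = 32768 sectors per area: logical
 * sector 40000 gives area = 40000 >> 15 = 1, offset = 40000 & 32767 = 7232,
 * and the data ends up at initial_sectors + (1 << 15) + 2 * metadata_run
 * + 7232 (see get_data_sector() below).
 */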
345 static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector,
346                                 sector_t *area, sector_t *offset)
347 {
348         __u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors;
349
350         *area = data_sector >> log2_interleave_sectors;
351         *offset = (unsigned)data_sector & ((1U << log2_interleave_sectors) - 1);
352 }
353
354 #define sector_to_block(ic, n)                                          \
355 do {                                                                    \
356         BUG_ON((n) & (unsigned)((ic)->sectors_per_block - 1));          \
357         (n) >>= (ic)->sb->log2_sectors_per_block;                       \
358 } while (0)
359
360 static __u64 get_metadata_sector_and_offset(struct dm_integrity_c *ic, sector_t area,
361                                             sector_t offset, unsigned *metadata_offset)
362 {
363         __u64 ms;
364         unsigned mo;
365
366         ms = area << ic->sb->log2_interleave_sectors;
367         if (likely(ic->log2_metadata_run >= 0))
368                 ms += area << ic->log2_metadata_run;
369         else
370                 ms += area * ic->metadata_run;
371         ms >>= ic->log2_buffer_sectors;
372
373         sector_to_block(ic, offset);
374
375         if (likely(ic->log2_tag_size >= 0)) {
376                 ms += offset >> (SECTOR_SHIFT + ic->log2_buffer_sectors - ic->log2_tag_size);
377                 mo = (offset << ic->log2_tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1);
378         } else {
379                 ms += (__u64)offset * ic->tag_size >> (SECTOR_SHIFT + ic->log2_buffer_sectors);
380                 mo = (offset * ic->tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1);
381         }
382         *metadata_offset = mo;
383         return ms;
384 }
385
386 static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector_t offset)
387 {
388         sector_t result;
389
390         result = area << ic->sb->log2_interleave_sectors;
391         if (likely(ic->log2_metadata_run >= 0))
392                 result += (area + 1) << ic->log2_metadata_run;
393         else
394                 result += (area + 1) * ic->metadata_run;
395
396         result += (sector_t)ic->initial_sectors + offset;
397         return result;
398 }
399
400 static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr)
401 {
402         if (unlikely(*sec_ptr >= ic->journal_sections))
403                 *sec_ptr -= ic->journal_sections;
404 }
405
406 static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags)
407 {
408         struct dm_io_request io_req;
409         struct dm_io_region io_loc;
410
411         io_req.bi_op = op;
412         io_req.bi_op_flags = op_flags;
413         io_req.mem.type = DM_IO_KMEM;
414         io_req.mem.ptr.addr = ic->sb;
415         io_req.notify.fn = NULL;
416         io_req.client = ic->io;
417         io_loc.bdev = ic->dev->bdev;
418         io_loc.sector = ic->start;
419         io_loc.count = SB_SECTORS;
420
421         return dm_io(&io_req, 1, &io_loc, NULL);
422 }
423
424 static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset,
425                                  bool e, const char *function)
426 {
427 #if defined(CONFIG_DM_DEBUG) || defined(INTERNAL_VERIFY)
428         unsigned limit = e ? ic->journal_section_entries : ic->journal_section_sectors;
429
430         if (unlikely(section >= ic->journal_sections) ||
431             unlikely(offset >= limit)) {
432                 printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n",
433                         function, section, offset, ic->journal_sections, limit);
434                 BUG();
435         }
436 #endif
437 }
438
439 static void page_list_location(struct dm_integrity_c *ic, unsigned section, unsigned offset,
440                                unsigned *pl_index, unsigned *pl_offset)
441 {
442         unsigned sector;
443
444         access_journal_check(ic, section, offset, false, "page_list_location");
445
446         sector = section * ic->journal_section_sectors + offset;
447
448         *pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
449         *pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
450 }
451
452 static struct journal_sector *access_page_list(struct dm_integrity_c *ic, struct page_list *pl,
453                                                unsigned section, unsigned offset, unsigned *n_sectors)
454 {
455         unsigned pl_index, pl_offset;
456         char *va;
457
458         page_list_location(ic, section, offset, &pl_index, &pl_offset);
459
460         if (n_sectors)
461                 *n_sectors = (PAGE_SIZE - pl_offset) >> SECTOR_SHIFT;
462
463         va = lowmem_page_address(pl[pl_index].page);
464
465         return (struct journal_sector *)(va + pl_offset);
466 }
467
468 static struct journal_sector *access_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset)
469 {
470         return access_page_list(ic, ic->journal, section, offset, NULL);
471 }
472
473 static struct journal_entry *access_journal_entry(struct dm_integrity_c *ic, unsigned section, unsigned n)
474 {
475         unsigned rel_sector, offset;
476         struct journal_sector *js;
477
478         access_journal_check(ic, section, n, true, "access_journal_entry");
479
480         rel_sector = n % JOURNAL_BLOCK_SECTORS;
481         offset = n / JOURNAL_BLOCK_SECTORS;
482
483         js = access_journal(ic, section, rel_sector);
484         return (struct journal_entry *)((char *)js + offset * ic->journal_entry_size);
485 }
486
487 static struct journal_sector *access_journal_data(struct dm_integrity_c *ic, unsigned section, unsigned n)
488 {
489         n <<= ic->sb->log2_sectors_per_block;
490
491         n += JOURNAL_BLOCK_SECTORS;
492
493         access_journal_check(ic, section, n, false, "access_journal_data");
494
495         return access_journal(ic, section, n);
496 }
497
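/*
 * section_mac() computes the optional journal MAC for one section.  The MAC
 * covers only the sector numbers of the section's journal entries, and
 * rw_section_mac() below spreads the result over the section's sectors,
 * JOURNAL_MAC_PER_SECTOR bytes in each js->mac field (or verifies them on
 * the read path).
 */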
498 static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result[JOURNAL_MAC_SIZE])
499 {
500         SHASH_DESC_ON_STACK(desc, ic->journal_mac);
501         int r;
502         unsigned j, size;
503
504         desc->tfm = ic->journal_mac;
505         desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
506
507         r = crypto_shash_init(desc);
508         if (unlikely(r)) {
509                 dm_integrity_io_error(ic, "crypto_shash_init", r);
510                 goto err;
511         }
512
513         for (j = 0; j < ic->journal_section_entries; j++) {
514                 struct journal_entry *je = access_journal_entry(ic, section, j);
515                 r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof je->u.sector);
516                 if (unlikely(r)) {
517                         dm_integrity_io_error(ic, "crypto_shash_update", r);
518                         goto err;
519                 }
520         }
521
522         size = crypto_shash_digestsize(ic->journal_mac);
523
524         if (likely(size <= JOURNAL_MAC_SIZE)) {
525                 r = crypto_shash_final(desc, result);
526                 if (unlikely(r)) {
527                         dm_integrity_io_error(ic, "crypto_shash_final", r);
528                         goto err;
529                 }
530                 memset(result + size, 0, JOURNAL_MAC_SIZE - size);
531         } else {
532                 __u8 digest[size];
533                 r = crypto_shash_final(desc, digest);
534                 if (unlikely(r)) {
535                         dm_integrity_io_error(ic, "crypto_shash_final", r);
536                         goto err;
537                 }
538                 memcpy(result, digest, JOURNAL_MAC_SIZE);
539         }
540
541         return;
542 err:
543         memset(result, 0, JOURNAL_MAC_SIZE);
544 }
545
546 static void rw_section_mac(struct dm_integrity_c *ic, unsigned section, bool wr)
547 {
548         __u8 result[JOURNAL_MAC_SIZE];
549         unsigned j;
550
551         if (!ic->journal_mac)
552                 return;
553
554         section_mac(ic, section, result);
555
556         for (j = 0; j < JOURNAL_BLOCK_SECTORS; j++) {
557                 struct journal_sector *js = access_journal(ic, section, j);
558
559                 if (likely(wr))
560                         memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR);
561                 else {
562                         if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR))
563                                 dm_integrity_io_error(ic, "journal mac", -EILSEQ);
564                 }
565         }
566 }
567
568 static void complete_journal_op(void *context)
569 {
570         struct journal_completion *comp = context;
571         BUG_ON(!atomic_read(&comp->in_flight));
572         if (likely(atomic_dec_and_test(&comp->in_flight)))
573                 complete(&comp->comp);
574 }
575
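/*
 * xor_journal() implements journal "encryption" as an XOR with the
 * pre-generated pages in ic->journal_xor (presumably a keystream prepared
 * when the journal crypt was set up), offloaded through the async_tx XOR
 * API.  On the encrypt path rw_section_mac() is invoked as the loop reaches
 * each new section, so the MACs are in place before the section is
 * transformed.
 */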
576 static void xor_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
577                         unsigned n_sections, struct journal_completion *comp)
578 {
579         struct async_submit_ctl submit;
580         size_t n_bytes = (size_t)(n_sections * ic->journal_section_sectors) << SECTOR_SHIFT;
581         unsigned pl_index, pl_offset, section_index;
582         struct page_list *source_pl, *target_pl;
583
584         if (likely(encrypt)) {
585                 source_pl = ic->journal;
586                 target_pl = ic->journal_io;
587         } else {
588                 source_pl = ic->journal_io;
589                 target_pl = ic->journal;
590         }
591
592         page_list_location(ic, section, 0, &pl_index, &pl_offset);
593
594         atomic_add(roundup(pl_offset + n_bytes, PAGE_SIZE) >> PAGE_SHIFT, &comp->in_flight);
595
596         init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, complete_journal_op, comp, NULL);
597
598         section_index = pl_index;
599
600         do {
601                 size_t this_step;
602                 struct page *src_pages[2];
603                 struct page *dst_page;
604
605                 while (unlikely(pl_index == section_index)) {
606                         unsigned dummy;
607                         if (likely(encrypt))
608                                 rw_section_mac(ic, section, true);
609                         section++;
610                         n_sections--;
611                         if (!n_sections)
612                                 break;
613                         page_list_location(ic, section, 0, &section_index, &dummy);
614                 }
615
616                 this_step = min(n_bytes, (size_t)PAGE_SIZE - pl_offset);
617                 dst_page = target_pl[pl_index].page;
618                 src_pages[0] = source_pl[pl_index].page;
619                 src_pages[1] = ic->journal_xor[pl_index].page;
620
621                 async_xor(dst_page, src_pages, pl_offset, 2, this_step, &submit);
622
623                 pl_index++;
624                 pl_offset = 0;
625                 n_bytes -= this_step;
626         } while (n_bytes);
627
628         BUG_ON(n_sections);
629
630         async_tx_issue_pending_all();
631 }
632
633 static void complete_journal_encrypt(struct crypto_async_request *req, int err)
634 {
635         struct journal_completion *comp = req->data;
636         if (unlikely(err)) {
637                 if (likely(err == -EINPROGRESS)) {
638                         complete(&comp->ic->crypto_backoff);
639                         return;
640                 }
641                 dm_integrity_io_error(comp->ic, "asynchronous encrypt", err);
642         }
643         complete_journal_op(comp);
644 }
645
646 static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp)
647 {
648         int r;
649         skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
650                                       complete_journal_encrypt, comp);
651         if (likely(encrypt))
652                 r = crypto_skcipher_encrypt(req);
653         else
654                 r = crypto_skcipher_decrypt(req);
655         if (likely(!r))
656                 return false;
657         if (likely(r == -EINPROGRESS))
658                 return true;
659         if (likely(r == -EBUSY)) {
660                 wait_for_completion(&comp->ic->crypto_backoff);
661                 reinit_completion(&comp->ic->crypto_backoff);
662                 return true;
663         }
664         dm_integrity_io_error(comp->ic, "encrypt", r);
665         return false;
666 }
667
668 static void crypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
669                           unsigned n_sections, struct journal_completion *comp)
670 {
671         struct scatterlist **source_sg;
672         struct scatterlist **target_sg;
673
674         atomic_add(2, &comp->in_flight);
675
676         if (likely(encrypt)) {
677                 source_sg = ic->journal_scatterlist;
678                 target_sg = ic->journal_io_scatterlist;
679         } else {
680                 source_sg = ic->journal_io_scatterlist;
681                 target_sg = ic->journal_scatterlist;
682         }
683
684         do {
685                 struct skcipher_request *req;
686                 unsigned ivsize;
687                 char *iv;
688
689                 if (likely(encrypt))
690                         rw_section_mac(ic, section, true);
691
692                 req = ic->sk_requests[section];
693                 ivsize = crypto_skcipher_ivsize(ic->journal_crypt);
694                 iv = req->iv;
695
696                 memcpy(iv, iv + ivsize, ivsize);
697
698                 req->src = source_sg[section];
699                 req->dst = target_sg[section];
700
701                 if (unlikely(do_crypt(encrypt, req, comp)))
702                         atomic_inc(&comp->in_flight);
703
704                 section++;
705                 n_sections--;
706         } while (n_sections);
707
708         atomic_dec(&comp->in_flight);
709         complete_journal_op(comp);
710 }
711
712 static void encrypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
713                             unsigned n_sections, struct journal_completion *comp)
714 {
715         if (ic->journal_xor)
716                 return xor_journal(ic, encrypt, section, n_sections, comp);
717         else
718                 return crypt_journal(ic, encrypt, section, n_sections, comp);
719 }
720
721 static void complete_journal_io(unsigned long error, void *context)
722 {
723         struct journal_completion *comp = context;
724         if (unlikely(error != 0))
725                 dm_integrity_io_error(comp->ic, "writing journal", -EIO);
726         complete_journal_op(comp);
727 }
728
729 static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section,
730                        unsigned n_sections, struct journal_completion *comp)
731 {
732         struct dm_io_request io_req;
733         struct dm_io_region io_loc;
734         unsigned sector, n_sectors, pl_index, pl_offset;
735         int r;
736
737         if (unlikely(dm_integrity_failed(ic))) {
738                 if (comp)
739                         complete_journal_io(-1UL, comp);
740                 return;
741         }
742
743         sector = section * ic->journal_section_sectors;
744         n_sectors = n_sections * ic->journal_section_sectors;
745
746         pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
747         pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
748
749         io_req.bi_op = op;
750         io_req.bi_op_flags = op_flags;
751         io_req.mem.type = DM_IO_PAGE_LIST;
752         if (ic->journal_io)
753                 io_req.mem.ptr.pl = &ic->journal_io[pl_index];
754         else
755                 io_req.mem.ptr.pl = &ic->journal[pl_index];
756         io_req.mem.offset = pl_offset;
757         if (likely(comp != NULL)) {
758                 io_req.notify.fn = complete_journal_io;
759                 io_req.notify.context = comp;
760         } else {
761                 io_req.notify.fn = NULL;
762         }
763         io_req.client = ic->io;
764         io_loc.bdev = ic->dev->bdev;
765         io_loc.sector = ic->start + SB_SECTORS + sector;
766         io_loc.count = n_sectors;
767
768         r = dm_io(&io_req, 1, &io_loc, NULL);
769         if (unlikely(r)) {
770                 dm_integrity_io_error(ic, op == REQ_OP_READ ? "reading journal" : "writing journal", r);
771                 if (comp) {
772                         WARN_ONCE(1, "asynchronous dm_io failed: %d", r);
773                         complete_journal_io(-1UL, comp);
774                 }
775         }
776 }
777
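/*
 * write_journal() writes 'commit_sections' sections starting at
 * 'commit_start' with FUA.  If the range wraps past the end of the journal
 * it is split into two I/Os; with journal encryption the two halves use
 * separate completions so that encrypting the wrapped part can overlap the
 * write of the first part.
 */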
778 static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections)
779 {
780         struct journal_completion io_comp;
781         struct journal_completion crypt_comp_1;
782         struct journal_completion crypt_comp_2;
783         unsigned i;
784
785         io_comp.ic = ic;
786         init_completion(&io_comp.comp);
787
788         if (commit_start + commit_sections <= ic->journal_sections) {
789                 io_comp.in_flight = (atomic_t)ATOMIC_INIT(1);
790                 if (ic->journal_io) {
791                         crypt_comp_1.ic = ic;
792                         init_completion(&crypt_comp_1.comp);
793                         crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
794                         encrypt_journal(ic, true, commit_start, commit_sections, &crypt_comp_1);
795                         wait_for_completion_io(&crypt_comp_1.comp);
796                 } else {
797                         for (i = 0; i < commit_sections; i++)
798                                 rw_section_mac(ic, commit_start + i, true);
799                 }
800                 rw_journal(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, commit_start,
801                            commit_sections, &io_comp);
802         } else {
803                 unsigned to_end;
804                 io_comp.in_flight = (atomic_t)ATOMIC_INIT(2);
805                 to_end = ic->journal_sections - commit_start;
806                 if (ic->journal_io) {
807                         crypt_comp_1.ic = ic;
808                         init_completion(&crypt_comp_1.comp);
809                         crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
810                         encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1);
811                         if (try_wait_for_completion(&crypt_comp_1.comp)) {
812                                 rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
813                                 reinit_completion(&crypt_comp_1.comp);
814                                 crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
815                                 encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1);
816                                 wait_for_completion_io(&crypt_comp_1.comp);
817                         } else {
818                                 crypt_comp_2.ic = ic;
819                                 init_completion(&crypt_comp_2.comp);
820                                 crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0);
821                                 encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2);
822                                 wait_for_completion_io(&crypt_comp_1.comp);
823                                 rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
824                                 wait_for_completion_io(&crypt_comp_2.comp);
825                         }
826                 } else {
827                         for (i = 0; i < to_end; i++)
828                                 rw_section_mac(ic, commit_start + i, true);
829                         rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
830                         for (i = 0; i < commit_sections - to_end; i++)
831                                 rw_section_mac(ic, i, true);
832                 }
833                 rw_journal(ic, REQ_OP_WRITE, REQ_FUA, 0, commit_sections - to_end, &io_comp);
834         }
835
836         wait_for_completion_io(&io_comp.comp);
837 }
838
839 static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset,
840                               unsigned n_sectors, sector_t target, io_notify_fn fn, void *data)
841 {
842         struct dm_io_request io_req;
843         struct dm_io_region io_loc;
844         int r;
845         unsigned sector, pl_index, pl_offset;
846
847         BUG_ON((target | n_sectors | offset) & (unsigned)(ic->sectors_per_block - 1));
848
849         if (unlikely(dm_integrity_failed(ic))) {
850                 fn(-1UL, data);
851                 return;
852         }
853
854         sector = section * ic->journal_section_sectors + JOURNAL_BLOCK_SECTORS + offset;
855
856         pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
857         pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
858
859         io_req.bi_op = REQ_OP_WRITE;
860         io_req.bi_op_flags = 0;
861         io_req.mem.type = DM_IO_PAGE_LIST;
862         io_req.mem.ptr.pl = &ic->journal[pl_index];
863         io_req.mem.offset = pl_offset;
864         io_req.notify.fn = fn;
865         io_req.notify.context = data;
866         io_req.client = ic->io;
867         io_loc.bdev = ic->dev->bdev;
868         io_loc.sector = ic->start + target;
869         io_loc.count = n_sectors;
870
871         r = dm_io(&io_req, 1, &io_loc, NULL);
872         if (unlikely(r)) {
873                 WARN_ONCE(1, "asynchronous dm_io failed: %d", r);
874                 fn(-1UL, data);
875         }
876 }
877
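/*
 * Range locking: each bio in flight owns an exclusive range of logical
 * sectors.  Active ranges are kept in the ic->in_progress rbtree; a range
 * that overlaps an active one is parked on ic->wait_list and its task is
 * woken from remove_range_unlocked() once the conflicting range is removed.
 * Everything here runs under endio_wait.lock.
 */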
878 static bool ranges_overlap(struct dm_integrity_range *range1, struct dm_integrity_range *range2)
879 {
880         return range1->logical_sector < range2->logical_sector + range2->n_sectors &&
881                range1->logical_sector + range1->n_sectors > range2->logical_sector;
882 }
883
884 static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range, bool check_waiting)
885 {
886         struct rb_node **n = &ic->in_progress.rb_node;
887         struct rb_node *parent;
888
889         BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned)(ic->sectors_per_block - 1));
890
891         if (likely(check_waiting)) {
892                 struct dm_integrity_range *range;
893                 list_for_each_entry(range, &ic->wait_list, wait_entry) {
894                         if (unlikely(ranges_overlap(range, new_range)))
895                                 return false;
896                 }
897         }
898
899         parent = NULL;
900
901         while (*n) {
902                 struct dm_integrity_range *range = container_of(*n, struct dm_integrity_range, node);
903
904                 parent = *n;
905                 if (new_range->logical_sector + new_range->n_sectors <= range->logical_sector) {
906                         n = &range->node.rb_left;
907                 } else if (new_range->logical_sector >= range->logical_sector + range->n_sectors) {
908                         n = &range->node.rb_right;
909                 } else {
910                         return false;
911                 }
912         }
913
914         rb_link_node(&new_range->node, parent, n);
915         rb_insert_color(&new_range->node, &ic->in_progress);
916
917         return true;
918 }
919
920 static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range)
921 {
922         rb_erase(&range->node, &ic->in_progress);
923         while (unlikely(!list_empty(&ic->wait_list))) {
924                 struct dm_integrity_range *last_range =
925                         list_first_entry(&ic->wait_list, struct dm_integrity_range, wait_entry);
926                 struct task_struct *last_range_task;
927                 if (!ranges_overlap(range, last_range))
928                         break;
929                 last_range_task = last_range->task;
930                 list_del(&last_range->wait_entry);
931                 if (!add_new_range(ic, last_range, false)) {
932                         last_range->task = last_range_task;
933                         list_add(&last_range->wait_entry, &ic->wait_list);
934                         break;
935                 }
936                 last_range->waiting = false;
937                 wake_up_process(last_range_task);
938         }
939 }
940
941 static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range)
942 {
943         unsigned long flags;
944
945         spin_lock_irqsave(&ic->endio_wait.lock, flags);
946         remove_range_unlocked(ic, range);
947         spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
948 }
949
950 static void wait_and_add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
951 {
952         new_range->waiting = true;
953         list_add_tail(&new_range->wait_entry, &ic->wait_list);
954         new_range->task = current;
955         do {
956                 __set_current_state(TASK_UNINTERRUPTIBLE);
957                 spin_unlock_irq(&ic->endio_wait.lock);
958                 io_schedule();
959                 spin_lock_irq(&ic->endio_wait.lock);
960         } while (unlikely(new_range->waiting));
961 }
962
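/*
 * ic->journal_tree is an array with one journal_node per journal entry;
 * nodes of entries that currently describe live data are linked into the
 * journal_tree_root rbtree, keyed by logical sector.  find_journal_node()
 * returns the entry index (node - ic->journal_tree) for a sector, which is
 * how reads and overwrites locate data that so far exists only in the
 * journal.  A sector value of (sector_t)-1 marks an unused node.
 */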
963 static void init_journal_node(struct journal_node *node)
964 {
965         RB_CLEAR_NODE(&node->node);
966         node->sector = (sector_t)-1;
967 }
968
969 static void add_journal_node(struct dm_integrity_c *ic, struct journal_node *node, sector_t sector)
970 {
971         struct rb_node **link;
972         struct rb_node *parent;
973
974         node->sector = sector;
975         BUG_ON(!RB_EMPTY_NODE(&node->node));
976
977         link = &ic->journal_tree_root.rb_node;
978         parent = NULL;
979
980         while (*link) {
981                 struct journal_node *j;
982                 parent = *link;
983                 j = container_of(parent, struct journal_node, node);
984                 if (sector < j->sector)
985                         link = &j->node.rb_left;
986                 else
987                         link = &j->node.rb_right;
988         }
989
990         rb_link_node(&node->node, parent, link);
991         rb_insert_color(&node->node, &ic->journal_tree_root);
992 }
993
994 static void remove_journal_node(struct dm_integrity_c *ic, struct journal_node *node)
995 {
996         BUG_ON(RB_EMPTY_NODE(&node->node));
997         rb_erase(&node->node, &ic->journal_tree_root);
998         init_journal_node(node);
999 }
1000
1001 #define NOT_FOUND       (-1U)
1002
1003 static unsigned find_journal_node(struct dm_integrity_c *ic, sector_t sector, sector_t *next_sector)
1004 {
1005         struct rb_node *n = ic->journal_tree_root.rb_node;
1006         unsigned found = NOT_FOUND;
1007         *next_sector = (sector_t)-1;
1008         while (n) {
1009                 struct journal_node *j = container_of(n, struct journal_node, node);
1010                 if (sector == j->sector) {
1011                         found = j - ic->journal_tree;
1012                 }
1013                 if (sector < j->sector) {
1014                         *next_sector = j->sector;
1015                         n = j->node.rb_left;
1016                 } else {
1017                         n = j->node.rb_right;
1018                 }
1019         }
1020
1021         return found;
1022 }
1023
1024 static bool test_journal_node(struct dm_integrity_c *ic, unsigned pos, sector_t sector)
1025 {
1026         struct journal_node *node, *next_node;
1027         struct rb_node *next;
1028
1029         if (unlikely(pos >= ic->journal_entries))
1030                 return false;
1031         node = &ic->journal_tree[pos];
1032         if (unlikely(RB_EMPTY_NODE(&node->node)))
1033                 return false;
1034         if (unlikely(node->sector != sector))
1035                 return false;
1036
1037         next = rb_next(&node->node);
1038         if (unlikely(!next))
1039                 return true;
1040
1041         next_node = container_of(next, struct journal_node, node);
1042         return next_node->sector != sector;
1043 }
1044
1045 static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_node *node)
1046 {
1047         struct rb_node *next;
1048         struct journal_node *next_node;
1049         unsigned next_section;
1050
1051         BUG_ON(RB_EMPTY_NODE(&node->node));
1052
1053         next = rb_next(&node->node);
1054         if (unlikely(!next))
1055                 return false;
1056
1057         next_node = container_of(next, struct journal_node, node);
1058
1059         if (next_node->sector != node->sector)
1060                 return false;
1061
1062         next_section = (unsigned)(next_node - ic->journal_tree) / ic->journal_section_entries;
1063         if (next_section >= ic->committed_section &&
1064             next_section < ic->committed_section + ic->n_committed_sections)
1065                 return true;
1066         if (next_section + ic->journal_sections < ic->committed_section + ic->n_committed_sections)
1067                 return true;
1068
1069         return false;
1070 }
1071
1072 #define TAG_READ        0
1073 #define TAG_WRITE       1
1074 #define TAG_CMP         2
1075
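/*
 * dm_integrity_rw_tag() reads (TAG_READ), writes (TAG_WRITE) or compares
 * (TAG_CMP) 'total_size' bytes of tag data through dm-bufio, crossing
 * metadata buffer boundaries as needed.  For TAG_CMP a nonzero return value
 * is the number of bytes remaining from the first mismatching byte to the
 * end of the requested region; integrity_metadata() uses it to compute the
 * sector whose checksum failed.
 */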
1076 static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block,
1077                                unsigned *metadata_offset, unsigned total_size, int op)
1078 {
1079         do {
1080                 unsigned char *data, *dp;
1081                 struct dm_buffer *b;
1082                 unsigned to_copy;
1083                 int r;
1084
1085                 r = dm_integrity_failed(ic);
1086                 if (unlikely(r))
1087                         return r;
1088
1089                 data = dm_bufio_read(ic->bufio, *metadata_block, &b);
1090                 if (unlikely(IS_ERR(data)))
1091                         return PTR_ERR(data);
1092
1093                 to_copy = min((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - *metadata_offset, total_size);
1094                 dp = data + *metadata_offset;
1095                 if (op == TAG_READ) {
1096                         memcpy(tag, dp, to_copy);
1097                 } else if (op == TAG_WRITE) {
1098                         memcpy(dp, tag, to_copy);
1099                         dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy);
1100                 } else  {
1101                         /* e.g.: op == TAG_CMP */
1102                         if (unlikely(memcmp(dp, tag, to_copy))) {
1103                                 unsigned i;
1104
1105                                 for (i = 0; i < to_copy; i++) {
1106                                         if (dp[i] != tag[i])
1107                                                 break;
1108                                         total_size--;
1109                                 }
1110                                 dm_bufio_release(b);
1111                                 return total_size;
1112                         }
1113                 }
1114                 dm_bufio_release(b);
1115
1116                 tag += to_copy;
1117                 *metadata_offset += to_copy;
1118                 if (unlikely(*metadata_offset == 1U << SECTOR_SHIFT << ic->log2_buffer_sectors)) {
1119                         (*metadata_block)++;
1120                         *metadata_offset = 0;
1121                 }
1122                 total_size -= to_copy;
1123         } while (unlikely(total_size));
1124
1125         return 0;
1126 }
1127
1128 static void dm_integrity_flush_buffers(struct dm_integrity_c *ic)
1129 {
1130         int r;
1131         r = dm_bufio_write_dirty_buffers(ic->bufio);
1132         if (unlikely(r))
1133                 dm_integrity_io_error(ic, "writing tags", r);
1134 }
1135
1136 static void sleep_on_endio_wait(struct dm_integrity_c *ic)
1137 {
1138         DECLARE_WAITQUEUE(wait, current);
1139         __add_wait_queue(&ic->endio_wait, &wait);
1140         __set_current_state(TASK_UNINTERRUPTIBLE);
1141         spin_unlock_irq(&ic->endio_wait.lock);
1142         io_schedule();
1143         spin_lock_irq(&ic->endio_wait.lock);
1144         __remove_wait_queue(&ic->endio_wait, &wait);
1145 }
1146
1147 static void autocommit_fn(struct timer_list *t)
1148 {
1149         struct dm_integrity_c *ic = from_timer(ic, t, autocommit_timer);
1150
1151         if (likely(!dm_integrity_failed(ic)))
1152                 queue_work(ic->commit_wq, &ic->commit_work);
1153 }
1154
1155 static void schedule_autocommit(struct dm_integrity_c *ic)
1156 {
1157         if (!timer_pending(&ic->autocommit_timer))
1158                 mod_timer(&ic->autocommit_timer, jiffies + ic->autocommit_jiffies);
1159 }
1160
1161 static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
1162 {
1163         struct bio *bio;
1164         unsigned long flags;
1165
1166         spin_lock_irqsave(&ic->endio_wait.lock, flags);
1167         bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1168         bio_list_add(&ic->flush_bio_list, bio);
1169         spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
1170
1171         queue_work(ic->commit_wq, &ic->commit_work);
1172 }
1173
1174 static void do_endio(struct dm_integrity_c *ic, struct bio *bio)
1175 {
1176         int r = dm_integrity_failed(ic);
1177         if (unlikely(r) && !bio->bi_status)
1178                 bio->bi_status = errno_to_blk_status(r);
1179         bio_endio(bio);
1180 }
1181
1182 static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
1183 {
1184         struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1185
1186         if (unlikely(dio->fua) && likely(!bio->bi_status) && likely(!dm_integrity_failed(ic)))
1187                 submit_flush_bio(ic, dio);
1188         else
1189                 do_endio(ic, bio);
1190 }
1191
1192 static void dec_in_flight(struct dm_integrity_io *dio)
1193 {
1194         if (atomic_dec_and_test(&dio->in_flight)) {
1195                 struct dm_integrity_c *ic = dio->ic;
1196                 struct bio *bio;
1197
1198                 remove_range(ic, &dio->range);
1199
1200                 if (unlikely(dio->write))
1201                         schedule_autocommit(ic);
1202
1203                 bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1204
1205                 if (unlikely(dio->bi_status) && !bio->bi_status)
1206                         bio->bi_status = dio->bi_status;
1207                 if (likely(!bio->bi_status) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) {
1208                         dio->range.logical_sector += dio->range.n_sectors;
1209                         bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT);
1210                         INIT_WORK(&dio->work, integrity_bio_wait);
1211                         queue_work(ic->wait_wq, &dio->work);
1212                         return;
1213                 }
1214                 do_endio_flush(ic, dio);
1215         }
1216 }
1217
1218 static void integrity_end_io(struct bio *bio)
1219 {
1220         struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
1221
1222         bio->bi_iter = dio->orig_bi_iter;
1223         bio->bi_disk = dio->orig_bi_disk;
1224         bio->bi_partno = dio->orig_bi_partno;
1225         if (dio->orig_bi_integrity) {
1226                 bio->bi_integrity = dio->orig_bi_integrity;
1227                 bio->bi_opf |= REQ_INTEGRITY;
1228         }
1229         bio->bi_end_io = dio->orig_bi_end_io;
1230
1231         if (dio->completion)
1232                 complete(dio->completion);
1233
1234         dec_in_flight(dio);
1235 }
1236
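/*
 * integrity_sector_checksum() computes the internal-hash tag for one block:
 * the hash covers the little-endian logical sector number followed by the
 * block's data, and the result is zero-padded to tag_size if the digest is
 * shorter.  On the (unexpected) error paths the result is filled with
 * random bytes, so a later comparison cannot spuriously succeed.
 */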
1237 static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector,
1238                                       const char *data, char *result)
1239 {
1240         __u64 sector_le = cpu_to_le64(sector);
1241         SHASH_DESC_ON_STACK(req, ic->internal_hash);
1242         int r;
1243         unsigned digest_size;
1244
1245         req->tfm = ic->internal_hash;
1246         req->flags = 0;
1247
1248         r = crypto_shash_init(req);
1249         if (unlikely(r < 0)) {
1250                 dm_integrity_io_error(ic, "crypto_shash_init", r);
1251                 goto failed;
1252         }
1253
1254         r = crypto_shash_update(req, (const __u8 *)&sector_le, sizeof sector_le);
1255         if (unlikely(r < 0)) {
1256                 dm_integrity_io_error(ic, "crypto_shash_update", r);
1257                 goto failed;
1258         }
1259
1260         r = crypto_shash_update(req, data, ic->sectors_per_block << SECTOR_SHIFT);
1261         if (unlikely(r < 0)) {
1262                 dm_integrity_io_error(ic, "crypto_shash_update", r);
1263                 goto failed;
1264         }
1265
1266         r = crypto_shash_final(req, result);
1267         if (unlikely(r < 0)) {
1268                 dm_integrity_io_error(ic, "crypto_shash_final", r);
1269                 goto failed;
1270         }
1271
1272         digest_size = crypto_shash_digestsize(ic->internal_hash);
1273         if (unlikely(digest_size < ic->tag_size))
1274                 memset(result + digest_size, 0, ic->tag_size - digest_size);
1275
1276         return;
1277
1278 failed:
1279         /* this shouldn't happen anyway, the hash functions have no reason to fail */
1280         get_random_bytes(result, ic->tag_size);
1281 }
1282
1283 static void integrity_metadata(struct work_struct *w)
1284 {
1285         struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
1286         struct dm_integrity_c *ic = dio->ic;
1287
1288         int r;
1289
1290         if (ic->internal_hash) {
1291                 struct bvec_iter iter;
1292                 struct bio_vec bv;
1293                 unsigned digest_size = crypto_shash_digestsize(ic->internal_hash);
1294                 struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1295                 char *checksums;
1296                 unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
1297                 char checksums_onstack[ic->tag_size + extra_space];
1298                 unsigned sectors_to_process = dio->range.n_sectors;
1299                 sector_t sector = dio->range.logical_sector;
1300
1301                 if (unlikely(ic->mode == 'R'))
1302                         goto skip_io;
1303
1304                 checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
1305                                     GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
1306                 if (!checksums)
1307                         checksums = checksums_onstack;
1308
1309                 __bio_for_each_segment(bv, bio, iter, dio->orig_bi_iter) {
1310                         unsigned pos;
1311                         char *mem, *checksums_ptr;
1312
1313 again:
1314                         mem = (char *)kmap_atomic(bv.bv_page) + bv.bv_offset;
1315                         pos = 0;
1316                         checksums_ptr = checksums;
1317                         do {
1318                                 integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr);
1319                                 checksums_ptr += ic->tag_size;
1320                                 sectors_to_process -= ic->sectors_per_block;
1321                                 pos += ic->sectors_per_block << SECTOR_SHIFT;
1322                                 sector += ic->sectors_per_block;
1323                         } while (pos < bv.bv_len && sectors_to_process && checksums != checksums_onstack);
1324                         kunmap_atomic(mem);
1325
1326                         r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
1327                                                 checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE);
1328                         if (unlikely(r)) {
1329                                 if (r > 0) {
1330                                         DMERR("Checksum failed at sector 0x%llx",
1331                                               (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size)));
1332                                         r = -EILSEQ;
1333                                         atomic64_inc(&ic->number_of_mismatches);
1334                                 }
1335                                 if (likely(checksums != checksums_onstack))
1336                                         kfree(checksums);
1337                                 goto error;
1338                         }
1339
1340                         if (!sectors_to_process)
1341                                 break;
1342
1343                         if (unlikely(pos < bv.bv_len)) {
1344                                 bv.bv_offset += pos;
1345                                 bv.bv_len -= pos;
1346                                 goto again;
1347                         }
1348                 }
1349
1350                 if (likely(checksums != checksums_onstack))
1351                         kfree(checksums);
1352         } else {
1353                 struct bio_integrity_payload *bip = dio->orig_bi_integrity;
1354
1355                 if (bip) {
1356                         struct bio_vec biv;
1357                         struct bvec_iter iter;
1358                         unsigned data_to_process = dio->range.n_sectors;
1359                         sector_to_block(ic, data_to_process);
1360                         data_to_process *= ic->tag_size;
1361
1362                         bip_for_each_vec(biv, bip, iter) {
1363                                 unsigned char *tag;
1364                                 unsigned this_len;
1365
1366                                 BUG_ON(PageHighMem(biv.bv_page));
1367                                 tag = lowmem_page_address(biv.bv_page) + biv.bv_offset;
1368                                 this_len = min(biv.bv_len, data_to_process);
1369                                 r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset,
1370                                                         this_len, !dio->write ? TAG_READ : TAG_WRITE);
1371                                 if (unlikely(r))
1372                                         goto error;
1373                                 data_to_process -= this_len;
1374                                 if (!data_to_process)
1375                                         break;
1376                         }
1377                 }
1378         }
1379 skip_io:
1380         dec_in_flight(dio);
1381         return;
1382 error:
1383         dio->bi_status = errno_to_blk_status(r);
1384         dec_in_flight(dio);
1385 }
1386
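/*
 * dm_integrity_map() is the target's map callback: flushes are handed to
 * the commit workqueue, other bios are checked against provided_data_sectors
 * and block alignment, the amount of attached integrity data is validated
 * (it must be absent when an internal hash is used), the bio's sector is
 * remapped to the interleaved data location, and processing continues in
 * dm_integrity_map_continue().
 */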
1387 static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
1388 {
1389         struct dm_integrity_c *ic = ti->private;
1390         struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
1391         struct bio_integrity_payload *bip;
1392
1393         sector_t area, offset;
1394
1395         dio->ic = ic;
1396         dio->bi_status = 0;
1397
1398         if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1399                 submit_flush_bio(ic, dio);
1400                 return DM_MAPIO_SUBMITTED;
1401         }
1402
1403         dio->range.logical_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1404         dio->write = bio_op(bio) == REQ_OP_WRITE;
1405         dio->fua = dio->write && bio->bi_opf & REQ_FUA;
1406         if (unlikely(dio->fua)) {
1407                 /*
1408                  * Don't pass down the FUA flag because we have to flush
1409                  * disk cache anyway.
1410                  */
1411                 bio->bi_opf &= ~REQ_FUA;
1412         }
1413         if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) {
1414                 DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx",
1415                       (unsigned long long)dio->range.logical_sector, bio_sectors(bio),
1416                       (unsigned long long)ic->provided_data_sectors);
1417                 return DM_MAPIO_KILL;
1418         }
1419         if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned)(ic->sectors_per_block - 1))) {
1420                 DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x",
1421                       ic->sectors_per_block,
1422                       (unsigned long long)dio->range.logical_sector, bio_sectors(bio));
1423                 return DM_MAPIO_KILL;
1424         }
1425
1426         if (ic->sectors_per_block > 1) {
1427                 struct bvec_iter iter;
1428                 struct bio_vec bv;
1429                 bio_for_each_segment(bv, bio, iter) {
1430                         if (unlikely(bv.bv_len & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) {
1431                                 DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary",
1432                                         bv.bv_offset, bv.bv_len, ic->sectors_per_block);
1433                                 return DM_MAPIO_KILL;
1434                         }
1435                 }
1436         }
1437
1438         bip = bio_integrity(bio);
1439         if (!ic->internal_hash) {
1440                 if (bip) {
1441                         unsigned wanted_tag_size = bio_sectors(bio) >> ic->sb->log2_sectors_per_block;
1442                         if (ic->log2_tag_size >= 0)
1443                                 wanted_tag_size <<= ic->log2_tag_size;
1444                         else
1445                                 wanted_tag_size *= ic->tag_size;
1446                         if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) {
1447                                 DMERR("Invalid integrity data size %u, expected %u", bip->bip_iter.bi_size, wanted_tag_size);
1448                                 return DM_MAPIO_KILL;
1449                         }
1450                 }
1451         } else {
1452                 if (unlikely(bip != NULL)) {
1453                         DMERR("Unexpected integrity data when using internal hash");
1454                         return DM_MAPIO_KILL;
1455                 }
1456         }
1457
1458         if (unlikely(ic->mode == 'R') && unlikely(dio->write))
1459                 return DM_MAPIO_KILL;
1460
1461         get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
1462         dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset);
1463         bio->bi_iter.bi_sector = get_data_sector(ic, area, offset);
1464
1465         dm_integrity_map_continue(dio, true);
1466         return DM_MAPIO_SUBMITTED;
1467 }
1468
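/*
 * Copy bio data to or from the in-memory journal. For writes, the data and
 * per-sector commit ids are stored in the journal entries and the tag is
 * taken from the integrity payload or computed with the internal hash. For
 * reads, the data is copied back from the journal entries, waiting for
 * in-progress entries to be filled first. Returns true if the bio extends
 * past the allocated journal range and the caller must take the lock again
 * and continue with the rest of the bio.
 */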
1469 static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
1470                                  unsigned journal_section, unsigned journal_entry)
1471 {
1472         struct dm_integrity_c *ic = dio->ic;
1473         sector_t logical_sector;
1474         unsigned n_sectors;
1475
1476         logical_sector = dio->range.logical_sector;
1477         n_sectors = dio->range.n_sectors;
1478         do {
1479                 struct bio_vec bv = bio_iovec(bio);
1480                 char *mem;
1481
1482                 if (unlikely(bv.bv_len >> SECTOR_SHIFT > n_sectors))
1483                         bv.bv_len = n_sectors << SECTOR_SHIFT;
1484                 n_sectors -= bv.bv_len >> SECTOR_SHIFT;
1485                 bio_advance_iter(bio, &bio->bi_iter, bv.bv_len);
1486 retry_kmap:
1487                 mem = kmap_atomic(bv.bv_page);
1488                 if (likely(dio->write))
1489                         flush_dcache_page(bv.bv_page);
1490
1491                 do {
1492                         struct journal_entry *je = access_journal_entry(ic, journal_section, journal_entry);
1493
1494                         if (unlikely(!dio->write)) {
1495                                 struct journal_sector *js;
1496                                 char *mem_ptr;
1497                                 unsigned s;
1498
1499                                 if (unlikely(journal_entry_is_inprogress(je))) {
1500                                         flush_dcache_page(bv.bv_page);
1501                                         kunmap_atomic(mem);
1502
1503                                         __io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je));
1504                                         goto retry_kmap;
1505                                 }
1506                                 smp_rmb();
1507                                 BUG_ON(journal_entry_get_sector(je) != logical_sector);
1508                                 js = access_journal_data(ic, journal_section, journal_entry);
1509                                 mem_ptr = mem + bv.bv_offset;
1510                                 s = 0;
1511                                 do {
1512                                         memcpy(mem_ptr, js, JOURNAL_SECTOR_DATA);
1513                                         *(commit_id_t *)(mem_ptr + JOURNAL_SECTOR_DATA) = je->last_bytes[s];
1514                                         js++;
1515                                         mem_ptr += 1 << SECTOR_SHIFT;
1516                                 } while (++s < ic->sectors_per_block);
1517 #ifdef INTERNAL_VERIFY
1518                                 if (ic->internal_hash) {
1519                                         char checksums_onstack[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)];
1520
1521                                         integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
1522                                         if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
1523                                                 DMERR("Checksum failed when reading from journal, at sector 0x%llx",
1524                                                       (unsigned long long)logical_sector);
1525                                         }
1526                                 }
1527 #endif
1528                         }
1529
1530                         if (!ic->internal_hash) {
1531                                 struct bio_integrity_payload *bip = bio_integrity(bio);
1532                                 unsigned tag_todo = ic->tag_size;
1533                                 char *tag_ptr = journal_entry_tag(ic, je);
1534
1535                                 if (bip) do {
1536                                         struct bio_vec biv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
1537                                         unsigned tag_now = min(biv.bv_len, tag_todo);
1538                                         char *tag_addr;
1539                                         BUG_ON(PageHighMem(biv.bv_page));
1540                                         tag_addr = lowmem_page_address(biv.bv_page) + biv.bv_offset;
1541                                         if (likely(dio->write))
1542                                                 memcpy(tag_ptr, tag_addr, tag_now);
1543                                         else
1544                                                 memcpy(tag_addr, tag_ptr, tag_now);
1545                                         bvec_iter_advance(bip->bip_vec, &bip->bip_iter, tag_now);
1546                                         tag_ptr += tag_now;
1547                                         tag_todo -= tag_now;
1548                                 } while (unlikely(tag_todo)); else {
1549                                         if (likely(dio->write))
1550                                                 memset(tag_ptr, 0, tag_todo);
1551                                 }
1552                         }
1553
1554                         if (likely(dio->write)) {
1555                                 struct journal_sector *js;
1556                                 unsigned s;
1557
1558                                 js = access_journal_data(ic, journal_section, journal_entry);
1559                                 memcpy(js, mem + bv.bv_offset, ic->sectors_per_block << SECTOR_SHIFT);
1560
1561                                 s = 0;
1562                                 do {
1563                                         je->last_bytes[s] = js[s].commit_id;
1564                                 } while (++s < ic->sectors_per_block);
1565
1566                                 if (ic->internal_hash) {
1567                                         unsigned digest_size = crypto_shash_digestsize(ic->internal_hash);
1568                                         if (unlikely(digest_size > ic->tag_size)) {
1569                                                 char checksums_onstack[digest_size];
1570                                                 integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack);
1571                                                 memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size);
1572                                         } else
1573                                                 integrity_sector_checksum(ic, logical_sector, (char *)js, journal_entry_tag(ic, je));
1574                                 }
1575
1576                                 journal_entry_set_sector(je, logical_sector);
1577                         }
1578                         logical_sector += ic->sectors_per_block;
1579
1580                         journal_entry++;
1581                         if (unlikely(journal_entry == ic->journal_section_entries)) {
1582                                 journal_entry = 0;
1583                                 journal_section++;
1584                                 wraparound_section(ic, &journal_section);
1585                         }
1586
1587                         bv.bv_offset += ic->sectors_per_block << SECTOR_SHIFT;
1588                 } while (bv.bv_len -= ic->sectors_per_block << SECTOR_SHIFT);
1589
1590                 if (unlikely(!dio->write))
1591                         flush_dcache_page(bv.bv_page);
1592                 kunmap_atomic(mem);
1593         } while (n_sectors);
1594
1595         if (likely(dio->write)) {
1596                 smp_mb();
1597                 if (unlikely(waitqueue_active(&ic->copy_to_journal_wait)))
1598                         wake_up(&ic->copy_to_journal_wait);
1599                 if (READ_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) {
1600                         queue_work(ic->commit_wq, &ic->commit_work);
1601                 } else {
1602                         schedule_autocommit(ic);
1603                 }
1604         } else {
1605                 remove_range(ic, &dio->range);
1606         }
1607
1608         if (unlikely(bio->bi_iter.bi_size)) {
1609                 sector_t area, offset;
1610
1611                 dio->range.logical_sector = logical_sector;
1612                 get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
1613                 dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset);
1614                 return true;
1615         }
1616
1617         return false;
1618 }
1619
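/*
 * Main request processing, shared by the map function and the offload
 * workqueues. In journal mode, writes are allocated space in the journal and
 * copied there by __journal_read_write(); reads that hit data still sitting
 * in the journal are served from it. Everything else is remapped to the data
 * device, and the tags are then verified or written by integrity_metadata().
 * When called with from_map set we must not sleep, so any blocking operation
 * is offloaded to a workqueue instead.
 */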
1620 static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map)
1621 {
1622         struct dm_integrity_c *ic = dio->ic;
1623         struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1624         unsigned journal_section, journal_entry;
1625         unsigned journal_read_pos;
1626         struct completion read_comp;
1627         bool need_sync_io = ic->internal_hash && !dio->write;
1628
1629         if (need_sync_io && from_map) {
1630                 INIT_WORK(&dio->work, integrity_bio_wait);
1631                 queue_work(ic->metadata_wq, &dio->work);
1632                 return;
1633         }
1634
1635 lock_retry:
1636         spin_lock_irq(&ic->endio_wait.lock);
1637 retry:
1638         if (unlikely(dm_integrity_failed(ic))) {
1639                 spin_unlock_irq(&ic->endio_wait.lock);
1640                 do_endio(ic, bio);
1641                 return;
1642         }
1643         dio->range.n_sectors = bio_sectors(bio);
1644         journal_read_pos = NOT_FOUND;
1645         if (likely(ic->mode == 'J')) {
1646                 if (dio->write) {
1647                         unsigned next_entry, i, pos;
1648                         unsigned ws, we, range_sectors;
1649
1650                         dio->range.n_sectors = min(dio->range.n_sectors,
1651                                                    ic->free_sectors << ic->sb->log2_sectors_per_block);
1652                         if (unlikely(!dio->range.n_sectors)) {
1653                                 if (from_map)
1654                                         goto offload_to_thread;
1655                                 sleep_on_endio_wait(ic);
1656                                 goto retry;
1657                         }
1658                         range_sectors = dio->range.n_sectors >> ic->sb->log2_sectors_per_block;
1659                         ic->free_sectors -= range_sectors;
1660                         journal_section = ic->free_section;
1661                         journal_entry = ic->free_section_entry;
1662
1663                         next_entry = ic->free_section_entry + range_sectors;
1664                         ic->free_section_entry = next_entry % ic->journal_section_entries;
1665                         ic->free_section += next_entry / ic->journal_section_entries;
1666                         ic->n_uncommitted_sections += next_entry / ic->journal_section_entries;
1667                         wraparound_section(ic, &ic->free_section);
1668
1669                         pos = journal_section * ic->journal_section_entries + journal_entry;
1670                         ws = journal_section;
1671                         we = journal_entry;
1672                         i = 0;
1673                         do {
1674                                 struct journal_entry *je;
1675
1676                                 add_journal_node(ic, &ic->journal_tree[pos], dio->range.logical_sector + i);
1677                                 pos++;
1678                                 if (unlikely(pos >= ic->journal_entries))
1679                                         pos = 0;
1680
1681                                 je = access_journal_entry(ic, ws, we);
1682                                 BUG_ON(!journal_entry_is_unused(je));
1683                                 journal_entry_set_inprogress(je);
1684                                 we++;
1685                                 if (unlikely(we == ic->journal_section_entries)) {
1686                                         we = 0;
1687                                         ws++;
1688                                         wraparound_section(ic, &ws);
1689                                 }
1690                         } while ((i += ic->sectors_per_block) < dio->range.n_sectors);
1691
1692                         spin_unlock_irq(&ic->endio_wait.lock);
1693                         goto journal_read_write;
1694                 } else {
1695                         sector_t next_sector;
1696                         journal_read_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector);
1697                         if (likely(journal_read_pos == NOT_FOUND)) {
1698                                 if (unlikely(dio->range.n_sectors > next_sector - dio->range.logical_sector))
1699                                         dio->range.n_sectors = next_sector - dio->range.logical_sector;
1700                         } else {
1701                                 unsigned i;
1702                                 unsigned jp = journal_read_pos + 1;
1703                                 for (i = ic->sectors_per_block; i < dio->range.n_sectors; i += ic->sectors_per_block, jp++) {
1704                                         if (!test_journal_node(ic, jp, dio->range.logical_sector + i))
1705                                                 break;
1706                                 }
1707                                 dio->range.n_sectors = i;
1708                         }
1709                 }
1710         }
1711         if (unlikely(!add_new_range(ic, &dio->range, true))) {
1712                 /*
1713                  * We must not sleep in the request routine because it could
1714                  * stall bios on current->bio_list.
1715                  * So, we offload the bio to a workqueue if we have to sleep.
1716                  */
1717                 if (from_map) {
1718 offload_to_thread:
1719                         spin_unlock_irq(&ic->endio_wait.lock);
1720                         INIT_WORK(&dio->work, integrity_bio_wait);
1721                         queue_work(ic->wait_wq, &dio->work);
1722                         return;
1723                 }
1724                 wait_and_add_new_range(ic, &dio->range);
1725         }
1726         spin_unlock_irq(&ic->endio_wait.lock);
1727
1728         if (unlikely(journal_read_pos != NOT_FOUND)) {
1729                 journal_section = journal_read_pos / ic->journal_section_entries;
1730                 journal_entry = journal_read_pos % ic->journal_section_entries;
1731                 goto journal_read_write;
1732         }
1733
1734         dio->in_flight = (atomic_t)ATOMIC_INIT(2);
1735
1736         if (need_sync_io) {
1737                 init_completion(&read_comp);
1738                 dio->completion = &read_comp;
1739         } else
1740                 dio->completion = NULL;
1741
1742         dio->orig_bi_iter = bio->bi_iter;
1743
1744         dio->orig_bi_disk = bio->bi_disk;
1745         dio->orig_bi_partno = bio->bi_partno;
1746         bio_set_dev(bio, ic->dev->bdev);
1747
1748         dio->orig_bi_integrity = bio_integrity(bio);
1749         bio->bi_integrity = NULL;
1750         bio->bi_opf &= ~REQ_INTEGRITY;
1751
1752         dio->orig_bi_end_io = bio->bi_end_io;
1753         bio->bi_end_io = integrity_end_io;
1754
1755         bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT;
1756         bio->bi_iter.bi_sector += ic->start;
1757         generic_make_request(bio);
1758
1759         if (need_sync_io) {
1760                 wait_for_completion_io(&read_comp);
1761                 if (likely(!bio->bi_status))
1762                         integrity_metadata(&dio->work);
1763                 else
1764                         dec_in_flight(dio);
1765
1766         } else {
1767                 INIT_WORK(&dio->work, integrity_metadata);
1768                 queue_work(ic->metadata_wq, &dio->work);
1769         }
1770
1771         return;
1772
1773 journal_read_write:
1774         if (unlikely(__journal_read_write(dio, bio, journal_section, journal_entry)))
1775                 goto lock_retry;
1776
1777         do_endio_flush(ic, dio);
1778 }
1779
1780
1781 static void integrity_bio_wait(struct work_struct *w)
1782 {
1783         struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
1784
1785         dm_integrity_map_continue(dio, false);
1786 }
1787
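/*
 * If the current journal section is only partially used, skip the remaining
 * entries so that integrity_commit() always commits whole sections.
 */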
1788 static void pad_uncommitted(struct dm_integrity_c *ic)
1789 {
1790         if (ic->free_section_entry) {
1791                 ic->free_sectors -= ic->journal_section_entries - ic->free_section_entry;
1792                 ic->free_section_entry = 0;
1793                 ic->free_section++;
1794                 wraparound_section(ic, &ic->free_section);
1795                 ic->n_uncommitted_sections++;
1796         }
1797         WARN_ON(ic->journal_sections * ic->journal_section_entries !=
1798                 (ic->n_uncommitted_sections + ic->n_committed_sections) * ic->journal_section_entries + ic->free_sectors);
1799 }
1800
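/*
 * Commit work: wait until all copies into the uncommitted journal sections
 * have finished, stamp every journal sector with the current commit id,
 * write those sections to the journal area on disk and finally complete any
 * queued flush bios. In non-journal modes only the metadata buffers are
 * flushed.
 */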
1801 static void integrity_commit(struct work_struct *w)
1802 {
1803         struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, commit_work);
1804         unsigned commit_start, commit_sections;
1805         unsigned i, j, n;
1806         struct bio *flushes;
1807
1808         del_timer(&ic->autocommit_timer);
1809
1810         spin_lock_irq(&ic->endio_wait.lock);
1811         flushes = bio_list_get(&ic->flush_bio_list);
1812         if (unlikely(ic->mode != 'J')) {
1813                 spin_unlock_irq(&ic->endio_wait.lock);
1814                 dm_integrity_flush_buffers(ic);
1815                 goto release_flush_bios;
1816         }
1817
1818         pad_uncommitted(ic);
1819         commit_start = ic->uncommitted_section;
1820         commit_sections = ic->n_uncommitted_sections;
1821         spin_unlock_irq(&ic->endio_wait.lock);
1822
1823         if (!commit_sections)
1824                 goto release_flush_bios;
1825
1826         i = commit_start;
1827         for (n = 0; n < commit_sections; n++) {
1828                 for (j = 0; j < ic->journal_section_entries; j++) {
1829                         struct journal_entry *je;
1830                         je = access_journal_entry(ic, i, j);
1831                         io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je));
1832                 }
1833                 for (j = 0; j < ic->journal_section_sectors; j++) {
1834                         struct journal_sector *js;
1835                         js = access_journal(ic, i, j);
1836                         js->commit_id = dm_integrity_commit_id(ic, i, j, ic->commit_seq);
1837                 }
1838                 i++;
1839                 if (unlikely(i >= ic->journal_sections))
1840                         ic->commit_seq = next_commit_seq(ic->commit_seq);
1841                 wraparound_section(ic, &i);
1842         }
1843         smp_rmb();
1844
1845         write_journal(ic, commit_start, commit_sections);
1846
1847         spin_lock_irq(&ic->endio_wait.lock);
1848         ic->uncommitted_section += commit_sections;
1849         wraparound_section(ic, &ic->uncommitted_section);
1850         ic->n_uncommitted_sections -= commit_sections;
1851         ic->n_committed_sections += commit_sections;
1852         spin_unlock_irq(&ic->endio_wait.lock);
1853
1854         if (READ_ONCE(ic->free_sectors) <= ic->free_sectors_threshold)
1855                 queue_work(ic->writer_wq, &ic->writer_work);
1856
1857 release_flush_bios:
1858         while (flushes) {
1859                 struct bio *next = flushes->bi_next;
1860                 flushes->bi_next = NULL;
1861                 do_endio(ic, flushes);
1862                 flushes = next;
1863         }
1864 }
1865
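/* Completion callback for copy_from_journal(): release the in-flight range and free the journal_io. */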
1866 static void complete_copy_from_journal(unsigned long error, void *context)
1867 {
1868         struct journal_io *io = context;
1869         struct journal_completion *comp = io->comp;
1870         struct dm_integrity_c *ic = comp->ic;
1871         remove_range(ic, &io->range);
1872         mempool_free(io, &ic->journal_io_mempool);
1873         if (unlikely(error != 0))
1874                 dm_integrity_io_error(ic, "copying from journal", -EIO);
1875         complete_journal_op(comp);
1876 }
1877
1878 static void restore_last_bytes(struct dm_integrity_c *ic, struct journal_sector *js,
1879                                struct journal_entry *je)
1880 {
1881         unsigned s = 0;
1882         do {
1883                 js->commit_id = je->last_bytes[s];
1884                 js++;
1885         } while (++s < ic->sectors_per_block);
1886 }
1887
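/*
 * Write committed journal sections back to their final location on the data
 * device. Adjacent entries that target consecutive sectors are merged into
 * one copy, entries superseded by newer committed data are skipped (unless
 * replaying), the tags are written to the metadata area and the data is then
 * copied out of the journal asynchronously. Used by the background writer
 * and by journal replay.
 */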
1888 static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
1889                              unsigned write_sections, bool from_replay)
1890 {
1891         unsigned i, j, n;
1892         struct journal_completion comp;
1893         struct blk_plug plug;
1894
1895         blk_start_plug(&plug);
1896
1897         comp.ic = ic;
1898         comp.in_flight = (atomic_t)ATOMIC_INIT(1);
1899         init_completion(&comp.comp);
1900
1901         i = write_start;
1902         for (n = 0; n < write_sections; n++, i++, wraparound_section(ic, &i)) {
1903 #ifndef INTERNAL_VERIFY
1904                 if (unlikely(from_replay))
1905 #endif
1906                         rw_section_mac(ic, i, false);
1907                 for (j = 0; j < ic->journal_section_entries; j++) {
1908                         struct journal_entry *je = access_journal_entry(ic, i, j);
1909                         sector_t sec, area, offset;
1910                         unsigned k, l, next_loop;
1911                         sector_t metadata_block;
1912                         unsigned metadata_offset;
1913                         struct journal_io *io;
1914
1915                         if (journal_entry_is_unused(je))
1916                                 continue;
1917                         BUG_ON(unlikely(journal_entry_is_inprogress(je)) && !from_replay);
1918                         sec = journal_entry_get_sector(je);
1919                         if (unlikely(from_replay)) {
1920                                 if (unlikely(sec & (unsigned)(ic->sectors_per_block - 1))) {
1921                                         dm_integrity_io_error(ic, "invalid sector in journal", -EIO);
1922                                         sec &= ~(sector_t)(ic->sectors_per_block - 1);
1923                                 }
1924                         }
1925                         get_area_and_offset(ic, sec, &area, &offset);
1926                         restore_last_bytes(ic, access_journal_data(ic, i, j), je);
1927                         for (k = j + 1; k < ic->journal_section_entries; k++) {
1928                                 struct journal_entry *je2 = access_journal_entry(ic, i, k);
1929                                 sector_t sec2, area2, offset2;
1930                                 if (journal_entry_is_unused(je2))
1931                                         break;
1932                                 BUG_ON(unlikely(journal_entry_is_inprogress(je2)) && !from_replay);
1933                                 sec2 = journal_entry_get_sector(je2);
1934                                 get_area_and_offset(ic, sec2, &area2, &offset2);
1935                                 if (area2 != area || offset2 != offset + ((k - j) << ic->sb->log2_sectors_per_block))
1936                                         break;
1937                                 restore_last_bytes(ic, access_journal_data(ic, i, k), je2);
1938                         }
1939                         next_loop = k - 1;
1940
1941                         io = mempool_alloc(&ic->journal_io_mempool, GFP_NOIO);
1942                         io->comp = &comp;
1943                         io->range.logical_sector = sec;
1944                         io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block;
1945
1946                         spin_lock_irq(&ic->endio_wait.lock);
1947                         if (unlikely(!add_new_range(ic, &io->range, true)))
1948                                 wait_and_add_new_range(ic, &io->range);
1949
1950                         if (likely(!from_replay)) {
1951                                 struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries];
1952
1953                         /* don't write if there is a newer committed sector */
1954                                 while (j < k && find_newer_committed_node(ic, &section_node[j])) {
1955                                         struct journal_entry *je2 = access_journal_entry(ic, i, j);
1956
1957                                         journal_entry_set_unused(je2);
1958                                         remove_journal_node(ic, &section_node[j]);
1959                                         j++;
1960                                         sec += ic->sectors_per_block;
1961                                         offset += ic->sectors_per_block;
1962                                 }
1963                                 while (j < k && find_newer_committed_node(ic, &section_node[k - 1])) {
1964                                         struct journal_entry *je2 = access_journal_entry(ic, i, k - 1);
1965
1966                                         journal_entry_set_unused(je2);
1967                                         remove_journal_node(ic, &section_node[k - 1]);
1968                                         k--;
1969                                 }
1970                                 if (j == k) {
1971                                         remove_range_unlocked(ic, &io->range);
1972                                         spin_unlock_irq(&ic->endio_wait.lock);
1973                                         mempool_free(io, &ic->journal_io_mempool);
1974                                         goto skip_io;
1975                                 }
1976                                 for (l = j; l < k; l++) {
1977                                         remove_journal_node(ic, &section_node[l]);
1978                                 }
1979                         }
1980                         spin_unlock_irq(&ic->endio_wait.lock);
1981
1982                         metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset);
1983                         for (l = j; l < k; l++) {
1984                                 int r;
1985                                 struct journal_entry *je2 = access_journal_entry(ic, i, l);
1986
1987                                 if (
1988 #ifndef INTERNAL_VERIFY
1989                                     unlikely(from_replay) &&
1990 #endif
1991                                     ic->internal_hash) {
1992                                         char test_tag[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)];
1993
1994                                         integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block),
1995                                                                   (char *)access_journal_data(ic, i, l), test_tag);
1996                                         if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size)))
1997                                                 dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ);
1998                                 }
1999
2000                                 journal_entry_set_unused(je2);
2001                                 r = dm_integrity_rw_tag(ic, journal_entry_tag(ic, je2), &metadata_block, &metadata_offset,
2002                                                         ic->tag_size, TAG_WRITE);
2003                                 if (unlikely(r)) {
2004                                         dm_integrity_io_error(ic, "writing tags", r);
2005                                 }
2006                         }
2007
2008                         atomic_inc(&comp.in_flight);
2009                         copy_from_journal(ic, i, j << ic->sb->log2_sectors_per_block,
2010                                           (k - j) << ic->sb->log2_sectors_per_block,
2011                                           get_data_sector(ic, area, offset),
2012                                           complete_copy_from_journal, io);
2013 skip_io:
2014                         j = next_loop;
2015                 }
2016         }
2017
2018         dm_bufio_write_dirty_buffers_async(ic->bufio);
2019
2020         blk_finish_plug(&plug);
2021
2022         complete_journal_op(&comp);
2023         wait_for_completion_io(&comp.comp);
2024
2025         dm_integrity_flush_buffers(ic);
2026 }
2027
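/*
 * Writer work: push the committed journal sections to the data device with
 * do_journal_write() and return the freed journal space to the free pool,
 * waking up writers that were waiting for journal space.
 */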
2028 static void integrity_writer(struct work_struct *w)
2029 {
2030         struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, writer_work);
2031         unsigned write_start, write_sections;
2032
2033         unsigned prev_free_sectors;
2034
2035         /* the following test is not needed, but it tests the replay code */
2036         if (READ_ONCE(ic->suspending))
2037                 return;
2038
2039         spin_lock_irq(&ic->endio_wait.lock);
2040         write_start = ic->committed_section;
2041         write_sections = ic->n_committed_sections;
2042         spin_unlock_irq(&ic->endio_wait.lock);
2043
2044         if (!write_sections)
2045                 return;
2046
2047         do_journal_write(ic, write_start, write_sections, false);
2048
2049         spin_lock_irq(&ic->endio_wait.lock);
2050
2051         ic->committed_section += write_sections;
2052         wraparound_section(ic, &ic->committed_section);
2053         ic->n_committed_sections -= write_sections;
2054
2055         prev_free_sectors = ic->free_sectors;
2056         ic->free_sectors += write_sections * ic->journal_section_entries;
2057         if (unlikely(!prev_free_sectors))
2058                 wake_up_locked(&ic->endio_wait);
2059
2060         spin_unlock_irq(&ic->endio_wait.lock);
2061 }
2062
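/*
 * Erase a range of journal sections: zero the data, stamp each sector with
 * the given commit sequence, mark every entry unused and write the sections
 * out to the journal area.
 */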
2063 static void init_journal(struct dm_integrity_c *ic, unsigned start_section,
2064                          unsigned n_sections, unsigned char commit_seq)
2065 {
2066         unsigned i, j, n;
2067
2068         if (!n_sections)
2069                 return;
2070
2071         for (n = 0; n < n_sections; n++) {
2072                 i = start_section + n;
2073                 wraparound_section(ic, &i);
2074                 for (j = 0; j < ic->journal_section_sectors; j++) {
2075                         struct journal_sector *js = access_journal(ic, i, j);
2076                         memset(&js->entries, 0, JOURNAL_SECTOR_DATA);
2077                         js->commit_id = dm_integrity_commit_id(ic, i, j, commit_seq);
2078                 }
2079                 for (j = 0; j < ic->journal_section_entries; j++) {
2080                         struct journal_entry *je = access_journal_entry(ic, i, j);
2081                         journal_entry_set_unused(je);
2082                 }
2083         }
2084
2085         write_journal(ic, start_section, n_sections);
2086 }
2087
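/* Map an on-disk commit_id back to its commit sequence number. */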
2088 static int find_commit_seq(struct dm_integrity_c *ic, unsigned i, unsigned j, commit_id_t id)
2089 {
2090         unsigned char k;
2091         for (k = 0; k < N_COMMIT_IDS; k++) {
2092                 if (dm_integrity_commit_id(ic, i, j, k) == id)
2093                         return k;
2094         }
2095         dm_integrity_io_error(ic, "journal commit id", -EIO);
2096         return -EIO;
2097 }
2098
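/*
 * Replay the journal after an unclean shutdown. The journal is read (and
 * decrypted if encrypted) first; the commit ids stored in every journal
 * sector are then used to find the newest consistently written sections and
 * those are written back to the data device with do_journal_write(). If the
 * on-disk journal cannot simply be continued from, it is erased and
 * reinitialized.
 */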
2099 static void replay_journal(struct dm_integrity_c *ic)
2100 {
2101         unsigned i, j;
2102         bool used_commit_ids[N_COMMIT_IDS];
2103         unsigned max_commit_id_sections[N_COMMIT_IDS];
2104         unsigned write_start, write_sections;
2105         unsigned continue_section;
2106         bool journal_empty;
2107         unsigned char unused, last_used, want_commit_seq;
2108
2109         if (ic->mode == 'R')
2110                 return;
2111
2112         if (ic->journal_uptodate)
2113                 return;
2114
2115         last_used = 0;
2116         write_start = 0;
2117
2118         if (!ic->just_formatted) {
2119                 DEBUG_print("reading journal\n");
2120                 rw_journal(ic, REQ_OP_READ, 0, 0, ic->journal_sections, NULL);
2121                 if (ic->journal_io)
2122                         DEBUG_bytes(lowmem_page_address(ic->journal_io[0].page), 64, "read journal");
2123                 if (ic->journal_io) {
2124                         struct journal_completion crypt_comp;
2125                         crypt_comp.ic = ic;
2126                         init_completion(&crypt_comp.comp);
2127                         crypt_comp.in_flight = (atomic_t)ATOMIC_INIT(0);
2128                         encrypt_journal(ic, false, 0, ic->journal_sections, &crypt_comp);
2129                         wait_for_completion(&crypt_comp.comp);
2130                 }
2131                 DEBUG_bytes(lowmem_page_address(ic->journal[0].page), 64, "decrypted journal");
2132         }
2133
2134         if (dm_integrity_failed(ic))
2135                 goto clear_journal;
2136
2137         journal_empty = true;
2138         memset(used_commit_ids, 0, sizeof used_commit_ids);
2139         memset(max_commit_id_sections, 0, sizeof max_commit_id_sections);
2140         for (i = 0; i < ic->journal_sections; i++) {
2141                 for (j = 0; j < ic->journal_section_sectors; j++) {
2142                         int k;
2143                         struct journal_sector *js = access_journal(ic, i, j);
2144                         k = find_commit_seq(ic, i, j, js->commit_id);
2145                         if (k < 0)
2146                                 goto clear_journal;
2147                         used_commit_ids[k] = true;
2148                         max_commit_id_sections[k] = i;
2149                 }
2150                 if (journal_empty) {
2151                         for (j = 0; j < ic->journal_section_entries; j++) {
2152                                 struct journal_entry *je = access_journal_entry(ic, i, j);
2153                                 if (!journal_entry_is_unused(je)) {
2154                                         journal_empty = false;
2155                                         break;
2156                                 }
2157                         }
2158                 }
2159         }
2160
2161         if (!used_commit_ids[N_COMMIT_IDS - 1]) {
2162                 unused = N_COMMIT_IDS - 1;
2163                 while (unused && !used_commit_ids[unused - 1])
2164                         unused--;
2165         } else {
2166                 for (unused = 0; unused < N_COMMIT_IDS; unused++)
2167                         if (!used_commit_ids[unused])
2168                                 break;
2169                 if (unused == N_COMMIT_IDS) {
2170                         dm_integrity_io_error(ic, "journal commit ids", -EIO);
2171                         goto clear_journal;
2172                 }
2173         }
2174         DEBUG_print("first unused commit seq %d [%d,%d,%d,%d]\n",
2175                     unused, used_commit_ids[0], used_commit_ids[1],
2176                     used_commit_ids[2], used_commit_ids[3]);
2177
2178         last_used = prev_commit_seq(unused);
2179         want_commit_seq = prev_commit_seq(last_used);
2180
2181         if (!used_commit_ids[want_commit_seq] && used_commit_ids[prev_commit_seq(want_commit_seq)])
2182                 journal_empty = true;
2183
2184         write_start = max_commit_id_sections[last_used] + 1;
2185         if (unlikely(write_start >= ic->journal_sections))
2186                 want_commit_seq = next_commit_seq(want_commit_seq);
2187         wraparound_section(ic, &write_start);
2188
2189         i = write_start;
2190         for (write_sections = 0; write_sections < ic->journal_sections; write_sections++) {
2191                 for (j = 0; j < ic->journal_section_sectors; j++) {
2192                         struct journal_sector *js = access_journal(ic, i, j);
2193
2194                         if (js->commit_id != dm_integrity_commit_id(ic, i, j, want_commit_seq)) {
2195                                 /*
2196                                  * This could be caused by a crash during writing.
2197                                  * We won't replay the inconsistent part of the
2198                                  * journal.
2199                                  */
2200                                 DEBUG_print("commit id mismatch at position (%u, %u): %d != %d\n",
2201                                             i, j, find_commit_seq(ic, i, j, js->commit_id), want_commit_seq);
2202                                 goto brk;
2203                         }
2204                 }
2205                 i++;
2206                 if (unlikely(i >= ic->journal_sections))
2207                         want_commit_seq = next_commit_seq(want_commit_seq);
2208                 wraparound_section(ic, &i);
2209         }
2210 brk:
2211
2212         if (!journal_empty) {
2213                 DEBUG_print("replaying %u sections, starting at %u, commit seq %d\n",
2214                             write_sections, write_start, want_commit_seq);
2215                 do_journal_write(ic, write_start, write_sections, true);
2216         }
2217
2218         if (write_sections == ic->journal_sections && (ic->mode == 'J' || journal_empty)) {
2219                 continue_section = write_start;
2220                 ic->commit_seq = want_commit_seq;
2221                 DEBUG_print("continuing from section %u, commit seq %d\n", write_start, ic->commit_seq);
2222         } else {
2223                 unsigned s;
2224                 unsigned char erase_seq;
2225 clear_journal:
2226                 DEBUG_print("clearing journal\n");
2227
2228                 erase_seq = prev_commit_seq(prev_commit_seq(last_used));
2229                 s = write_start;
2230                 init_journal(ic, s, 1, erase_seq);
2231                 s++;
2232                 wraparound_section(ic, &s);
2233                 if (ic->journal_sections >= 2) {
2234                         init_journal(ic, s, ic->journal_sections - 2, erase_seq);
2235                         s += ic->journal_sections - 2;
2236                         wraparound_section(ic, &s);
2237                         init_journal(ic, s, 1, erase_seq);
2238                 }
2239
2240                 continue_section = 0;
2241                 ic->commit_seq = next_commit_seq(erase_seq);
2242         }
2243
2244         ic->committed_section = continue_section;
2245         ic->n_committed_sections = 0;
2246
2247         ic->uncommitted_section = continue_section;
2248         ic->n_uncommitted_sections = 0;
2249
2250         ic->free_section = continue_section;
2251         ic->free_section_entry = 0;
2252         ic->free_sectors = ic->journal_entries;
2253
2254         ic->journal_tree_root = RB_ROOT;
2255         for (i = 0; i < ic->journal_entries; i++)
2256                 init_journal_node(&ic->journal_tree[i]);
2257 }
2258
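/*
 * Suspend: stop the autocommit timer, force a final commit and, in journal
 * mode, drain the writer and flush the buffers so that nothing remains in
 * flight; the in-memory journal is then marked up to date.
 */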
2259 static void dm_integrity_postsuspend(struct dm_target *ti)
2260 {
2261         struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
2262
2263         del_timer_sync(&ic->autocommit_timer);
2264
2265         WRITE_ONCE(ic->suspending, 1);
2266
2267         queue_work(ic->commit_wq, &ic->commit_work);
2268         drain_workqueue(ic->commit_wq);
2269
2270         if (ic->mode == 'J') {
2271                 drain_workqueue(ic->writer_wq);
2272                 dm_integrity_flush_buffers(ic);
2273         }
2274
2275         WRITE_ONCE(ic->suspending, 0);
2276
2277         BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
2278
2279         ic->journal_uptodate = true;
2280 }
2281
2282 static void dm_integrity_resume(struct dm_target *ti)
2283 {
2284         struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
2285
2286         replay_journal(ic);
2287 }
2288
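/*
 * Status: STATUSTYPE_INFO reports "<mismatches> <provided_data_sectors>",
 * STATUSTYPE_TABLE reconstructs the table line with all optional arguments.
 * An illustrative table line (values depend on the configuration) looks like:
 *   /dev/sdb 0 4 J 5 journal_sectors:8192 interleave_sectors:32768
 *   buffer_sectors:128 journal_watermark:50 commit_time:10000
 */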
2289 static void dm_integrity_status(struct dm_target *ti, status_type_t type,
2290                                 unsigned status_flags, char *result, unsigned maxlen)
2291 {
2292         struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
2293         unsigned arg_count;
2294         size_t sz = 0;
2295
2296         switch (type) {
2297         case STATUSTYPE_INFO:
2298                 DMEMIT("%llu %llu",
2299                         (unsigned long long)atomic64_read(&ic->number_of_mismatches),
2300                         (unsigned long long)ic->provided_data_sectors);
2301                 break;
2302
2303         case STATUSTYPE_TABLE: {
2304                 __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100;
2305                 watermark_percentage += ic->journal_entries / 2;
2306                 do_div(watermark_percentage, ic->journal_entries);
2307                 arg_count = 5;
2308                 arg_count += ic->sectors_per_block != 1;
2309                 arg_count += !!ic->internal_hash_alg.alg_string;
2310                 arg_count += !!ic->journal_crypt_alg.alg_string;
2311                 arg_count += !!ic->journal_mac_alg.alg_string;
2312                 DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start,
2313                        ic->tag_size, ic->mode, arg_count);
2314                 DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
2315                 DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
2316                 DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
2317                 DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage);
2318                 DMEMIT(" commit_time:%u", ic->autocommit_msec);
2319                 if (ic->sectors_per_block != 1)
2320                         DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
2321
2322 #define EMIT_ALG(a, n)                                                  \
2323                 do {                                                    \
2324                         if (ic->a.alg_string) {                         \
2325                                 DMEMIT(" %s:%s", n, ic->a.alg_string);  \
2326                                 if (ic->a.key_string)                   \
2327                                         DMEMIT(":%s", ic->a.key_string);\
2328                         }                                               \
2329                 } while (0)
2330                 EMIT_ALG(internal_hash_alg, "internal_hash");
2331                 EMIT_ALG(journal_crypt_alg, "journal_crypt");
2332                 EMIT_ALG(journal_mac_alg, "journal_mac");
2333                 break;
2334         }
2335         }
2336 }
2337
2338 static int dm_integrity_iterate_devices(struct dm_target *ti,
2339                                         iterate_devices_callout_fn fn, void *data)
2340 {
2341         struct dm_integrity_c *ic = ti->private;
2342
2343         return fn(ti, ic->dev, ic->start + ic->initial_sectors + ic->metadata_run, ti->len, data);
2344 }
2345
2346 static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *limits)
2347 {
2348         struct dm_integrity_c *ic = ti->private;
2349
2350         if (ic->sectors_per_block > 1) {
2351                 limits->logical_block_size = ic->sectors_per_block << SECTOR_SHIFT;
2352                 limits->physical_block_size = ic->sectors_per_block << SECTOR_SHIFT;
2353                 blk_limits_io_min(limits, ic->sectors_per_block << SECTOR_SHIFT);
2354         }
2355 }
2356
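/*
 * Derive the journal geometry from the superblock: the size of one journal
 * entry, how many entries fit in a sector and in a section, how many sectors
 * one section occupies and how many entries the whole journal holds.
 */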
2357 static void calculate_journal_section_size(struct dm_integrity_c *ic)
2358 {
2359         unsigned sector_space = JOURNAL_SECTOR_DATA;
2360
2361         ic->journal_sections = le32_to_cpu(ic->sb->journal_sections);
2362         ic->journal_entry_size = roundup(offsetof(struct journal_entry, last_bytes[ic->sectors_per_block]) + ic->tag_size,
2363                                          JOURNAL_ENTRY_ROUNDUP);
2364
2365         if (ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC))
2366                 sector_space -= JOURNAL_MAC_PER_SECTOR;
2367         ic->journal_entries_per_sector = sector_space / ic->journal_entry_size;
2368         ic->journal_section_entries = ic->journal_entries_per_sector * JOURNAL_BLOCK_SECTORS;
2369         ic->journal_section_sectors = (ic->journal_section_entries << ic->sb->log2_sectors_per_block) + JOURNAL_BLOCK_SECTORS;
2370         ic->journal_entries = ic->journal_section_entries * ic->journal_sections;
2371 }
2372
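/*
 * Compute the on-disk layout (superblock + journal + interleaved metadata
 * and data) for the current provided_data_sectors and check that it fits on
 * the underlying device.
 */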
2373 static int calculate_device_limits(struct dm_integrity_c *ic)
2374 {
2375         __u64 initial_sectors;
2376         sector_t last_sector, last_area, last_offset;
2377
2378         calculate_journal_section_size(ic);
2379         initial_sectors = SB_SECTORS + (__u64)ic->journal_section_sectors * ic->journal_sections;
2380         if (initial_sectors + METADATA_PADDING_SECTORS >= ic->device_sectors || initial_sectors > UINT_MAX)
2381                 return -EINVAL;
2382         ic->initial_sectors = initial_sectors;
2383
2384         ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block),
2385                                    (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT;
2386         if (!(ic->metadata_run & (ic->metadata_run - 1)))
2387                 ic->log2_metadata_run = __ffs(ic->metadata_run);
2388         else
2389                 ic->log2_metadata_run = -1;
2390
2391         get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset);
2392         last_sector = get_data_sector(ic, last_area, last_offset);
2393
2394         if (ic->start + last_sector < last_sector || ic->start + last_sector >= ic->device_sectors)
2395                 return -EINVAL;
2396
2397         return 0;
2398 }
2399
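/*
 * Format a fresh superblock. provided_data_sectors is determined by a
 * bitwise binary search: each bit is tried from the highest downwards and
 * kept only if calculate_device_limits() still succeeds with it set.
 */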
2400 static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sectors, unsigned interleave_sectors)
2401 {
2402         unsigned journal_sections;
2403         int test_bit;
2404
2405         memset(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT);
2406         memcpy(ic->sb->magic, SB_MAGIC, 8);
2407         ic->sb->version = SB_VERSION;
2408         ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size);
2409         ic->sb->log2_sectors_per_block = __ffs(ic->sectors_per_block);
2410         if (ic->journal_mac_alg.alg_string)
2411                 ic->sb->flags |= cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC);
2412
2413         calculate_journal_section_size(ic);
2414         journal_sections = journal_sectors / ic->journal_section_sectors;
2415         if (!journal_sections)
2416                 journal_sections = 1;
2417         ic->sb->journal_sections = cpu_to_le32(journal_sections);
2418
2419         if (!interleave_sectors)
2420                 interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
2421         ic->sb->log2_interleave_sectors = __fls(interleave_sectors);
2422         ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
2423         ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
2424
2425         ic->provided_data_sectors = 0;
2426         for (test_bit = fls64(ic->device_sectors) - 1; test_bit >= 3; test_bit--) {
2427                 __u64 prev_data_sectors = ic->provided_data_sectors;
2428
2429                 ic->provided_data_sectors |= (sector_t)1 << test_bit;
2430                 if (calculate_device_limits(ic))
2431                         ic->provided_data_sectors = prev_data_sectors;
2432         }
2433
2434         if (!ic->provided_data_sectors)
2435                 return -EINVAL;
2436
2437         ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors);
2438
2439         return 0;
2440 }
2441
2442 static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic)
2443 {
2444         struct gendisk *disk = dm_disk(dm_table_get_md(ti->table));
2445         struct blk_integrity bi;
2446
2447         memset(&bi, 0, sizeof(bi));
2448         bi.profile = &dm_integrity_profile;
2449         bi.tuple_size = ic->tag_size;
2450         bi.tag_size = bi.tuple_size;
2451         bi.interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT;
2452
2453         blk_integrity_register(disk, &bi);
2454         blk_queue_max_integrity_segments(disk->queue, UINT_MAX);
2455 }
2456
2457 static void dm_integrity_free_page_list(struct dm_integrity_c *ic, struct page_list *pl)
2458 {
2459         unsigned i;
2460
2461         if (!pl)
2462                 return;
2463         for (i = 0; i < ic->journal_pages; i++)
2464                 if (pl[i].page)
2465                         __free_page(pl[i].page);
2466         kvfree(pl);
2467 }
2468
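/* Allocate a dm-io style page_list with one page for each journal page. */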
2469 static struct page_list *dm_integrity_alloc_page_list(struct dm_integrity_c *ic)
2470 {
2471         size_t page_list_desc_size = ic->journal_pages * sizeof(struct page_list);
2472         struct page_list *pl;
2473         unsigned i;
2474
2475         pl = kvmalloc(page_list_desc_size, GFP_KERNEL | __GFP_ZERO);
2476         if (!pl)
2477                 return NULL;
2478
2479         for (i = 0; i < ic->journal_pages; i++) {
2480                 pl[i].page = alloc_page(GFP_KERNEL);
2481                 if (!pl[i].page) {
2482                         dm_integrity_free_page_list(ic, pl);
2483                         return NULL;
2484                 }
2485                 if (i)
2486                         pl[i - 1].next = &pl[i];
2487         }
2488
2489         return pl;
2490 }
2491
2492 static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, struct scatterlist **sl)
2493 {
2494         unsigned i;
2495         for (i = 0; i < ic->journal_sections; i++)
2496                 kvfree(sl[i]);
2497         kvfree(sl);
2498 }
2499
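/*
 * Build one scatterlist per journal section covering the pages of the given
 * page list that back that section; used when encrypting or decrypting the
 * journal.
 */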
2500 static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, struct page_list *pl)
2501 {
2502         struct scatterlist **sl;
2503         unsigned i;
2504
2505         sl = kvmalloc_array(ic->journal_sections,
2506                             sizeof(struct scatterlist *),
2507                             GFP_KERNEL | __GFP_ZERO);
2508         if (!sl)
2509                 return NULL;
2510
2511         for (i = 0; i < ic->journal_sections; i++) {
2512                 struct scatterlist *s;
2513                 unsigned start_index, start_offset;
2514                 unsigned end_index, end_offset;
2515                 unsigned n_pages;
2516                 unsigned idx;
2517
2518                 page_list_location(ic, i, 0, &start_index, &start_offset);
2519                 page_list_location(ic, i, ic->journal_section_sectors - 1, &end_index, &end_offset);
2520
2521                 n_pages = (end_index - start_index + 1);
2522
2523                 s = kvmalloc_array(n_pages, sizeof(struct scatterlist),
2524                                    GFP_KERNEL);
2525                 if (!s) {
2526                         dm_integrity_free_journal_scatterlist(ic, sl);
2527                         return NULL;
2528                 }
2529
2530                 sg_init_table(s, n_pages);
2531                 for (idx = start_index; idx <= end_index; idx++) {
2532                         char *va = lowmem_page_address(pl[idx].page);
2533                         unsigned start = 0, end = PAGE_SIZE;
2534                         if (idx == start_index)
2535                                 start = start_offset;
2536                         if (idx == end_index)
2537                                 end = end_offset + (1 << SECTOR_SHIFT);
2538                         sg_set_buf(&s[idx - start_index], va + start, end - start);
2539                 }
2540
2541                 sl[i] = s;
2542         }
2543
2544         return sl;
2545 }
2546
2547 static void free_alg(struct alg_spec *a)
2548 {
2549         kzfree(a->alg_string);
2550         kzfree(a->key);
2551         memset(a, 0, sizeof *a);
2552 }
2553
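/*
 * Parse an argument of the form "name:algorithm[:hexkey]" into an alg_spec;
 * the key, if present, is hex-decoded. For example (illustrative only),
 * "journal_mac:hmac(sha256):0123..." selects hmac(sha256) with the given key.
 */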
2554 static int get_alg_and_key(const char *arg, struct alg_spec *a, char **error, char *error_inval)
2555 {
2556         char *k;
2557
2558         free_alg(a);
2559
2560         a->alg_string = kstrdup(strchr(arg, ':') + 1, GFP_KERNEL);
2561         if (!a->alg_string)
2562                 goto nomem;
2563
2564         k = strchr(a->alg_string, ':');
2565         if (k) {
2566                 *k = 0;
2567                 a->key_string = k + 1;
2568                 if (strlen(a->key_string) & 1)
2569                         goto inval;
2570
2571                 a->key_size = strlen(a->key_string) / 2;
2572                 a->key = kmalloc(a->key_size, GFP_KERNEL);
2573                 if (!a->key)
2574                         goto nomem;
2575                 if (hex2bin(a->key, a->key_string, a->key_size))
2576                         goto inval;
2577         }
2578
2579         return 0;
2580 inval:
2581         *error = error_inval;
2582         return -EINVAL;
2583 nomem:
2584         *error = "Out of memory for an argument";
2585         return -ENOMEM;
2586 }
2587
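/*
 * Allocate a synchronous hash transform for the given alg_spec and set its
 * key if one was supplied; a keyed algorithm without a key is rejected with
 * -ENOKEY.
 */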
2588 static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error,
2589                    char *error_alg, char *error_key)
2590 {
2591         int r;
2592
2593         if (a->alg_string) {
2594                 *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ASYNC);
2595                 if (IS_ERR(*hash)) {
2596                         *error = error_alg;
2597                         r = PTR_ERR(*hash);
2598                         *hash = NULL;
2599                         return r;
2600                 }
2601
2602                 if (a->key) {
2603                         r = crypto_shash_setkey(*hash, a->key, a->key_size);
2604                         if (r) {
2605                                 *error = error_key;
2606                                 return r;
2607                         }
2608                 } else if (crypto_shash_get_flags(*hash) & CRYPTO_TFM_NEED_KEY) {
2609                         *error = error_key;
2610                         return -ENOKEY;
2611                 }
2612         }
2613
2614         return 0;
2615 }
2616
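/*
 * Allocate the in-memory journal and, when journal encryption is configured,
 * the cipher transform, the journal_io pages and the scatterlists / skcipher
 * requests needed to encrypt and decrypt it. The commit ids are also
 * initialized here.
 */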
2617 static int create_journal(struct dm_integrity_c *ic, char **error)
2618 {
2619         int r = 0;
2620         unsigned i;
2621         __u64 journal_pages, journal_desc_size, journal_tree_size;
2622         unsigned char *crypt_data = NULL, *crypt_iv = NULL;
2623         struct skcipher_request *req = NULL;
2624
2625         ic->commit_ids[0] = cpu_to_le64(0x1111111111111111ULL);
2626         ic->commit_ids[1] = cpu_to_le64(0x2222222222222222ULL);
2627         ic->commit_ids[2] = cpu_to_le64(0x3333333333333333ULL);
2628         ic->commit_ids[3] = cpu_to_le64(0x4444444444444444ULL);
2629
2630         journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors,
2631                                 PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT);
2632         journal_desc_size = journal_pages * sizeof(struct page_list);
2633         if (journal_pages >= totalram_pages - totalhigh_pages || journal_desc_size > ULONG_MAX) {
2634                 *error = "Journal doesn't fit into memory";
2635                 r = -ENOMEM;
2636                 goto bad;
2637         }
2638         ic->journal_pages = journal_pages;
2639
2640         ic->journal = dm_integrity_alloc_page_list(ic);
2641         if (!ic->journal) {
2642                 *error = "Could not allocate memory for journal";
2643                 r = -ENOMEM;
2644                 goto bad;
2645         }
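        /*
         * If journal encryption was requested, set it up.  Stream ciphers
         * (block size 1) are handled by encrypting a zeroed buffer once and
         * keeping the resulting keystream in ic->journal_xor; block ciphers
         * get one pre-allocated skcipher request per journal section, each
         * with its own derived IV (see the per-section loop below).
         */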
2646         if (ic->journal_crypt_alg.alg_string) {
2647                 unsigned ivsize, blocksize;
2648                 struct journal_completion comp;
2649
2650                 comp.ic = ic;
2651                 ic->journal_crypt = crypto_alloc_skcipher(ic->journal_crypt_alg.alg_string, 0, 0);
2652                 if (IS_ERR(ic->journal_crypt)) {
2653                         *error = "Invalid journal cipher";
2654                         r = PTR_ERR(ic->journal_crypt);
2655                         ic->journal_crypt = NULL;
2656                         goto bad;
2657                 }
2658                 ivsize = crypto_skcipher_ivsize(ic->journal_crypt);
2659                 blocksize = crypto_skcipher_blocksize(ic->journal_crypt);
2660
2661                 if (ic->journal_crypt_alg.key) {
2662                         r = crypto_skcipher_setkey(ic->journal_crypt, ic->journal_crypt_alg.key,
2663                                                    ic->journal_crypt_alg.key_size);
2664                         if (r) {
2665                                 *error = "Error setting encryption key";
2666                                 goto bad;
2667                         }
2668                 }
2669                 DEBUG_print("cipher %s, block size %u iv size %u\n",
2670                             ic->journal_crypt_alg.alg_string, blocksize, ivsize);
2671
2672                 ic->journal_io = dm_integrity_alloc_page_list(ic);
2673                 if (!ic->journal_io) {
2674                         *error = "Could not allocate memory for journal io";
2675                         r = -ENOMEM;
2676                         goto bad;
2677                 }
2678
2679                 if (blocksize == 1) {
2680                         struct scatterlist *sg;
2681
2682                         req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
2683                         if (!req) {
2684                                 *error = "Could not allocate crypt request";
2685                                 r = -ENOMEM;
2686                                 goto bad;
2687                         }
2688
2689                         crypt_iv = kmalloc(ivsize, GFP_KERNEL);
2690                         if (!crypt_iv) {
2691                                 *error = "Could not allocate iv";
2692                                 r = -ENOMEM;
2693                                 goto bad;
2694                         }
2695
2696                         ic->journal_xor = dm_integrity_alloc_page_list(ic);
2697                         if (!ic->journal_xor) {
2698                                 *error = "Could not allocate memory for journal xor";
2699                                 r = -ENOMEM;
2700                                 goto bad;
2701                         }
2702
2703                         sg = kvmalloc_array(ic->journal_pages + 1,
2704                                             sizeof(struct scatterlist),
2705                                             GFP_KERNEL);
2706                         if (!sg) {
2707                                 *error = "Unable to allocate sg list";
2708                                 r = -ENOMEM;
2709                                 goto bad;
2710                         }
2711                         sg_init_table(sg, ic->journal_pages + 1);
2712                         for (i = 0; i < ic->journal_pages; i++) {
2713                                 char *va = lowmem_page_address(ic->journal_xor[i].page);
2714                                 clear_page(va);
2715                                 sg_set_buf(&sg[i], va, PAGE_SIZE);
2716                         }
2717                         sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids);
2718                         memset(crypt_iv, 0x00, ivsize);
2719
2720                         skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv);
2721                         init_completion(&comp.comp);
2722                         comp.in_flight = (atomic_t)ATOMIC_INIT(1);
2723                         if (do_crypt(true, req, &comp))
2724                                 wait_for_completion(&comp.comp);
2725                         kvfree(sg);
2726                         r = dm_integrity_failed(ic);
2727                         if (r) {
2728                                 *error = "Unable to encrypt journal";
2729                                 goto bad;
2730                         }
2731                         DEBUG_bytes(lowmem_page_address(ic->journal_xor[0].page), 64, "xor data");
2732
2733                         crypto_free_skcipher(ic->journal_crypt);
2734                         ic->journal_crypt = NULL;
2735                 } else {
2736                         unsigned crypt_len = roundup(ivsize, blocksize);
2737
2738                         req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
2739                         if (!req) {
2740                                 *error = "Could not allocate crypt request";
2741                                 r = -ENOMEM;
2742                                 goto bad;
2743                         }
2744
2745                         crypt_iv = kmalloc(ivsize, GFP_KERNEL);
2746                         if (!crypt_iv) {
2747                                 *error = "Could not allocate iv";
2748                                 r = -ENOMEM;
2749                                 goto bad;
2750                         }
2751
2752                         crypt_data = kmalloc(crypt_len, GFP_KERNEL);
2753                         if (!crypt_data) {
2754                                 *error = "Unable to allocate crypt data";
2755                                 r = -ENOMEM;
2756                                 goto bad;
2757                         }
2758
2759                         ic->journal_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal);
2760                         if (!ic->journal_scatterlist) {
2761                                 *error = "Unable to allocate sg list";
2762                                 r = -ENOMEM;
2763                                 goto bad;
2764                         }
2765                         ic->journal_io_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal_io);
2766                         if (!ic->journal_io_scatterlist) {
2767                                 *error = "Unable to allocate sg list";
2768                                 r = -ENOMEM;
2769                                 goto bad;
2770                         }
2771                         ic->sk_requests = kvmalloc_array(ic->journal_sections,
2772                                                          sizeof(struct skcipher_request *),
2773                                                          GFP_KERNEL | __GFP_ZERO);
2774                         if (!ic->sk_requests) {
2775                                 *error = "Unable to allocate sk requests";
2776                                 r = -ENOMEM;
2777                                 goto bad;
2778                         }
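                        /*
                         * Derive a per-section IV by encrypting the
                         * little-endian section number, and pre-allocate a
                         * crypt request for each section so the journal can
                         * later be encrypted and decrypted section by section.
                         */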
2779                         for (i = 0; i < ic->journal_sections; i++) {
2780                                 struct scatterlist sg;
2781                                 struct skcipher_request *section_req;
2782                                 __u32 section_le = cpu_to_le32(i);
2783
2784                                 memset(crypt_iv, 0x00, ivsize);
2785                                 memset(crypt_data, 0x00, crypt_len);
2786                                 memcpy(crypt_data, &section_le, min((size_t)crypt_len, sizeof(section_le)));
2787
2788                                 sg_init_one(&sg, crypt_data, crypt_len);
2789                                 skcipher_request_set_crypt(req, &sg, &sg, crypt_len, crypt_iv);
2790                                 init_completion(&comp.comp);
2791                                 comp.in_flight = (atomic_t)ATOMIC_INIT(1);
2792                                 if (do_crypt(true, req, &comp))
2793                                         wait_for_completion(&comp.comp);
2794
2795                                 r = dm_integrity_failed(ic);
2796                                 if (r) {
2797                                         *error = "Unable to generate iv";
2798                                         goto bad;
2799                                 }
2800
2801                                 section_req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
2802                                 if (!section_req) {
2803                                         *error = "Unable to allocate crypt request";
2804                                         r = -ENOMEM;
2805                                         goto bad;
2806                                 }
2807                                 section_req->iv = kmalloc_array(ivsize, 2,
2808                                                                 GFP_KERNEL);
2809                                 if (!section_req->iv) {
2810                                         skcipher_request_free(section_req);
2811                                         *error = "Unable to allocate iv";
2812                                         r = -ENOMEM;
2813                                         goto bad;
2814                                 }
2815                                 memcpy(section_req->iv + ivsize, crypt_data, ivsize);
2816                                 section_req->cryptlen = (size_t)ic->journal_section_sectors << SECTOR_SHIFT;
2817                                 ic->sk_requests[i] = section_req;
2818                                 DEBUG_bytes(crypt_data, ivsize, "iv(%u)", i);
2819                         }
2820                 }
2821         }
2822
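        /*
         * The four commit ids start as fixed constants but may have been
         * encrypted above together with the journal; make sure they are all
         * distinct, bumping a value whenever it collides with an earlier one.
         */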
2823         for (i = 0; i < N_COMMIT_IDS; i++) {
2824                 unsigned j;
2825 retest_commit_id:
2826                 for (j = 0; j < i; j++) {
2827                         if (ic->commit_ids[j] == ic->commit_ids[i]) {
2828                                 ic->commit_ids[i] = cpu_to_le64(le64_to_cpu(ic->commit_ids[i]) + 1);
2829                                 goto retest_commit_id;
2830                         }
2831                 }
2832                 DEBUG_print("commit id %u: %016llx\n", i, ic->commit_ids[i]);
2833         }
2834
2835         journal_tree_size = (__u64)ic->journal_entries * sizeof(struct journal_node);
2836         if (journal_tree_size > ULONG_MAX) {
2837                 *error = "Journal doesn't fit into memory";
2838                 r = -ENOMEM;
2839                 goto bad;
2840         }
2841         ic->journal_tree = kvmalloc(journal_tree_size, GFP_KERNEL);
2842         if (!ic->journal_tree) {
2843                 *error = "Could not allocate memory for journal tree";
2844                 r = -ENOMEM;
2845         }
2846 bad:
2847         kfree(crypt_data);
2848         kfree(crypt_iv);
2849         skcipher_request_free(req);
2850
2851         return r;
2852 }
2853
2854 /*
2855  * Construct an integrity mapping
2856  *
2857  * Arguments:
2858  *      device
2859  *      offset from the start of the device
2860  *      tag size
2861  *      D - direct writes, J - journal writes, R - recovery mode
2862  *      number of optional arguments
2863  *      optional arguments:
2864  *              journal_sectors
2865  *              interleave_sectors
2866  *              buffer_sectors
2867  *              journal_watermark
2868  *              commit_time
2869  *              internal_hash
2870  *              journal_crypt
2871  *              journal_mac
2872  *              block_size
2873  */
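/*
 * Example table line (illustrative only; the device path, sizes and
 * algorithm are placeholders):
 *
 *   dmsetup create integ --table \
 *     "0 <provided_data_sectors> integrity /dev/sdb 0 4 J 1 internal_hash:crc32c"
 *
 * The target length must not exceed the provided_data_sectors value
 * reported by the target.  The tag size (4 here, the crc32c digest size)
 * may be given as "-" to use the internal hash's digest size.
 */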
2874 static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
2875 {
2876         struct dm_integrity_c *ic;
2877         char dummy;
2878         int r;
2879         unsigned extra_args;
2880         struct dm_arg_set as;
2881         static const struct dm_arg _args[] = {
2882                 {0, 9, "Invalid number of feature args"},
2883         };
2884         unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec;
2885         bool should_write_sb;
2886         __u64 threshold;
2887         unsigned long long start;
2888
2889 #define DIRECT_ARGUMENTS        4
2890
2891         if (argc <= DIRECT_ARGUMENTS) {
2892                 ti->error = "Invalid argument count";
2893                 return -EINVAL;
2894         }
2895
2896         ic = kzalloc(sizeof(struct dm_integrity_c), GFP_KERNEL);
2897         if (!ic) {
2898                 ti->error = "Cannot allocate integrity context";
2899                 return -ENOMEM;
2900         }
2901         ti->private = ic;
2902         ti->per_io_data_size = sizeof(struct dm_integrity_io);
2903
2904         ic->in_progress = RB_ROOT;
2905         INIT_LIST_HEAD(&ic->wait_list);
2906         init_waitqueue_head(&ic->endio_wait);
2907         bio_list_init(&ic->flush_bio_list);
2908         init_waitqueue_head(&ic->copy_to_journal_wait);
2909         init_completion(&ic->crypto_backoff);
2910         atomic64_set(&ic->number_of_mismatches, 0);
2911
2912         r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev);
2913         if (r) {
2914                 ti->error = "Device lookup failed";
2915                 goto bad;
2916         }
2917
2918         if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1 || start != (sector_t)start) {
2919                 ti->error = "Invalid starting offset";
2920                 r = -EINVAL;
2921                 goto bad;
2922         }
2923         ic->start = start;
2924
2925         if (strcmp(argv[2], "-")) {
2926                 if (sscanf(argv[2], "%u%c", &ic->tag_size, &dummy) != 1 || !ic->tag_size) {
2927                         ti->error = "Invalid tag size";
2928                         r = -EINVAL;
2929                         goto bad;
2930                 }
2931         }
2932
2933         if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D") || !strcmp(argv[3], "R"))
2934                 ic->mode = argv[3][0];
2935         else {
2936                 ti->error = "Invalid mode (expecting J, D, R)";
2937                 r = -EINVAL;
2938                 goto bad;
2939         }
2940
2941         ic->device_sectors = i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT;
2942         journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS,
2943                         ic->device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR);
2944         interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
2945         buffer_sectors = DEFAULT_BUFFER_SECTORS;
2946         journal_watermark = DEFAULT_JOURNAL_WATERMARK;
2947         sync_msec = DEFAULT_SYNC_MSEC;
2948         ic->sectors_per_block = 1;
2949
2950         as.argc = argc - DIRECT_ARGUMENTS;
2951         as.argv = argv + DIRECT_ARGUMENTS;
2952         r = dm_read_arg_group(_args, &as, &extra_args, &ti->error);
2953         if (r)
2954                 goto bad;
2955
2956         while (extra_args--) {
2957                 const char *opt_string;
2958                 unsigned val;
2959                 opt_string = dm_shift_arg(&as);
2960                 if (!opt_string) {
2961                         r = -EINVAL;
2962                         ti->error = "Not enough feature arguments";
2963                         goto bad;
2964                 }
2965                 if (sscanf(opt_string, "journal_sectors:%u%c", &val, &dummy) == 1)
2966                         journal_sectors = val;
2967                 else if (sscanf(opt_string, "interleave_sectors:%u%c", &val, &dummy) == 1)
2968                         interleave_sectors = val;
2969                 else if (sscanf(opt_string, "buffer_sectors:%u%c", &val, &dummy) == 1)
2970                         buffer_sectors = val;
2971                 else if (sscanf(opt_string, "journal_watermark:%u%c", &val, &dummy) == 1 && val <= 100)
2972                         journal_watermark = val;
2973                 else if (sscanf(opt_string, "commit_time:%u%c", &val, &dummy) == 1)
2974                         sync_msec = val;
2975                 else if (sscanf(opt_string, "block_size:%u%c", &val, &dummy) == 1) {
2976                         if (val < 1 << SECTOR_SHIFT ||
2977                             val > MAX_SECTORS_PER_BLOCK << SECTOR_SHIFT ||
2978                             (val & (val - 1))) {
2979                                 r = -EINVAL;
2980                                 ti->error = "Invalid block_size argument";
2981                                 goto bad;
2982                         }
2983                         ic->sectors_per_block = val >> SECTOR_SHIFT;
2984                 } else if (!memcmp(opt_string, "internal_hash:", strlen("internal_hash:"))) {
2985                         r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error,
2986                                             "Invalid internal_hash argument");
2987                         if (r)
2988                                 goto bad;
2989                 } else if (!memcmp(opt_string, "journal_crypt:", strlen("journal_crypt:"))) {
2990                         r = get_alg_and_key(opt_string, &ic->journal_crypt_alg, &ti->error,
2991                                             "Invalid journal_crypt argument");
2992                         if (r)
2993                                 goto bad;
2994                 } else if (!memcmp(opt_string, "journal_mac:", strlen("journal_mac:"))) {
2995                         r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error,
2996                                             "Invalid journal_mac argument");
2997                         if (r)
2998                                 goto bad;
2999                 } else {
3000                         r = -EINVAL;
3001                         ti->error = "Invalid argument";
3002                         goto bad;
3003                 }
3004         }
3005
3006         r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error,
3007                     "Invalid internal hash", "Error setting internal hash key");
3008         if (r)
3009                 goto bad;
3010
3011         r = get_mac(&ic->journal_mac, &ic->journal_mac_alg, &ti->error,
3012                     "Invalid journal mac", "Error setting journal mac key");
3013         if (r)
3014                 goto bad;
3015
3016         if (!ic->tag_size) {
3017                 if (!ic->internal_hash) {
3018                         ti->error = "Unknown tag size";
3019                         r = -EINVAL;
3020                         goto bad;
3021                 }
3022                 ic->tag_size = crypto_shash_digestsize(ic->internal_hash);
3023         }
3024         if (ic->tag_size > MAX_TAG_SIZE) {
3025                 ti->error = "Too big tag size";
3026                 r = -EINVAL;
3027                 goto bad;
3028         }
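        /*
         * If the tag size is a power of two, remember its log2 so tag
         * offsets can be computed with shifts; otherwise fall back to
         * multiplication/division (log2_tag_size == -1).
         */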
3029         if (!(ic->tag_size & (ic->tag_size - 1)))
3030                 ic->log2_tag_size = __ffs(ic->tag_size);
3031         else
3032                 ic->log2_tag_size = -1;
3033
3034         ic->autocommit_jiffies = msecs_to_jiffies(sync_msec);
3035         ic->autocommit_msec = sync_msec;
3036         timer_setup(&ic->autocommit_timer, autocommit_fn, 0);
3037
3038         ic->io = dm_io_client_create();
3039         if (IS_ERR(ic->io)) {
3040                 r = PTR_ERR(ic->io);
3041                 ic->io = NULL;
3042                 ti->error = "Cannot allocate dm io";
3043                 goto bad;
3044         }
3045
3046         r = mempool_init_slab_pool(&ic->journal_io_mempool, JOURNAL_IO_MEMPOOL, journal_io_cache);
3047         if (r) {
3048                 ti->error = "Cannot allocate mempool";
3049                 goto bad;
3050         }
3051
3052         ic->metadata_wq = alloc_workqueue("dm-integrity-metadata",
3053                                           WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE);
3054         if (!ic->metadata_wq) {
3055                 ti->error = "Cannot allocate workqueue";
3056                 r = -ENOMEM;
3057                 goto bad;
3058         }
3059
3060         /*
3061          * If this workqueue were percpu, it would cause bio reordering
3062          * and reduced performance.
3063          */
3064         ic->wait_wq = alloc_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3065         if (!ic->wait_wq) {
3066                 ti->error = "Cannot allocate workqueue";
3067                 r = -ENOMEM;
3068                 goto bad;
3069         }
3070
3071         ic->commit_wq = alloc_workqueue("dm-integrity-commit", WQ_MEM_RECLAIM, 1);
3072         if (!ic->commit_wq) {
3073                 ti->error = "Cannot allocate workqueue";
3074                 r = -ENOMEM;
3075                 goto bad;
3076         }
3077         INIT_WORK(&ic->commit_work, integrity_commit);
3078
3079         if (ic->mode == 'J') {
3080                 ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1);
3081                 if (!ic->writer_wq) {
3082                         ti->error = "Cannot allocate workqueue";
3083                         r = -ENOMEM;
3084                         goto bad;
3085                 }
3086                 INIT_WORK(&ic->writer_work, integrity_writer);
3087         }
3088
3089         ic->sb = alloc_pages_exact(SB_SECTORS << SECTOR_SHIFT, GFP_KERNEL);
3090         if (!ic->sb) {
3091                 r = -ENOMEM;
3092                 ti->error = "Cannot allocate superblock area";
3093                 goto bad;
3094         }
3095
3096         r = sync_rw_sb(ic, REQ_OP_READ, 0);
3097         if (r) {
3098                 ti->error = "Error reading superblock";
3099                 goto bad;
3100         }
3101         should_write_sb = false;
3102         if (memcmp(ic->sb->magic, SB_MAGIC, 8)) {
3103                 if (ic->mode != 'R') {
3104                         if (memchr_inv(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT)) {
3105                                 r = -EINVAL;
3106                                 ti->error = "The device is not initialized";
3107                                 goto bad;
3108                         }
3109                 }
3110
3111                 r = initialize_superblock(ic, journal_sectors, interleave_sectors);
3112                 if (r) {
3113                         ti->error = "Could not initialize superblock";
3114                         goto bad;
3115                 }
3116                 if (ic->mode != 'R')
3117                         should_write_sb = true;
3118         }
3119
3120         if (ic->sb->version != SB_VERSION) {
3121                 r = -EINVAL;
3122                 ti->error = "Unknown version";
3123                 goto bad;
3124         }
3125         if (le16_to_cpu(ic->sb->integrity_tag_size) != ic->tag_size) {
3126                 r = -EINVAL;
3127                 ti->error = "Tag size doesn't match the information in superblock";
3128                 goto bad;
3129         }
3130         if (ic->sb->log2_sectors_per_block != __ffs(ic->sectors_per_block)) {
3131                 r = -EINVAL;
3132                 ti->error = "Block size doesn't match the information in superblock";
3133                 goto bad;
3134         }
3135         if (!le32_to_cpu(ic->sb->journal_sections)) {
3136                 r = -EINVAL;
3137                 ti->error = "Corrupted superblock, journal_sections is 0";
3138                 goto bad;
3139         }
3140         /* make sure that ti->max_io_len doesn't overflow */
3141         if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS ||
3142             ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) {
3143                 r = -EINVAL;
3144                 ti->error = "Invalid interleave_sectors in the superblock";
3145                 goto bad;
3146         }
3147         ic->provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors);
3148         if (ic->provided_data_sectors != le64_to_cpu(ic->sb->provided_data_sectors)) {
3149                 /* test for overflow */
3150                 r = -EINVAL;
3151                 ti->error = "The superblock has 64-bit device size, but the kernel was compiled with 32-bit sectors";
3152                 goto bad;
3153         }
3154         if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) {
3155                 r = -EINVAL;
3156                 ti->error = "Journal mac mismatch";
3157                 goto bad;
3158         }
3159         r = calculate_device_limits(ic);
3160         if (r) {
3161                 ti->error = "The device is too small";
3162                 goto bad;
3163         }
3164         if (ti->len > ic->provided_data_sectors) {
3165                 r = -EINVAL;
3166                 ti->error = "Not enough provided sectors for requested mapping size";
3167                 goto bad;
3168         }
3169
3170         if (!buffer_sectors)
3171                 buffer_sectors = 1;
3172         ic->log2_buffer_sectors = min3((int)__fls(buffer_sectors), (int)__ffs(ic->metadata_run), 31 - SECTOR_SHIFT);
3173
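        /*
         * Convert the journal watermark percentage into an absolute count:
         * threshold = journal_entries * (100 - watermark) / 100, rounded.
         * When the free space in the journal falls below this value, a
         * commit is scheduled without waiting for the commit timer.
         */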
3174         threshold = (__u64)ic->journal_entries * (100 - journal_watermark);
3175         threshold += 50;
3176         do_div(threshold, 100);
3177         ic->free_sectors_threshold = threshold;
3178
3179         DEBUG_print("initialized:\n");
3180         DEBUG_print("   integrity_tag_size %u\n", le16_to_cpu(ic->sb->integrity_tag_size));
3181         DEBUG_print("   journal_entry_size %u\n", ic->journal_entry_size);
3182         DEBUG_print("   journal_entries_per_sector %u\n", ic->journal_entries_per_sector);
3183         DEBUG_print("   journal_section_entries %u\n", ic->journal_section_entries);
3184         DEBUG_print("   journal_section_sectors %u\n", ic->journal_section_sectors);
3185         DEBUG_print("   journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections));
3186         DEBUG_print("   journal_entries %u\n", ic->journal_entries);
3187         DEBUG_print("   log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors);
3188         DEBUG_print("   device_sectors 0x%llx\n", (unsigned long long)ic->device_sectors);
3189         DEBUG_print("   initial_sectors 0x%x\n", ic->initial_sectors);
3190         DEBUG_print("   metadata_run 0x%x\n", ic->metadata_run);
3191         DEBUG_print("   log2_metadata_run %d\n", ic->log2_metadata_run);
3192         DEBUG_print("   provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors,
3193                     (unsigned long long)ic->provided_data_sectors);
3194         DEBUG_print("   log2_buffer_sectors %u\n", ic->log2_buffer_sectors);
3195
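        /*
         * The bufio client is used to read and write the on-disk tag
         * metadata in buffers of 2^log2_buffer_sectors sectors, starting
         * past the superblock and journal (ic->start + ic->initial_sectors).
         */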
3196         ic->bufio = dm_bufio_client_create(ic->dev->bdev, 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors),
3197                                            1, 0, NULL, NULL);
3198         if (IS_ERR(ic->bufio)) {
3199                 r = PTR_ERR(ic->bufio);
3200                 ti->error = "Cannot initialize dm-bufio";
3201                 ic->bufio = NULL;
3202                 goto bad;
3203         }
3204         dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors);
3205
3206         if (ic->mode != 'R') {
3207                 r = create_journal(ic, &ti->error);
3208                 if (r)
3209                         goto bad;
3210         }
3211
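        /*
         * A freshly formatted device: initialize the whole journal first and
         * write the superblock last (with FUA), so an interrupted format is
         * not mistaken for a valid device.
         */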
3212         if (should_write_sb) {
3215                 init_journal(ic, 0, ic->journal_sections, 0);
3216                 r = dm_integrity_failed(ic);
3217                 if (unlikely(r)) {
3218                         ti->error = "Error initializing journal";
3219                         goto bad;
3220                 }
3221                 r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
3222                 if (r) {
3223                         ti->error = "Error initializing superblock";
3224                         goto bad;
3225                 }
3226                 ic->just_formatted = true;
3227         }
3228
3229         r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors);
3230         if (r)
3231                 goto bad;
3232
3233         if (!ic->internal_hash)
3234                 dm_integrity_set(ti, ic);
3235
3236         ti->num_flush_bios = 1;
3237         ti->flush_supported = true;
3238
3239         return 0;
3240 bad:
3241         dm_integrity_dtr(ti);
3242         return r;
3243 }
3244
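/*
 * Tear down everything allocated by the constructor.  This is also called
 * from the constructor's error path, so every resource is checked before
 * it is freed.
 */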
3245 static void dm_integrity_dtr(struct dm_target *ti)
3246 {
3247         struct dm_integrity_c *ic = ti->private;
3248
3249         BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
3250         BUG_ON(!list_empty(&ic->wait_list));
3251
3252         if (ic->metadata_wq)
3253                 destroy_workqueue(ic->metadata_wq);
3254         if (ic->wait_wq)
3255                 destroy_workqueue(ic->wait_wq);
3256         if (ic->commit_wq)
3257                 destroy_workqueue(ic->commit_wq);
3258         if (ic->writer_wq)
3259                 destroy_workqueue(ic->writer_wq);
3260         if (ic->bufio)
3261                 dm_bufio_client_destroy(ic->bufio);
3262         mempool_exit(&ic->journal_io_mempool);
3263         if (ic->io)
3264                 dm_io_client_destroy(ic->io);
3265         if (ic->dev)
3266                 dm_put_device(ti, ic->dev);
3267         dm_integrity_free_page_list(ic, ic->journal);
3268         dm_integrity_free_page_list(ic, ic->journal_io);
3269         dm_integrity_free_page_list(ic, ic->journal_xor);
3270         if (ic->journal_scatterlist)
3271                 dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist);
3272         if (ic->journal_io_scatterlist)
3273                 dm_integrity_free_journal_scatterlist(ic, ic->journal_io_scatterlist);
3274         if (ic->sk_requests) {
3275                 unsigned i;
3276
3277                 for (i = 0; i < ic->journal_sections; i++) {
3278                         struct skcipher_request *req = ic->sk_requests[i];
3279                         if (req) {
3280                                 kzfree(req->iv);
3281                                 skcipher_request_free(req);
3282                         }
3283                 }
3284                 kvfree(ic->sk_requests);
3285         }
3286         kvfree(ic->journal_tree);
3287         if (ic->sb)
3288                 free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT);
3289
3290         if (ic->internal_hash)
3291                 crypto_free_shash(ic->internal_hash);
3292         free_alg(&ic->internal_hash_alg);
3293
3294         if (ic->journal_crypt)
3295                 crypto_free_skcipher(ic->journal_crypt);
3296         free_alg(&ic->journal_crypt_alg);
3297
3298         if (ic->journal_mac)
3299                 crypto_free_shash(ic->journal_mac);
3300         free_alg(&ic->journal_mac_alg);
3301
3302         kfree(ic);
3303 }
3304
3305 static struct target_type integrity_target = {
3306         .name                   = "integrity",
3307         .version                = {1, 1, 0},
3308         .module                 = THIS_MODULE,
3309         .features               = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
3310         .ctr                    = dm_integrity_ctr,
3311         .dtr                    = dm_integrity_dtr,
3312         .map                    = dm_integrity_map,
3313         .postsuspend            = dm_integrity_postsuspend,
3314         .resume                 = dm_integrity_resume,
3315         .status                 = dm_integrity_status,
3316         .iterate_devices        = dm_integrity_iterate_devices,
3317         .io_hints               = dm_integrity_io_hints,
3318 };
3319
3320 int __init dm_integrity_init(void)
3321 {
3322         int r;
3323
3324         journal_io_cache = kmem_cache_create("integrity_journal_io",
3325                                              sizeof(struct journal_io), 0, 0, NULL);
3326         if (!journal_io_cache) {
3327                 DMERR("can't allocate journal io cache");
3328                 return -ENOMEM;
3329         }
3330
3331         r = dm_register_target(&integrity_target);
3332
3333         if (r < 0)
3334                 DMERR("register failed %d", r);
3335
3336         return r;
3337 }
3338
3339 void dm_integrity_exit(void)
3340 {
3341         dm_unregister_target(&integrity_target);
3342         kmem_cache_destroy(journal_io_cache);
3343 }
3344
3345 module_init(dm_integrity_init);
3346 module_exit(dm_integrity_exit);
3347
3348 MODULE_AUTHOR("Milan Broz");
3349 MODULE_AUTHOR("Mikulas Patocka");
3350 MODULE_DESCRIPTION(DM_NAME " target for integrity tags extension");
3351 MODULE_LICENSE("GPL");