Linux 6.9-rc1
[linux-2.6-microblaze.git] / drivers / md / dm-log-writes.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2014 Facebook. All rights reserved.
4  *
5  * This file is released under the GPL.
6  */
7
8 #include <linux/device-mapper.h>
9
10 #include <linux/module.h>
11 #include <linux/init.h>
12 #include <linux/blkdev.h>
13 #include <linux/bio.h>
14 #include <linux/dax.h>
15 #include <linux/slab.h>
16 #include <linux/kthread.h>
17 #include <linux/freezer.h>
18 #include <linux/uio.h>
19
20 #define DM_MSG_PREFIX "log-writes"
21
22 /*
23  * This target will sequentially log all writes to the target device onto the
24  * log device.  This is helpful for replaying writes to check for fs consistency
25  * at all times.  This target provides a mechanism to mark specific events to
26  * check data at a later time.  So for example you would:
27  *
28  * write data
29  * fsync
30  * dmsetup message /dev/whatever mark mymark
31  * unmount /mnt/test
32  *
33  * Then replay the log up to mymark and check the contents of the replay to
34  * verify it matches what was written.
35  *
36  * We log writes only after they have been flushed, this makes the log describe
37  * close to the order in which the data hits the actual disk, not its cache.  So
38  * for example the following sequence (W means write, C means complete)
39  *
40  * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
41  *
42  * Would result in the log looking like this:
43  *
44  * c,a,b,flush,fuad,<other writes>,<next flush>
45  *
46  * This is meant to help expose problems where file systems do not properly wait
47  * on data being written before invoking a FLUSH.  FUA bypasses cache so once it
48  * completes it is added to the log as it should be on disk.
49  *
50  * We treat DISCARDs as if they don't bypass cache so that they are logged in
51  * order of completion along with the normal writes.  If we didn't do it this
52  * way we would process all the discards first and then write all the data, when
53  * in fact we want to do the data and the discard in the order that they
54  * completed.
55  */
/*
 * Per-entry flag bits, stored in pending_block.flags and written to disk in
 * log_write_entry.flags.
 */
/* The logged bio had REQ_PREFLUSH set. */
#define LOG_FLUSH_FLAG          (1 << 0)
/* The logged bio had REQ_FUA set. */
#define LOG_FUA_FLAG            (1 << 1)
/* The logged bio was a REQ_OP_DISCARD; no data follows the entry. */
#define LOG_DISCARD_FLAG        (1 << 2)
/* The entry is a mark created by log_mark(); the mark text is its data. */
#define LOG_MARK_FLAG           (1 << 3)
/* The logged bio had REQ_META set. */
#define LOG_METADATA_FLAG       (1 << 4)

/* Values written into the log_write_super version and magic fields. */
#define WRITE_LOG_VERSION 1ULL
#define WRITE_LOG_MAGIC 0x6a736677736872ULL
/* Sector of the log device that holds the log_write_super. */
#define WRITE_LOG_SUPER_SECTOR 0
65
66 /*
67  * The disk format for this is braindead simple.
68  *
69  * At byte 0 we have our super, followed by the following sequence for
70  * nr_entries:
71  *
72  * [   1 sector    ][  entry->nr_sectors ]
73  * [log_write_entry][    data written    ]
74  *
75  * The log_write_entry takes up a full sector so we can have arbitrary length
76  * marks and it leaves us room for extra content in the future.
77  */
78
/*
 * Basic info about the log for userspace.
 *
 * Filled in by log_super() and written at WRITE_LOG_SUPER_SECTOR.  All fields
 * are stored little-endian.
 */
struct log_write_super {
        /* WRITE_LOG_MAGIC */
        __le64 magic;
        /* WRITE_LOG_VERSION */
        __le64 version;
        /* value of log_writes_c.logged_entries when the super was written */
        __le64 nr_entries;
        /* value of log_writes_c.sectorsize, in bytes */
        __le32 sectorsize;
};
88
/*
 * On-disk header that precedes every logged block; all fields are stored
 * little-endian (see log_one_block()).
 *
 * sector - the sector we wrote.
 * nr_sectors - the number of sectors we wrote.
 * flags - flags for this log entry (LOG_*_FLAG bits).
 * data_len - the size of the data in this log entry, this is for private log
 * entry stuff, the MARK data provided by userspace for example.
 *
 * sector and nr_sectors are taken from the pending_block, where they were
 * converted with bio_to_dev_sectors().
 */
struct log_write_entry {
        __le64 sector;
        __le64 nr_sectors;
        __le64 flags;
        __le64 data_len;
};
102
/*
 * Per-target context, allocated in log_writes_ctr() and freed in
 * log_writes_dtr().
 */
struct log_writes_c {
        /* device that incoming bios are remapped to */
        struct dm_dev *dev;
        /* device the log is written to */
        struct dm_dev *logdev;
        /* number of entries logged so far; copied into the super */
        u64 logged_entries;
        /* logical block size of dev, in bytes */
        u32 sectorsize;
        /* ilog2(sectorsize) */
        u32 sectorshift;
        /* log bios accounted for but not yet completed; see put_io_block() */
        atomic_t io_blocks;
        /* pending_block structures allocated and not yet released */
        atomic_t pending_blocks;
        /* next free sector on logdev, in 512-byte sectors */
        sector_t next_sector;
        /* size of logdev in 512-byte sectors as reported by bdev_nr_sectors() */
        sector_t end_sector;
        /* cleared when a log write fails or the log device is full */
        bool logging_enabled;
        /* cleared in log_writes_io_hints() if dev reports no discard support */
        bool device_supports_discard;
        /* taken around updates of the two lists below and logging_enabled */
        spinlock_t blocks_lock;
        /* completed writes that have not been followed by a flush yet */
        struct list_head unflushed_blocks;
        /* blocks queued for log_writes_kthread() to write out */
        struct list_head logging_blocks;
        /* woken when io_blocks or pending_blocks drops to zero */
        wait_queue_head_t wait;
        /* thread running log_writes_kthread() */
        struct task_struct *log_kthread;
        /* completed by log_end_super(), waited on by log_super() */
        struct completion super_done;
};
122
/*
 * One write (or flush, discard or mark) waiting to be written to the log.
 */
struct pending_block {
        /* number of entries of vecs[] that hold a page */
        int vec_cnt;
        /* LOG_*_FLAG bits */
        u64 flags;
        /* start and length of the write, converted by bio_to_dev_sectors() */
        sector_t sector;
        sector_t nr_sectors;
        /* kmalloc'ed private data (the mark string for marks), or NULL */
        char *data;
        /* length of data in bytes */
        u32 datalen;
        /*
         * Links the block into unflushed_blocks or logging_blocks.  For a
         * flush block it temporarily acts as the head of the unflushed blocks
         * spliced onto it in log_writes_map().
         */
        struct list_head list;
        /* private copies of the pages of the original bio */
        struct bio_vec vecs[];
};
133
/* Per-bio data reserved through ti->per_io_data_size. */
struct per_bio_data {
        /* pending block created for this bio in log_writes_map(), or NULL */
        struct pending_block *block;
};
137
138 static inline sector_t bio_to_dev_sectors(struct log_writes_c *lc,
139                                           sector_t sectors)
140 {
141         return sectors >> (lc->sectorshift - SECTOR_SHIFT);
142 }
143
144 static inline sector_t dev_to_bio_sectors(struct log_writes_c *lc,
145                                           sector_t sectors)
146 {
147         return sectors << (lc->sectorshift - SECTOR_SHIFT);
148 }
149
150 static void put_pending_block(struct log_writes_c *lc)
151 {
152         if (atomic_dec_and_test(&lc->pending_blocks)) {
153                 smp_mb__after_atomic();
154                 if (waitqueue_active(&lc->wait))
155                         wake_up(&lc->wait);
156         }
157 }
158
159 static void put_io_block(struct log_writes_c *lc)
160 {
161         if (atomic_dec_and_test(&lc->io_blocks)) {
162                 smp_mb__after_atomic();
163                 if (waitqueue_active(&lc->wait))
164                         wake_up(&lc->wait);
165         }
166 }
167
168 static void log_end_io(struct bio *bio)
169 {
170         struct log_writes_c *lc = bio->bi_private;
171
172         if (bio->bi_status) {
173                 unsigned long flags;
174
175                 DMERR("Error writing log block, error=%d", bio->bi_status);
176                 spin_lock_irqsave(&lc->blocks_lock, flags);
177                 lc->logging_enabled = false;
178                 spin_unlock_irqrestore(&lc->blocks_lock, flags);
179         }
180
181         bio_free_pages(bio);
182         put_io_block(lc);
183         bio_put(bio);
184 }
185
186 static void log_end_super(struct bio *bio)
187 {
188         struct log_writes_c *lc = bio->bi_private;
189
190         complete(&lc->super_done);
191         log_end_io(bio);
192 }
193
194 /*
195  * Meant to be called if there is an error, it will free all the pages
196  * associated with the block.
197  */
198 static void free_pending_block(struct log_writes_c *lc,
199                                struct pending_block *block)
200 {
201         int i;
202
203         for (i = 0; i < block->vec_cnt; i++) {
204                 if (block->vecs[i].bv_page)
205                         __free_page(block->vecs[i].bv_page);
206         }
207         kfree(block->data);
208         kfree(block);
209         put_pending_block(lc);
210 }
211
212 static int write_metadata(struct log_writes_c *lc, void *entry,
213                           size_t entrylen, void *data, size_t datalen,
214                           sector_t sector)
215 {
216         struct bio *bio;
217         struct page *page;
218         void *ptr;
219         size_t ret;
220
221         bio = bio_alloc(lc->logdev->bdev, 1, REQ_OP_WRITE, GFP_KERNEL);
222         bio->bi_iter.bi_size = 0;
223         bio->bi_iter.bi_sector = sector;
224         bio->bi_end_io = (sector == WRITE_LOG_SUPER_SECTOR) ?
225                           log_end_super : log_end_io;
226         bio->bi_private = lc;
227
228         page = alloc_page(GFP_KERNEL);
229         if (!page) {
230                 DMERR("Couldn't alloc log page");
231                 bio_put(bio);
232                 goto error;
233         }
234
235         ptr = kmap_local_page(page);
236         memcpy(ptr, entry, entrylen);
237         if (datalen)
238                 memcpy(ptr + entrylen, data, datalen);
239         memset(ptr + entrylen + datalen, 0,
240                lc->sectorsize - entrylen - datalen);
241         kunmap_local(ptr);
242
243         ret = bio_add_page(bio, page, lc->sectorsize, 0);
244         if (ret != lc->sectorsize) {
245                 DMERR("Couldn't add page to the log block");
246                 goto error_bio;
247         }
248         submit_bio(bio);
249         return 0;
250 error_bio:
251         bio_put(bio);
252         __free_page(page);
253 error:
254         put_io_block(lc);
255         return -1;
256 }
257
258 static int write_inline_data(struct log_writes_c *lc, void *entry,
259                              size_t entrylen, void *data, size_t datalen,
260                              sector_t sector)
261 {
262         int bio_pages, pg_datalen, pg_sectorlen, i;
263         struct page *page;
264         struct bio *bio;
265         size_t ret;
266         void *ptr;
267
268         while (datalen) {
269                 bio_pages = bio_max_segs(DIV_ROUND_UP(datalen, PAGE_SIZE));
270
271                 atomic_inc(&lc->io_blocks);
272
273                 bio = bio_alloc(lc->logdev->bdev, bio_pages, REQ_OP_WRITE,
274                                 GFP_KERNEL);
275                 bio->bi_iter.bi_size = 0;
276                 bio->bi_iter.bi_sector = sector;
277                 bio->bi_end_io = log_end_io;
278                 bio->bi_private = lc;
279
280                 for (i = 0; i < bio_pages; i++) {
281                         pg_datalen = min_t(int, datalen, PAGE_SIZE);
282                         pg_sectorlen = ALIGN(pg_datalen, lc->sectorsize);
283
284                         page = alloc_page(GFP_KERNEL);
285                         if (!page) {
286                                 DMERR("Couldn't alloc inline data page");
287                                 goto error_bio;
288                         }
289
290                         ptr = kmap_local_page(page);
291                         memcpy(ptr, data, pg_datalen);
292                         if (pg_sectorlen > pg_datalen)
293                                 memset(ptr + pg_datalen, 0, pg_sectorlen - pg_datalen);
294                         kunmap_local(ptr);
295
296                         ret = bio_add_page(bio, page, pg_sectorlen, 0);
297                         if (ret != pg_sectorlen) {
298                                 DMERR("Couldn't add page of inline data");
299                                 __free_page(page);
300                                 goto error_bio;
301                         }
302
303                         datalen -= pg_datalen;
304                         data    += pg_datalen;
305                 }
306                 submit_bio(bio);
307
308                 sector += bio_pages * PAGE_SECTORS;
309         }
310         return 0;
311 error_bio:
312         bio_free_pages(bio);
313         bio_put(bio);
314         put_io_block(lc);
315         return -1;
316 }
317
318 static int log_one_block(struct log_writes_c *lc,
319                          struct pending_block *block, sector_t sector)
320 {
321         struct bio *bio;
322         struct log_write_entry entry;
323         size_t metadatalen, ret;
324         int i;
325
326         entry.sector = cpu_to_le64(block->sector);
327         entry.nr_sectors = cpu_to_le64(block->nr_sectors);
328         entry.flags = cpu_to_le64(block->flags);
329         entry.data_len = cpu_to_le64(block->datalen);
330
331         metadatalen = (block->flags & LOG_MARK_FLAG) ? block->datalen : 0;
332         if (write_metadata(lc, &entry, sizeof(entry), block->data,
333                            metadatalen, sector)) {
334                 free_pending_block(lc, block);
335                 return -1;
336         }
337
338         sector += dev_to_bio_sectors(lc, 1);
339
340         if (block->datalen && metadatalen == 0) {
341                 if (write_inline_data(lc, &entry, sizeof(entry), block->data,
342                                       block->datalen, sector)) {
343                         free_pending_block(lc, block);
344                         return -1;
345                 }
346                 /* we don't support both inline data & bio data */
347                 goto out;
348         }
349
350         if (!block->vec_cnt)
351                 goto out;
352
353         atomic_inc(&lc->io_blocks);
354         bio = bio_alloc(lc->logdev->bdev, bio_max_segs(block->vec_cnt),
355                         REQ_OP_WRITE, GFP_KERNEL);
356         bio->bi_iter.bi_size = 0;
357         bio->bi_iter.bi_sector = sector;
358         bio->bi_end_io = log_end_io;
359         bio->bi_private = lc;
360
361         for (i = 0; i < block->vec_cnt; i++) {
362                 /*
363                  * The page offset is always 0 because we allocate a new page
364                  * for every bvec in the original bio for simplicity sake.
365                  */
366                 ret = bio_add_page(bio, block->vecs[i].bv_page,
367                                    block->vecs[i].bv_len, 0);
368                 if (ret != block->vecs[i].bv_len) {
369                         atomic_inc(&lc->io_blocks);
370                         submit_bio(bio);
371                         bio = bio_alloc(lc->logdev->bdev,
372                                         bio_max_segs(block->vec_cnt - i),
373                                         REQ_OP_WRITE, GFP_KERNEL);
374                         bio->bi_iter.bi_size = 0;
375                         bio->bi_iter.bi_sector = sector;
376                         bio->bi_end_io = log_end_io;
377                         bio->bi_private = lc;
378
379                         ret = bio_add_page(bio, block->vecs[i].bv_page,
380                                            block->vecs[i].bv_len, 0);
381                         if (ret != block->vecs[i].bv_len) {
382                                 DMERR("Couldn't add page on new bio?");
383                                 bio_put(bio);
384                                 goto error;
385                         }
386                 }
387                 sector += block->vecs[i].bv_len >> SECTOR_SHIFT;
388         }
389         submit_bio(bio);
390 out:
391         kfree(block->data);
392         kfree(block);
393         put_pending_block(lc);
394         return 0;
395 error:
396         free_pending_block(lc, block);
397         put_io_block(lc);
398         return -1;
399 }
400
401 static int log_super(struct log_writes_c *lc)
402 {
403         struct log_write_super super;
404
405         super.magic = cpu_to_le64(WRITE_LOG_MAGIC);
406         super.version = cpu_to_le64(WRITE_LOG_VERSION);
407         super.nr_entries = cpu_to_le64(lc->logged_entries);
408         super.sectorsize = cpu_to_le32(lc->sectorsize);
409
410         if (write_metadata(lc, &super, sizeof(super), NULL, 0,
411                            WRITE_LOG_SUPER_SECTOR)) {
412                 DMERR("Couldn't write super");
413                 return -1;
414         }
415
416         /*
417          * Super sector should be writen in-order, otherwise the
418          * nr_entries could be rewritten incorrectly by an old bio.
419          */
420         wait_for_completion_io(&lc->super_done);
421
422         return 0;
423 }
424
425 static inline sector_t logdev_last_sector(struct log_writes_c *lc)
426 {
427         return bdev_nr_sectors(lc->logdev->bdev);
428 }
429
430 static int log_writes_kthread(void *arg)
431 {
432         struct log_writes_c *lc = arg;
433         sector_t sector = 0;
434
435         while (!kthread_should_stop()) {
436                 bool super = false;
437                 bool logging_enabled;
438                 struct pending_block *block = NULL;
439                 int ret;
440
441                 spin_lock_irq(&lc->blocks_lock);
442                 if (!list_empty(&lc->logging_blocks)) {
443                         block = list_first_entry(&lc->logging_blocks,
444                                                  struct pending_block, list);
445                         list_del_init(&block->list);
446                         if (!lc->logging_enabled)
447                                 goto next;
448
449                         sector = lc->next_sector;
450                         if (!(block->flags & LOG_DISCARD_FLAG))
451                                 lc->next_sector += dev_to_bio_sectors(lc, block->nr_sectors);
452                         lc->next_sector += dev_to_bio_sectors(lc, 1);
453
454                         /*
455                          * Apparently the size of the device may not be known
456                          * right away, so handle this properly.
457                          */
458                         if (!lc->end_sector)
459                                 lc->end_sector = logdev_last_sector(lc);
460                         if (lc->end_sector &&
461                             lc->next_sector >= lc->end_sector) {
462                                 DMERR("Ran out of space on the logdev");
463                                 lc->logging_enabled = false;
464                                 goto next;
465                         }
466                         lc->logged_entries++;
467                         atomic_inc(&lc->io_blocks);
468
469                         super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG));
470                         if (super)
471                                 atomic_inc(&lc->io_blocks);
472                 }
473 next:
474                 logging_enabled = lc->logging_enabled;
475                 spin_unlock_irq(&lc->blocks_lock);
476                 if (block) {
477                         if (logging_enabled) {
478                                 ret = log_one_block(lc, block, sector);
479                                 if (!ret && super)
480                                         ret = log_super(lc);
481                                 if (ret) {
482                                         spin_lock_irq(&lc->blocks_lock);
483                                         lc->logging_enabled = false;
484                                         spin_unlock_irq(&lc->blocks_lock);
485                                 }
486                         } else
487                                 free_pending_block(lc, block);
488                         continue;
489                 }
490
491                 if (!try_to_freeze()) {
492                         set_current_state(TASK_INTERRUPTIBLE);
493                         if (!kthread_should_stop() &&
494                             list_empty(&lc->logging_blocks))
495                                 schedule();
496                         __set_current_state(TASK_RUNNING);
497                 }
498         }
499         return 0;
500 }
501
502 /*
503  * Construct a log-writes mapping:
504  * log-writes <dev_path> <log_dev_path>
505  */
506 static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
507 {
508         struct log_writes_c *lc;
509         struct dm_arg_set as;
510         const char *devname, *logdevname;
511         int ret;
512
513         as.argc = argc;
514         as.argv = argv;
515
516         if (argc < 2) {
517                 ti->error = "Invalid argument count";
518                 return -EINVAL;
519         }
520
521         lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL);
522         if (!lc) {
523                 ti->error = "Cannot allocate context";
524                 return -ENOMEM;
525         }
526         spin_lock_init(&lc->blocks_lock);
527         INIT_LIST_HEAD(&lc->unflushed_blocks);
528         INIT_LIST_HEAD(&lc->logging_blocks);
529         init_waitqueue_head(&lc->wait);
530         init_completion(&lc->super_done);
531         atomic_set(&lc->io_blocks, 0);
532         atomic_set(&lc->pending_blocks, 0);
533
534         devname = dm_shift_arg(&as);
535         ret = dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev);
536         if (ret) {
537                 ti->error = "Device lookup failed";
538                 goto bad;
539         }
540
541         logdevname = dm_shift_arg(&as);
542         ret = dm_get_device(ti, logdevname, dm_table_get_mode(ti->table),
543                             &lc->logdev);
544         if (ret) {
545                 ti->error = "Log device lookup failed";
546                 dm_put_device(ti, lc->dev);
547                 goto bad;
548         }
549
550         lc->sectorsize = bdev_logical_block_size(lc->dev->bdev);
551         lc->sectorshift = ilog2(lc->sectorsize);
552         lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write");
553         if (IS_ERR(lc->log_kthread)) {
554                 ret = PTR_ERR(lc->log_kthread);
555                 ti->error = "Couldn't alloc kthread";
556                 dm_put_device(ti, lc->dev);
557                 dm_put_device(ti, lc->logdev);
558                 goto bad;
559         }
560
561         /*
562          * next_sector is in 512b sectors to correspond to what bi_sector expects.
563          * The super starts at sector 0, and the next_sector is the next logical
564          * one based on the sectorsize of the device.
565          */
566         lc->next_sector = lc->sectorsize >> SECTOR_SHIFT;
567         lc->logging_enabled = true;
568         lc->end_sector = logdev_last_sector(lc);
569         lc->device_supports_discard = true;
570
571         ti->num_flush_bios = 1;
572         ti->flush_supported = true;
573         ti->num_discard_bios = 1;
574         ti->discards_supported = true;
575         ti->per_io_data_size = sizeof(struct per_bio_data);
576         ti->private = lc;
577         return 0;
578
579 bad:
580         kfree(lc);
581         return ret;
582 }
583
584 static int log_mark(struct log_writes_c *lc, char *data)
585 {
586         struct pending_block *block;
587         size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry);
588
589         block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
590         if (!block) {
591                 DMERR("Error allocating pending block");
592                 return -ENOMEM;
593         }
594
595         block->data = kstrndup(data, maxsize - 1, GFP_KERNEL);
596         if (!block->data) {
597                 DMERR("Error copying mark data");
598                 kfree(block);
599                 return -ENOMEM;
600         }
601         atomic_inc(&lc->pending_blocks);
602         block->datalen = strlen(block->data);
603         block->flags |= LOG_MARK_FLAG;
604         spin_lock_irq(&lc->blocks_lock);
605         list_add_tail(&block->list, &lc->logging_blocks);
606         spin_unlock_irq(&lc->blocks_lock);
607         wake_up_process(lc->log_kthread);
608         return 0;
609 }
610
611 static void log_writes_dtr(struct dm_target *ti)
612 {
613         struct log_writes_c *lc = ti->private;
614
615         spin_lock_irq(&lc->blocks_lock);
616         list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks);
617         spin_unlock_irq(&lc->blocks_lock);
618
619         /*
620          * This is just nice to have since it'll update the super to include the
621          * unflushed blocks, if it fails we don't really care.
622          */
623         log_mark(lc, "dm-log-writes-end");
624         wake_up_process(lc->log_kthread);
625         wait_event(lc->wait, !atomic_read(&lc->io_blocks) &&
626                    !atomic_read(&lc->pending_blocks));
627         kthread_stop(lc->log_kthread);
628
629         WARN_ON(!list_empty(&lc->logging_blocks));
630         WARN_ON(!list_empty(&lc->unflushed_blocks));
631         dm_put_device(ti, lc->dev);
632         dm_put_device(ti, lc->logdev);
633         kfree(lc);
634 }
635
636 static void normal_map_bio(struct dm_target *ti, struct bio *bio)
637 {
638         struct log_writes_c *lc = ti->private;
639
640         bio_set_dev(bio, lc->dev->bdev);
641 }
642
643 static int log_writes_map(struct dm_target *ti, struct bio *bio)
644 {
645         struct log_writes_c *lc = ti->private;
646         struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
647         struct pending_block *block;
648         struct bvec_iter iter;
649         struct bio_vec bv;
650         size_t alloc_size;
651         int i = 0;
652         bool flush_bio = (bio->bi_opf & REQ_PREFLUSH);
653         bool fua_bio = (bio->bi_opf & REQ_FUA);
654         bool discard_bio = (bio_op(bio) == REQ_OP_DISCARD);
655         bool meta_bio = (bio->bi_opf & REQ_META);
656
657         pb->block = NULL;
658
659         /* Don't bother doing anything if logging has been disabled */
660         if (!lc->logging_enabled)
661                 goto map_bio;
662
663         /*
664          * Map reads as normal.
665          */
666         if (bio_data_dir(bio) == READ)
667                 goto map_bio;
668
669         /* No sectors and not a flush?  Don't care */
670         if (!bio_sectors(bio) && !flush_bio)
671                 goto map_bio;
672
673         /*
674          * Discards will have bi_size set but there's no actual data, so just
675          * allocate the size of the pending block.
676          */
677         if (discard_bio)
678                 alloc_size = sizeof(struct pending_block);
679         else
680                 alloc_size = struct_size(block, vecs, bio_segments(bio));
681
682         block = kzalloc(alloc_size, GFP_NOIO);
683         if (!block) {
684                 DMERR("Error allocating pending block");
685                 spin_lock_irq(&lc->blocks_lock);
686                 lc->logging_enabled = false;
687                 spin_unlock_irq(&lc->blocks_lock);
688                 return DM_MAPIO_KILL;
689         }
690         INIT_LIST_HEAD(&block->list);
691         pb->block = block;
692         atomic_inc(&lc->pending_blocks);
693
694         if (flush_bio)
695                 block->flags |= LOG_FLUSH_FLAG;
696         if (fua_bio)
697                 block->flags |= LOG_FUA_FLAG;
698         if (discard_bio)
699                 block->flags |= LOG_DISCARD_FLAG;
700         if (meta_bio)
701                 block->flags |= LOG_METADATA_FLAG;
702
703         block->sector = bio_to_dev_sectors(lc, bio->bi_iter.bi_sector);
704         block->nr_sectors = bio_to_dev_sectors(lc, bio_sectors(bio));
705
706         /* We don't need the data, just submit */
707         if (discard_bio) {
708                 WARN_ON(flush_bio || fua_bio);
709                 if (lc->device_supports_discard)
710                         goto map_bio;
711                 bio_endio(bio);
712                 return DM_MAPIO_SUBMITTED;
713         }
714
715         /* Flush bio, splice the unflushed blocks onto this list and submit */
716         if (flush_bio && !bio_sectors(bio)) {
717                 spin_lock_irq(&lc->blocks_lock);
718                 list_splice_init(&lc->unflushed_blocks, &block->list);
719                 spin_unlock_irq(&lc->blocks_lock);
720                 goto map_bio;
721         }
722
723         /*
724          * We will write this bio somewhere else way later so we need to copy
725          * the actual contents into new pages so we know the data will always be
726          * there.
727          *
728          * We do this because this could be a bio from O_DIRECT in which case we
729          * can't just hold onto the page until some later point, we have to
730          * manually copy the contents.
731          */
732         bio_for_each_segment(bv, bio, iter) {
733                 struct page *page;
734                 void *dst;
735
736                 page = alloc_page(GFP_NOIO);
737                 if (!page) {
738                         DMERR("Error allocing page");
739                         free_pending_block(lc, block);
740                         spin_lock_irq(&lc->blocks_lock);
741                         lc->logging_enabled = false;
742                         spin_unlock_irq(&lc->blocks_lock);
743                         return DM_MAPIO_KILL;
744                 }
745
746                 dst = kmap_local_page(page);
747                 memcpy_from_bvec(dst, &bv);
748                 kunmap_local(dst);
749                 block->vecs[i].bv_page = page;
750                 block->vecs[i].bv_len = bv.bv_len;
751                 block->vec_cnt++;
752                 i++;
753         }
754
755         /* Had a flush with data in it, weird */
756         if (flush_bio) {
757                 spin_lock_irq(&lc->blocks_lock);
758                 list_splice_init(&lc->unflushed_blocks, &block->list);
759                 spin_unlock_irq(&lc->blocks_lock);
760         }
761 map_bio:
762         normal_map_bio(ti, bio);
763         return DM_MAPIO_REMAPPED;
764 }
765
766 static int normal_end_io(struct dm_target *ti, struct bio *bio,
767                 blk_status_t *error)
768 {
769         struct log_writes_c *lc = ti->private;
770         struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
771
772         if (bio_data_dir(bio) == WRITE && pb->block) {
773                 struct pending_block *block = pb->block;
774                 unsigned long flags;
775
776                 spin_lock_irqsave(&lc->blocks_lock, flags);
777                 if (block->flags & LOG_FLUSH_FLAG) {
778                         list_splice_tail_init(&block->list, &lc->logging_blocks);
779                         list_add_tail(&block->list, &lc->logging_blocks);
780                         wake_up_process(lc->log_kthread);
781                 } else if (block->flags & LOG_FUA_FLAG) {
782                         list_add_tail(&block->list, &lc->logging_blocks);
783                         wake_up_process(lc->log_kthread);
784                 } else
785                         list_add_tail(&block->list, &lc->unflushed_blocks);
786                 spin_unlock_irqrestore(&lc->blocks_lock, flags);
787         }
788
789         return DM_ENDIO_DONE;
790 }
791
792 /*
793  * INFO format: <logged entries> <highest allocated sector>
794  */
795 static void log_writes_status(struct dm_target *ti, status_type_t type,
796                               unsigned int status_flags, char *result,
797                               unsigned int maxlen)
798 {
799         unsigned int sz = 0;
800         struct log_writes_c *lc = ti->private;
801
802         switch (type) {
803         case STATUSTYPE_INFO:
804                 DMEMIT("%llu %llu", lc->logged_entries,
805                        (unsigned long long)lc->next_sector - 1);
806                 if (!lc->logging_enabled)
807                         DMEMIT(" logging_disabled");
808                 break;
809
810         case STATUSTYPE_TABLE:
811                 DMEMIT("%s %s", lc->dev->name, lc->logdev->name);
812                 break;
813
814         case STATUSTYPE_IMA:
815                 *result = '\0';
816                 break;
817         }
818 }
819
820 static int log_writes_prepare_ioctl(struct dm_target *ti,
821                                     struct block_device **bdev)
822 {
823         struct log_writes_c *lc = ti->private;
824         struct dm_dev *dev = lc->dev;
825
826         *bdev = dev->bdev;
827         /*
828          * Only pass ioctls through if the device sizes match exactly.
829          */
830         if (ti->len != bdev_nr_sectors(dev->bdev))
831                 return 1;
832         return 0;
833 }
834
835 static int log_writes_iterate_devices(struct dm_target *ti,
836                                       iterate_devices_callout_fn fn,
837                                       void *data)
838 {
839         struct log_writes_c *lc = ti->private;
840
841         return fn(ti, lc->dev, 0, ti->len, data);
842 }
843
844 /*
845  * Messages supported:
846  *   mark <mark data> - specify the marked data.
847  */
848 static int log_writes_message(struct dm_target *ti, unsigned int argc, char **argv,
849                               char *result, unsigned int maxlen)
850 {
851         int r = -EINVAL;
852         struct log_writes_c *lc = ti->private;
853
854         if (argc != 2) {
855                 DMWARN("Invalid log-writes message arguments, expect 2 arguments, got %d", argc);
856                 return r;
857         }
858
859         if (!strcasecmp(argv[0], "mark"))
860                 r = log_mark(lc, argv[1]);
861         else
862                 DMWARN("Unrecognised log writes target message received: %s", argv[0]);
863
864         return r;
865 }
866
867 static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits)
868 {
869         struct log_writes_c *lc = ti->private;
870
871         if (!bdev_max_discard_sectors(lc->dev->bdev)) {
872                 lc->device_supports_discard = false;
873                 limits->discard_granularity = lc->sectorsize;
874                 limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
875         }
876         limits->logical_block_size = bdev_logical_block_size(lc->dev->bdev);
877         limits->physical_block_size = bdev_physical_block_size(lc->dev->bdev);
878         limits->io_min = limits->physical_block_size;
879         limits->dma_alignment = limits->logical_block_size - 1;
880 }
881
882 #if IS_ENABLED(CONFIG_FS_DAX)
883 static struct dax_device *log_writes_dax_pgoff(struct dm_target *ti,
884                 pgoff_t *pgoff)
885 {
886         struct log_writes_c *lc = ti->private;
887
888         *pgoff += (get_start_sect(lc->dev->bdev) >> PAGE_SECTORS_SHIFT);
889         return lc->dev->dax_dev;
890 }
891
892 static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
893                 long nr_pages, enum dax_access_mode mode, void **kaddr,
894                 pfn_t *pfn)
895 {
896         struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);
897
898         return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn);
899 }
900
901 static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
902                                           size_t nr_pages)
903 {
904         struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);
905
906         return dax_zero_page_range(dax_dev, pgoff, nr_pages << PAGE_SHIFT);
907 }
908
909 static size_t log_writes_dax_recovery_write(struct dm_target *ti,
910                 pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
911 {
912         struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);
913
914         return dax_recovery_write(dax_dev, pgoff, addr, bytes, i);
915 }
916
917 #else
918 #define log_writes_dax_direct_access NULL
919 #define log_writes_dax_zero_page_range NULL
920 #define log_writes_dax_recovery_write NULL
921 #endif
922
/*
 * Target type descriptor for "log-writes", wiring the callbacks defined in
 * this file.  The three DAX callbacks are NULL when CONFIG_FS_DAX is not
 * enabled.
 */
static struct target_type log_writes_target = {
        .name   = "log-writes",
        .version = {1, 1, 0},
        .module = THIS_MODULE,
        .ctr    = log_writes_ctr,
        .dtr    = log_writes_dtr,
        .map    = log_writes_map,
        .end_io = normal_end_io,
        .status = log_writes_status,
        .prepare_ioctl = log_writes_prepare_ioctl,
        .message = log_writes_message,
        .iterate_devices = log_writes_iterate_devices,
        .io_hints = log_writes_io_hints,
        .direct_access = log_writes_dax_direct_access,
        .dax_zero_page_range = log_writes_dax_zero_page_range,
        .dax_recovery_write = log_writes_dax_recovery_write,
};
module_dm(log_writes);
941
942 MODULE_DESCRIPTION(DM_NAME " log writes target");
943 MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>");
944 MODULE_LICENSE("GPL");