net: dsa: realtek: rtl8365mb: remove learn_limit_max private data member
[linux-2.6-microblaze.git] / drivers / md / dm-log-writes.c
1 /*
2  * Copyright (C) 2014 Facebook. All rights reserved.
3  *
4  * This file is released under the GPL.
5  */
6
7 #include <linux/device-mapper.h>
8
9 #include <linux/module.h>
10 #include <linux/init.h>
11 #include <linux/blkdev.h>
12 #include <linux/bio.h>
13 #include <linux/dax.h>
14 #include <linux/slab.h>
15 #include <linux/kthread.h>
16 #include <linux/freezer.h>
17 #include <linux/uio.h>
18
/* Message prefix for this target; presumably consumed by DMERR()/DMWARN(). */
#define DM_MSG_PREFIX "log-writes"
20
21 /*
22  * This target will sequentially log all writes to the target device onto the
23  * log device.  This is helpful for replaying writes to check for fs consistency
24  * at all times.  This target provides a mechanism to mark specific events to
25  * check data at a later time.  So for example you would:
26  *
27  * write data
28  * fsync
29  * dmsetup message /dev/whatever mark mymark
30  * unmount /mnt/test
31  *
32  * Then replay the log up to mymark and check the contents of the replay to
33  * verify it matches what was written.
34  *
35  * We log writes only after they have been flushed, this makes the log describe
36  * close to the order in which the data hits the actual disk, not its cache.  So
37  * for example the following sequence (W means write, C means complete)
38  *
39  * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
40  *
41  * Would result in the log looking like this:
42  *
43  * c,a,b,flush,fuad,<other writes>,<next flush>
44  *
45  * This is meant to help expose problems where file systems do not properly wait
46  * on data being written before invoking a FLUSH.  FUA bypasses cache so once it
47  * completes it is added to the log as it should be on disk.
48  *
49  * We treat DISCARDs as if they don't bypass cache so that they are logged in
50  * order of completion along with the normal writes.  If we didn't do it this
51  * way we would process all the discards first and then write all the data, when
52  * in fact we want to do the data and the discard in the order that they
53  * completed.
54  */
/* Flag bits recorded in pending_block.flags and log_write_entry.flags. */
#define LOG_FLUSH_FLAG          (1 << 0)	/* bio carried REQ_PREFLUSH */
#define LOG_FUA_FLAG            (1 << 1)	/* bio carried REQ_FUA */
#define LOG_DISCARD_FLAG        (1 << 2)	/* bio was a REQ_OP_DISCARD */
#define LOG_MARK_FLAG           (1 << 3)	/* entry is a userspace mark */
#define LOG_METADATA_FLAG       (1 << 4)	/* bio carried REQ_META */

/* Values stored in the super block, which lives at sector 0 of the log. */
#define WRITE_LOG_VERSION 1ULL
#define WRITE_LOG_MAGIC 0x6a736677736872ULL
#define WRITE_LOG_SUPER_SECTOR 0
64
65 /*
66  * The disk format for this is braindead simple.
67  *
68  * At byte 0 we have our super, followed by the following sequence for
69  * nr_entries:
70  *
71  * [   1 sector    ][  entry->nr_sectors ]
72  * [log_write_entry][    data written    ]
73  *
74  * The log_write_entry takes up a full sector so we can have arbitrary length
75  * marks and it leaves us room for extra content in the future.
76  */
77
78 /*
79  * Basic info about the log for userspace.
80  */
81 struct log_write_super {
82         __le64 magic;
83         __le64 version;
84         __le64 nr_entries;
85         __le32 sectorsize;
86 };
87
88 /*
89  * sector - the sector we wrote.
90  * nr_sectors - the number of sectors we wrote.
91  * flags - flags for this log entry.
92  * data_len - the size of the data in this log entry, this is for private log
93  * entry stuff, the MARK data provided by userspace for example.
94  */
95 struct log_write_entry {
96         __le64 sector;
97         __le64 nr_sectors;
98         __le64 flags;
99         __le64 data_len;
100 };
101
102 struct log_writes_c {
103         struct dm_dev *dev;
104         struct dm_dev *logdev;
105         u64 logged_entries;
106         u32 sectorsize;
107         u32 sectorshift;
108         atomic_t io_blocks;
109         atomic_t pending_blocks;
110         sector_t next_sector;
111         sector_t end_sector;
112         bool logging_enabled;
113         bool device_supports_discard;
114         spinlock_t blocks_lock;
115         struct list_head unflushed_blocks;
116         struct list_head logging_blocks;
117         wait_queue_head_t wait;
118         struct task_struct *log_kthread;
119         struct completion super_done;
120 };
121
122 struct pending_block {
123         int vec_cnt;
124         u64 flags;
125         sector_t sector;
126         sector_t nr_sectors;
127         char *data;
128         u32 datalen;
129         struct list_head list;
130         struct bio_vec vecs[];
131 };
132
/* Per-bio scratch area: ties a mapped bio to the block created for it. */
struct per_bio_data {
	struct pending_block *block;	/* NULL when the bio is not being logged */
};
136
137 static inline sector_t bio_to_dev_sectors(struct log_writes_c *lc,
138                                           sector_t sectors)
139 {
140         return sectors >> (lc->sectorshift - SECTOR_SHIFT);
141 }
142
143 static inline sector_t dev_to_bio_sectors(struct log_writes_c *lc,
144                                           sector_t sectors)
145 {
146         return sectors << (lc->sectorshift - SECTOR_SHIFT);
147 }
148
149 static void put_pending_block(struct log_writes_c *lc)
150 {
151         if (atomic_dec_and_test(&lc->pending_blocks)) {
152                 smp_mb__after_atomic();
153                 if (waitqueue_active(&lc->wait))
154                         wake_up(&lc->wait);
155         }
156 }
157
158 static void put_io_block(struct log_writes_c *lc)
159 {
160         if (atomic_dec_and_test(&lc->io_blocks)) {
161                 smp_mb__after_atomic();
162                 if (waitqueue_active(&lc->wait))
163                         wake_up(&lc->wait);
164         }
165 }
166
167 static void log_end_io(struct bio *bio)
168 {
169         struct log_writes_c *lc = bio->bi_private;
170
171         if (bio->bi_status) {
172                 unsigned long flags;
173
174                 DMERR("Error writing log block, error=%d", bio->bi_status);
175                 spin_lock_irqsave(&lc->blocks_lock, flags);
176                 lc->logging_enabled = false;
177                 spin_unlock_irqrestore(&lc->blocks_lock, flags);
178         }
179
180         bio_free_pages(bio);
181         put_io_block(lc);
182         bio_put(bio);
183 }
184
185 static void log_end_super(struct bio *bio)
186 {
187         struct log_writes_c *lc = bio->bi_private;
188
189         complete(&lc->super_done);
190         log_end_io(bio);
191 }
192
193 /*
194  * Meant to be called if there is an error, it will free all the pages
195  * associated with the block.
196  */
197 static void free_pending_block(struct log_writes_c *lc,
198                                struct pending_block *block)
199 {
200         int i;
201
202         for (i = 0; i < block->vec_cnt; i++) {
203                 if (block->vecs[i].bv_page)
204                         __free_page(block->vecs[i].bv_page);
205         }
206         kfree(block->data);
207         kfree(block);
208         put_pending_block(lc);
209 }
210
211 static int write_metadata(struct log_writes_c *lc, void *entry,
212                           size_t entrylen, void *data, size_t datalen,
213                           sector_t sector)
214 {
215         struct bio *bio;
216         struct page *page;
217         void *ptr;
218         size_t ret;
219
220         bio = bio_alloc(lc->logdev->bdev, 1, REQ_OP_WRITE, GFP_KERNEL);
221         bio->bi_iter.bi_size = 0;
222         bio->bi_iter.bi_sector = sector;
223         bio->bi_end_io = (sector == WRITE_LOG_SUPER_SECTOR) ?
224                           log_end_super : log_end_io;
225         bio->bi_private = lc;
226
227         page = alloc_page(GFP_KERNEL);
228         if (!page) {
229                 DMERR("Couldn't alloc log page");
230                 bio_put(bio);
231                 goto error;
232         }
233
234         ptr = kmap_atomic(page);
235         memcpy(ptr, entry, entrylen);
236         if (datalen)
237                 memcpy(ptr + entrylen, data, datalen);
238         memset(ptr + entrylen + datalen, 0,
239                lc->sectorsize - entrylen - datalen);
240         kunmap_atomic(ptr);
241
242         ret = bio_add_page(bio, page, lc->sectorsize, 0);
243         if (ret != lc->sectorsize) {
244                 DMERR("Couldn't add page to the log block");
245                 goto error_bio;
246         }
247         submit_bio(bio);
248         return 0;
249 error_bio:
250         bio_put(bio);
251         __free_page(page);
252 error:
253         put_io_block(lc);
254         return -1;
255 }
256
257 static int write_inline_data(struct log_writes_c *lc, void *entry,
258                              size_t entrylen, void *data, size_t datalen,
259                              sector_t sector)
260 {
261         int bio_pages, pg_datalen, pg_sectorlen, i;
262         struct page *page;
263         struct bio *bio;
264         size_t ret;
265         void *ptr;
266
267         while (datalen) {
268                 bio_pages = bio_max_segs(DIV_ROUND_UP(datalen, PAGE_SIZE));
269
270                 atomic_inc(&lc->io_blocks);
271
272                 bio = bio_alloc(lc->logdev->bdev, bio_pages, REQ_OP_WRITE,
273                                 GFP_KERNEL);
274                 bio->bi_iter.bi_size = 0;
275                 bio->bi_iter.bi_sector = sector;
276                 bio->bi_end_io = log_end_io;
277                 bio->bi_private = lc;
278
279                 for (i = 0; i < bio_pages; i++) {
280                         pg_datalen = min_t(int, datalen, PAGE_SIZE);
281                         pg_sectorlen = ALIGN(pg_datalen, lc->sectorsize);
282
283                         page = alloc_page(GFP_KERNEL);
284                         if (!page) {
285                                 DMERR("Couldn't alloc inline data page");
286                                 goto error_bio;
287                         }
288
289                         ptr = kmap_atomic(page);
290                         memcpy(ptr, data, pg_datalen);
291                         if (pg_sectorlen > pg_datalen)
292                                 memset(ptr + pg_datalen, 0, pg_sectorlen - pg_datalen);
293                         kunmap_atomic(ptr);
294
295                         ret = bio_add_page(bio, page, pg_sectorlen, 0);
296                         if (ret != pg_sectorlen) {
297                                 DMERR("Couldn't add page of inline data");
298                                 __free_page(page);
299                                 goto error_bio;
300                         }
301
302                         datalen -= pg_datalen;
303                         data    += pg_datalen;
304                 }
305                 submit_bio(bio);
306
307                 sector += bio_pages * PAGE_SECTORS;
308         }
309         return 0;
310 error_bio:
311         bio_free_pages(bio);
312         bio_put(bio);
313         put_io_block(lc);
314         return -1;
315 }
316
317 static int log_one_block(struct log_writes_c *lc,
318                          struct pending_block *block, sector_t sector)
319 {
320         struct bio *bio;
321         struct log_write_entry entry;
322         size_t metadatalen, ret;
323         int i;
324
325         entry.sector = cpu_to_le64(block->sector);
326         entry.nr_sectors = cpu_to_le64(block->nr_sectors);
327         entry.flags = cpu_to_le64(block->flags);
328         entry.data_len = cpu_to_le64(block->datalen);
329
330         metadatalen = (block->flags & LOG_MARK_FLAG) ? block->datalen : 0;
331         if (write_metadata(lc, &entry, sizeof(entry), block->data,
332                            metadatalen, sector)) {
333                 free_pending_block(lc, block);
334                 return -1;
335         }
336
337         sector += dev_to_bio_sectors(lc, 1);
338
339         if (block->datalen && metadatalen == 0) {
340                 if (write_inline_data(lc, &entry, sizeof(entry), block->data,
341                                       block->datalen, sector)) {
342                         free_pending_block(lc, block);
343                         return -1;
344                 }
345                 /* we don't support both inline data & bio data */
346                 goto out;
347         }
348
349         if (!block->vec_cnt)
350                 goto out;
351
352         atomic_inc(&lc->io_blocks);
353         bio = bio_alloc(lc->logdev->bdev, bio_max_segs(block->vec_cnt),
354                         REQ_OP_WRITE, GFP_KERNEL);
355         bio->bi_iter.bi_size = 0;
356         bio->bi_iter.bi_sector = sector;
357         bio->bi_end_io = log_end_io;
358         bio->bi_private = lc;
359
360         for (i = 0; i < block->vec_cnt; i++) {
361                 /*
362                  * The page offset is always 0 because we allocate a new page
363                  * for every bvec in the original bio for simplicity sake.
364                  */
365                 ret = bio_add_page(bio, block->vecs[i].bv_page,
366                                    block->vecs[i].bv_len, 0);
367                 if (ret != block->vecs[i].bv_len) {
368                         atomic_inc(&lc->io_blocks);
369                         submit_bio(bio);
370                         bio = bio_alloc(lc->logdev->bdev,
371                                         bio_max_segs(block->vec_cnt - i),
372                                         REQ_OP_WRITE, GFP_KERNEL);
373                         bio->bi_iter.bi_size = 0;
374                         bio->bi_iter.bi_sector = sector;
375                         bio->bi_end_io = log_end_io;
376                         bio->bi_private = lc;
377
378                         ret = bio_add_page(bio, block->vecs[i].bv_page,
379                                            block->vecs[i].bv_len, 0);
380                         if (ret != block->vecs[i].bv_len) {
381                                 DMERR("Couldn't add page on new bio?");
382                                 bio_put(bio);
383                                 goto error;
384                         }
385                 }
386                 sector += block->vecs[i].bv_len >> SECTOR_SHIFT;
387         }
388         submit_bio(bio);
389 out:
390         kfree(block->data);
391         kfree(block);
392         put_pending_block(lc);
393         return 0;
394 error:
395         free_pending_block(lc, block);
396         put_io_block(lc);
397         return -1;
398 }
399
400 static int log_super(struct log_writes_c *lc)
401 {
402         struct log_write_super super;
403
404         super.magic = cpu_to_le64(WRITE_LOG_MAGIC);
405         super.version = cpu_to_le64(WRITE_LOG_VERSION);
406         super.nr_entries = cpu_to_le64(lc->logged_entries);
407         super.sectorsize = cpu_to_le32(lc->sectorsize);
408
409         if (write_metadata(lc, &super, sizeof(super), NULL, 0,
410                            WRITE_LOG_SUPER_SECTOR)) {
411                 DMERR("Couldn't write super");
412                 return -1;
413         }
414
415         /*
416          * Super sector should be writen in-order, otherwise the
417          * nr_entries could be rewritten incorrectly by an old bio.
418          */
419         wait_for_completion_io(&lc->super_done);
420
421         return 0;
422 }
423
424 static inline sector_t logdev_last_sector(struct log_writes_c *lc)
425 {
426         return bdev_nr_sectors(lc->logdev->bdev);
427 }
428
429 static int log_writes_kthread(void *arg)
430 {
431         struct log_writes_c *lc = (struct log_writes_c *)arg;
432         sector_t sector = 0;
433
434         while (!kthread_should_stop()) {
435                 bool super = false;
436                 bool logging_enabled;
437                 struct pending_block *block = NULL;
438                 int ret;
439
440                 spin_lock_irq(&lc->blocks_lock);
441                 if (!list_empty(&lc->logging_blocks)) {
442                         block = list_first_entry(&lc->logging_blocks,
443                                                  struct pending_block, list);
444                         list_del_init(&block->list);
445                         if (!lc->logging_enabled)
446                                 goto next;
447
448                         sector = lc->next_sector;
449                         if (!(block->flags & LOG_DISCARD_FLAG))
450                                 lc->next_sector += dev_to_bio_sectors(lc, block->nr_sectors);
451                         lc->next_sector += dev_to_bio_sectors(lc, 1);
452
453                         /*
454                          * Apparently the size of the device may not be known
455                          * right away, so handle this properly.
456                          */
457                         if (!lc->end_sector)
458                                 lc->end_sector = logdev_last_sector(lc);
459                         if (lc->end_sector &&
460                             lc->next_sector >= lc->end_sector) {
461                                 DMERR("Ran out of space on the logdev");
462                                 lc->logging_enabled = false;
463                                 goto next;
464                         }
465                         lc->logged_entries++;
466                         atomic_inc(&lc->io_blocks);
467
468                         super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG));
469                         if (super)
470                                 atomic_inc(&lc->io_blocks);
471                 }
472 next:
473                 logging_enabled = lc->logging_enabled;
474                 spin_unlock_irq(&lc->blocks_lock);
475                 if (block) {
476                         if (logging_enabled) {
477                                 ret = log_one_block(lc, block, sector);
478                                 if (!ret && super)
479                                         ret = log_super(lc);
480                                 if (ret) {
481                                         spin_lock_irq(&lc->blocks_lock);
482                                         lc->logging_enabled = false;
483                                         spin_unlock_irq(&lc->blocks_lock);
484                                 }
485                         } else
486                                 free_pending_block(lc, block);
487                         continue;
488                 }
489
490                 if (!try_to_freeze()) {
491                         set_current_state(TASK_INTERRUPTIBLE);
492                         if (!kthread_should_stop() &&
493                             list_empty(&lc->logging_blocks))
494                                 schedule();
495                         __set_current_state(TASK_RUNNING);
496                 }
497         }
498         return 0;
499 }
500
501 /*
502  * Construct a log-writes mapping:
503  * log-writes <dev_path> <log_dev_path>
504  */
505 static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
506 {
507         struct log_writes_c *lc;
508         struct dm_arg_set as;
509         const char *devname, *logdevname;
510         int ret;
511
512         as.argc = argc;
513         as.argv = argv;
514
515         if (argc < 2) {
516                 ti->error = "Invalid argument count";
517                 return -EINVAL;
518         }
519
520         lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL);
521         if (!lc) {
522                 ti->error = "Cannot allocate context";
523                 return -ENOMEM;
524         }
525         spin_lock_init(&lc->blocks_lock);
526         INIT_LIST_HEAD(&lc->unflushed_blocks);
527         INIT_LIST_HEAD(&lc->logging_blocks);
528         init_waitqueue_head(&lc->wait);
529         init_completion(&lc->super_done);
530         atomic_set(&lc->io_blocks, 0);
531         atomic_set(&lc->pending_blocks, 0);
532
533         devname = dm_shift_arg(&as);
534         ret = dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev);
535         if (ret) {
536                 ti->error = "Device lookup failed";
537                 goto bad;
538         }
539
540         logdevname = dm_shift_arg(&as);
541         ret = dm_get_device(ti, logdevname, dm_table_get_mode(ti->table),
542                             &lc->logdev);
543         if (ret) {
544                 ti->error = "Log device lookup failed";
545                 dm_put_device(ti, lc->dev);
546                 goto bad;
547         }
548
549         lc->sectorsize = bdev_logical_block_size(lc->dev->bdev);
550         lc->sectorshift = ilog2(lc->sectorsize);
551         lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write");
552         if (IS_ERR(lc->log_kthread)) {
553                 ret = PTR_ERR(lc->log_kthread);
554                 ti->error = "Couldn't alloc kthread";
555                 dm_put_device(ti, lc->dev);
556                 dm_put_device(ti, lc->logdev);
557                 goto bad;
558         }
559
560         /*
561          * next_sector is in 512b sectors to correspond to what bi_sector expects.
562          * The super starts at sector 0, and the next_sector is the next logical
563          * one based on the sectorsize of the device.
564          */
565         lc->next_sector = lc->sectorsize >> SECTOR_SHIFT;
566         lc->logging_enabled = true;
567         lc->end_sector = logdev_last_sector(lc);
568         lc->device_supports_discard = true;
569
570         ti->num_flush_bios = 1;
571         ti->flush_supported = true;
572         ti->num_discard_bios = 1;
573         ti->discards_supported = true;
574         ti->per_io_data_size = sizeof(struct per_bio_data);
575         ti->private = lc;
576         return 0;
577
578 bad:
579         kfree(lc);
580         return ret;
581 }
582
583 static int log_mark(struct log_writes_c *lc, char *data)
584 {
585         struct pending_block *block;
586         size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry);
587
588         block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
589         if (!block) {
590                 DMERR("Error allocating pending block");
591                 return -ENOMEM;
592         }
593
594         block->data = kstrndup(data, maxsize - 1, GFP_KERNEL);
595         if (!block->data) {
596                 DMERR("Error copying mark data");
597                 kfree(block);
598                 return -ENOMEM;
599         }
600         atomic_inc(&lc->pending_blocks);
601         block->datalen = strlen(block->data);
602         block->flags |= LOG_MARK_FLAG;
603         spin_lock_irq(&lc->blocks_lock);
604         list_add_tail(&block->list, &lc->logging_blocks);
605         spin_unlock_irq(&lc->blocks_lock);
606         wake_up_process(lc->log_kthread);
607         return 0;
608 }
609
610 static void log_writes_dtr(struct dm_target *ti)
611 {
612         struct log_writes_c *lc = ti->private;
613
614         spin_lock_irq(&lc->blocks_lock);
615         list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks);
616         spin_unlock_irq(&lc->blocks_lock);
617
618         /*
619          * This is just nice to have since it'll update the super to include the
620          * unflushed blocks, if it fails we don't really care.
621          */
622         log_mark(lc, "dm-log-writes-end");
623         wake_up_process(lc->log_kthread);
624         wait_event(lc->wait, !atomic_read(&lc->io_blocks) &&
625                    !atomic_read(&lc->pending_blocks));
626         kthread_stop(lc->log_kthread);
627
628         WARN_ON(!list_empty(&lc->logging_blocks));
629         WARN_ON(!list_empty(&lc->unflushed_blocks));
630         dm_put_device(ti, lc->dev);
631         dm_put_device(ti, lc->logdev);
632         kfree(lc);
633 }
634
635 static void normal_map_bio(struct dm_target *ti, struct bio *bio)
636 {
637         struct log_writes_c *lc = ti->private;
638
639         bio_set_dev(bio, lc->dev->bdev);
640 }
641
642 static int log_writes_map(struct dm_target *ti, struct bio *bio)
643 {
644         struct log_writes_c *lc = ti->private;
645         struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
646         struct pending_block *block;
647         struct bvec_iter iter;
648         struct bio_vec bv;
649         size_t alloc_size;
650         int i = 0;
651         bool flush_bio = (bio->bi_opf & REQ_PREFLUSH);
652         bool fua_bio = (bio->bi_opf & REQ_FUA);
653         bool discard_bio = (bio_op(bio) == REQ_OP_DISCARD);
654         bool meta_bio = (bio->bi_opf & REQ_META);
655
656         pb->block = NULL;
657
658         /* Don't bother doing anything if logging has been disabled */
659         if (!lc->logging_enabled)
660                 goto map_bio;
661
662         /*
663          * Map reads as normal.
664          */
665         if (bio_data_dir(bio) == READ)
666                 goto map_bio;
667
668         /* No sectors and not a flush?  Don't care */
669         if (!bio_sectors(bio) && !flush_bio)
670                 goto map_bio;
671
672         /*
673          * Discards will have bi_size set but there's no actual data, so just
674          * allocate the size of the pending block.
675          */
676         if (discard_bio)
677                 alloc_size = sizeof(struct pending_block);
678         else
679                 alloc_size = struct_size(block, vecs, bio_segments(bio));
680
681         block = kzalloc(alloc_size, GFP_NOIO);
682         if (!block) {
683                 DMERR("Error allocating pending block");
684                 spin_lock_irq(&lc->blocks_lock);
685                 lc->logging_enabled = false;
686                 spin_unlock_irq(&lc->blocks_lock);
687                 return DM_MAPIO_KILL;
688         }
689         INIT_LIST_HEAD(&block->list);
690         pb->block = block;
691         atomic_inc(&lc->pending_blocks);
692
693         if (flush_bio)
694                 block->flags |= LOG_FLUSH_FLAG;
695         if (fua_bio)
696                 block->flags |= LOG_FUA_FLAG;
697         if (discard_bio)
698                 block->flags |= LOG_DISCARD_FLAG;
699         if (meta_bio)
700                 block->flags |= LOG_METADATA_FLAG;
701
702         block->sector = bio_to_dev_sectors(lc, bio->bi_iter.bi_sector);
703         block->nr_sectors = bio_to_dev_sectors(lc, bio_sectors(bio));
704
705         /* We don't need the data, just submit */
706         if (discard_bio) {
707                 WARN_ON(flush_bio || fua_bio);
708                 if (lc->device_supports_discard)
709                         goto map_bio;
710                 bio_endio(bio);
711                 return DM_MAPIO_SUBMITTED;
712         }
713
714         /* Flush bio, splice the unflushed blocks onto this list and submit */
715         if (flush_bio && !bio_sectors(bio)) {
716                 spin_lock_irq(&lc->blocks_lock);
717                 list_splice_init(&lc->unflushed_blocks, &block->list);
718                 spin_unlock_irq(&lc->blocks_lock);
719                 goto map_bio;
720         }
721
722         /*
723          * We will write this bio somewhere else way later so we need to copy
724          * the actual contents into new pages so we know the data will always be
725          * there.
726          *
727          * We do this because this could be a bio from O_DIRECT in which case we
728          * can't just hold onto the page until some later point, we have to
729          * manually copy the contents.
730          */
731         bio_for_each_segment(bv, bio, iter) {
732                 struct page *page;
733                 void *dst;
734
735                 page = alloc_page(GFP_NOIO);
736                 if (!page) {
737                         DMERR("Error allocing page");
738                         free_pending_block(lc, block);
739                         spin_lock_irq(&lc->blocks_lock);
740                         lc->logging_enabled = false;
741                         spin_unlock_irq(&lc->blocks_lock);
742                         return DM_MAPIO_KILL;
743                 }
744
745                 dst = kmap_atomic(page);
746                 memcpy_from_bvec(dst, &bv);
747                 kunmap_atomic(dst);
748                 block->vecs[i].bv_page = page;
749                 block->vecs[i].bv_len = bv.bv_len;
750                 block->vec_cnt++;
751                 i++;
752         }
753
754         /* Had a flush with data in it, weird */
755         if (flush_bio) {
756                 spin_lock_irq(&lc->blocks_lock);
757                 list_splice_init(&lc->unflushed_blocks, &block->list);
758                 spin_unlock_irq(&lc->blocks_lock);
759         }
760 map_bio:
761         normal_map_bio(ti, bio);
762         return DM_MAPIO_REMAPPED;
763 }
764
765 static int normal_end_io(struct dm_target *ti, struct bio *bio,
766                 blk_status_t *error)
767 {
768         struct log_writes_c *lc = ti->private;
769         struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
770
771         if (bio_data_dir(bio) == WRITE && pb->block) {
772                 struct pending_block *block = pb->block;
773                 unsigned long flags;
774
775                 spin_lock_irqsave(&lc->blocks_lock, flags);
776                 if (block->flags & LOG_FLUSH_FLAG) {
777                         list_splice_tail_init(&block->list, &lc->logging_blocks);
778                         list_add_tail(&block->list, &lc->logging_blocks);
779                         wake_up_process(lc->log_kthread);
780                 } else if (block->flags & LOG_FUA_FLAG) {
781                         list_add_tail(&block->list, &lc->logging_blocks);
782                         wake_up_process(lc->log_kthread);
783                 } else
784                         list_add_tail(&block->list, &lc->unflushed_blocks);
785                 spin_unlock_irqrestore(&lc->blocks_lock, flags);
786         }
787
788         return DM_ENDIO_DONE;
789 }
790
791 /*
792  * INFO format: <logged entries> <highest allocated sector>
793  */
794 static void log_writes_status(struct dm_target *ti, status_type_t type,
795                               unsigned status_flags, char *result,
796                               unsigned maxlen)
797 {
798         unsigned sz = 0;
799         struct log_writes_c *lc = ti->private;
800
801         switch (type) {
802         case STATUSTYPE_INFO:
803                 DMEMIT("%llu %llu", lc->logged_entries,
804                        (unsigned long long)lc->next_sector - 1);
805                 if (!lc->logging_enabled)
806                         DMEMIT(" logging_disabled");
807                 break;
808
809         case STATUSTYPE_TABLE:
810                 DMEMIT("%s %s", lc->dev->name, lc->logdev->name);
811                 break;
812
813         case STATUSTYPE_IMA:
814                 *result = '\0';
815                 break;
816         }
817 }
818
819 static int log_writes_prepare_ioctl(struct dm_target *ti,
820                                     struct block_device **bdev)
821 {
822         struct log_writes_c *lc = ti->private;
823         struct dm_dev *dev = lc->dev;
824
825         *bdev = dev->bdev;
826         /*
827          * Only pass ioctls through if the device sizes match exactly.
828          */
829         if (ti->len != bdev_nr_sectors(dev->bdev))
830                 return 1;
831         return 0;
832 }
833
834 static int log_writes_iterate_devices(struct dm_target *ti,
835                                       iterate_devices_callout_fn fn,
836                                       void *data)
837 {
838         struct log_writes_c *lc = ti->private;
839
840         return fn(ti, lc->dev, 0, ti->len, data);
841 }
842
843 /*
844  * Messages supported:
845  *   mark <mark data> - specify the marked data.
846  */
847 static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv,
848                               char *result, unsigned maxlen)
849 {
850         int r = -EINVAL;
851         struct log_writes_c *lc = ti->private;
852
853         if (argc != 2) {
854                 DMWARN("Invalid log-writes message arguments, expect 2 arguments, got %d", argc);
855                 return r;
856         }
857
858         if (!strcasecmp(argv[0], "mark"))
859                 r = log_mark(lc, argv[1]);
860         else
861                 DMWARN("Unrecognised log writes target message received: %s", argv[0]);
862
863         return r;
864 }
865
866 static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits)
867 {
868         struct log_writes_c *lc = ti->private;
869
870         if (!bdev_max_discard_sectors(lc->dev->bdev)) {
871                 lc->device_supports_discard = false;
872                 limits->discard_granularity = lc->sectorsize;
873                 limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
874         }
875         limits->logical_block_size = bdev_logical_block_size(lc->dev->bdev);
876         limits->physical_block_size = bdev_physical_block_size(lc->dev->bdev);
877         limits->io_min = limits->physical_block_size;
878 }
879
880 #if IS_ENABLED(CONFIG_FS_DAX)
881 static struct dax_device *log_writes_dax_pgoff(struct dm_target *ti,
882                 pgoff_t *pgoff)
883 {
884         struct log_writes_c *lc = ti->private;
885
886         *pgoff += (get_start_sect(lc->dev->bdev) >> PAGE_SECTORS_SHIFT);
887         return lc->dev->dax_dev;
888 }
889
890 static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
891                 long nr_pages, enum dax_access_mode mode, void **kaddr,
892                 pfn_t *pfn)
893 {
894         struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);
895
896         return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn);
897 }
898
899 static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
900                                           size_t nr_pages)
901 {
902         struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);
903
904         return dax_zero_page_range(dax_dev, pgoff, nr_pages << PAGE_SHIFT);
905 }
906
907 static size_t log_writes_dax_recovery_write(struct dm_target *ti,
908                 pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
909 {
910         struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);
911
912         return dax_recovery_write(dax_dev, pgoff, addr, bytes, i);
913 }
914
915 #else
/* Without CONFIG_FS_DAX the target registers no DAX callbacks. */
#define log_writes_dax_direct_access NULL
#define log_writes_dax_zero_page_range NULL
#define log_writes_dax_recovery_write NULL
919 #endif
920
921 static struct target_type log_writes_target = {
922         .name   = "log-writes",
923         .version = {1, 1, 0},
924         .module = THIS_MODULE,
925         .ctr    = log_writes_ctr,
926         .dtr    = log_writes_dtr,
927         .map    = log_writes_map,
928         .end_io = normal_end_io,
929         .status = log_writes_status,
930         .prepare_ioctl = log_writes_prepare_ioctl,
931         .message = log_writes_message,
932         .iterate_devices = log_writes_iterate_devices,
933         .io_hints = log_writes_io_hints,
934         .direct_access = log_writes_dax_direct_access,
935         .dax_zero_page_range = log_writes_dax_zero_page_range,
936         .dax_recovery_write = log_writes_dax_recovery_write,
937 };
938
939 static int __init dm_log_writes_init(void)
940 {
941         int r = dm_register_target(&log_writes_target);
942
943         if (r < 0)
944                 DMERR("register failed %d", r);
945
946         return r;
947 }
948
949 static void __exit dm_log_writes_exit(void)
950 {
951         dm_unregister_target(&log_writes_target);
952 }
953
954 module_init(dm_log_writes_init);
955 module_exit(dm_log_writes_exit);
956
957 MODULE_DESCRIPTION(DM_NAME " log writes target");
958 MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>");
959 MODULE_LICENSE("GPL");