1 /*
2  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
3  *
4  * This file is released under the GPL.
5  */
6
7 #include <linux/blkdev.h>
8 #include <linux/device-mapper.h>
9 #include <linux/delay.h>
10 #include <linux/fs.h>
11 #include <linux/init.h>
12 #include <linux/kdev_t.h>
13 #include <linux/list.h>
14 #include <linux/list_bl.h>
15 #include <linux/mempool.h>
16 #include <linux/module.h>
17 #include <linux/slab.h>
18 #include <linux/vmalloc.h>
19 #include <linux/log2.h>
20 #include <linux/dm-kcopyd.h>
21
22 #include "dm.h"
23
24 #include "dm-exception-store.h"
25
26 #define DM_MSG_PREFIX "snapshots"
27
28 static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
29
30 #define dm_target_is_snapshot_merge(ti) \
31         ((ti)->type->name == dm_snapshot_merge_target_name)
32
33 /*
34  * The size of the mempool used to track chunks in use.
35  */
36 #define MIN_IOS 256
37
38 #define DM_TRACKED_CHUNK_HASH_SIZE      16
39 #define DM_TRACKED_CHUNK_HASH(x)        ((unsigned long)(x) & \
40                                          (DM_TRACKED_CHUNK_HASH_SIZE - 1))
41
42 struct dm_exception_table {
43         uint32_t hash_mask;
44         unsigned hash_shift;
45         struct hlist_bl_head *table;
46 };
47
48 struct dm_snapshot {
49         struct rw_semaphore lock;
50
51         struct dm_dev *origin;
52         struct dm_dev *cow;
53
54         struct dm_target *ti;
55
56         /* List of snapshots per origin */
57         struct list_head list;
58
59         /*
60          * You can't use a snapshot if this is 0 (e.g. if full).
61          * A snapshot-merge target never clears this.
62          */
63         int valid;
64
65         /*
66          * The snapshot overflowed because of a write to the snapshot device.
67          * We don't have to invalidate the snapshot in this case, but we need
68          * to prevent further writes.
69          */
70         int snapshot_overflowed;
71
72         /* Origin writes don't trigger exceptions until this is set */
73         int active;
74
75         atomic_t pending_exceptions_count;
76
77         spinlock_t pe_allocation_lock;
78
79         /* Protected by "pe_allocation_lock" */
80         sector_t exception_start_sequence;
81
82         /* Protected by kcopyd single-threaded callback */
83         sector_t exception_complete_sequence;
84
85         /*
86          * A list of pending exceptions that completed out of order.
87          * Protected by kcopyd single-threaded callback.
88          */
89         struct rb_root out_of_order_tree;
90
91         mempool_t pending_pool;
92
93         struct dm_exception_table pending;
94         struct dm_exception_table complete;
95
96         /*
97          * pe_lock protects all pending_exception operations and accesses,
98          * as well as the snapshot_bios list.
99          */
100         spinlock_t pe_lock;
101
102         /* Chunks with outstanding reads */
103         spinlock_t tracked_chunk_lock;
104         struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
105
106         /* The on disk metadata handler */
107         struct dm_exception_store *store;
108
109         unsigned in_progress;
110         struct wait_queue_head in_progress_wait;
111
112         struct dm_kcopyd_client *kcopyd_client;
113
114         /* Wait for events based on state_bits */
115         unsigned long state_bits;
116
117         /* Range of chunks currently being merged. */
118         chunk_t first_merging_chunk;
119         int num_merging_chunks;
120
121         /*
122          * The merge operation failed if this flag is set.
123          * Failure modes are handled as follows:
124          * - I/O error reading the header
125          *      => don't load the target; abort.
126          * - Header does not have "valid" flag set
127          *      => use the origin; forget about the snapshot.
128          * - I/O error when reading exceptions
129          *      => don't load the target; abort.
130          *         (We can't use the intermediate origin state.)
131          * - I/O error while merging
132          *      => stop merging; set merge_failed; process I/O normally.
133          */
134         bool merge_failed:1;
135
136         bool discard_zeroes_cow:1;
137         bool discard_passdown_origin:1;
138
139         /*
140          * Incoming bios that overlap with chunks being merged must wait
141          * for them to be committed.
142          */
143         struct bio_list bios_queued_during_merge;
144 };
145
146 /*
147  * state_bits:
148  *   RUNNING_MERGE  - Merge operation is in progress.
149  *   SHUTDOWN_MERGE - Set to signal that merge needs to be stopped;
150  *                    cleared afterwards.
151  */
152 #define RUNNING_MERGE          0
153 #define SHUTDOWN_MERGE         1
154
155 /*
156  * Maximum number of chunks being copied on write.
157  *
158  * The value was decided experimentally as a trade-off between memory
159  * consumption, stalling the kernel's workqueues and maintaining a high enough
160  * throughput.
161  */
162 #define DEFAULT_COW_THRESHOLD 2048
163
164 static unsigned cow_threshold = DEFAULT_COW_THRESHOLD;
165 module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644);
166 MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write");
167
168 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
169                 "A percentage of time allocated for copy on write");
170
171 struct dm_dev *dm_snap_origin(struct dm_snapshot *s)
172 {
173         return s->origin;
174 }
175 EXPORT_SYMBOL(dm_snap_origin);
176
177 struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
178 {
179         return s->cow;
180 }
181 EXPORT_SYMBOL(dm_snap_cow);
182
183 static sector_t chunk_to_sector(struct dm_exception_store *store,
184                                 chunk_t chunk)
185 {
186         return chunk << store->chunk_shift;
187 }
188
189 static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
190 {
191         /*
192          * There is only ever one instance of a particular block
193          * device so we can compare pointers safely.
194          */
195         return lhs == rhs;
196 }
197
198 struct dm_snap_pending_exception {
199         struct dm_exception e;
200
201         /*
202          * Origin buffers waiting for this to complete are held
203          * in a bio list
204          */
205         struct bio_list origin_bios;
206         struct bio_list snapshot_bios;
207
208         /* Pointer back to snapshot context */
209         struct dm_snapshot *snap;
210
211         /*
212          * 1 indicates the exception has already been sent to
213          * kcopyd.
214          */
215         int started;
216
217         /* There was a copying error. */
218         int copy_error;
219
220         /* A sequence number, used for in-order completion. */
221         sector_t exception_sequence;
222
223         struct rb_node out_of_order_node;
224
225         /*
226          * For writing a complete chunk, bypassing the copy.
227          */
228         struct bio *full_bio;
229         bio_end_io_t *full_bio_end_io;
230 };
231
232 /*
233  * Hash table mapping origin volumes to lists of snapshots and
234  * a lock to protect it
235  */
236 static struct kmem_cache *exception_cache;
237 static struct kmem_cache *pending_cache;
238
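/*
 * Chunks with reads in flight are tracked in tracked_chunk_hash via each
 * bio's per-bio data, so that exception completion and merging can wait
 * for conflicting reads to drain (see __check_for_conflicting_io()).
 */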
239 struct dm_snap_tracked_chunk {
240         struct hlist_node node;
241         chunk_t chunk;
242 };
243
244 static void init_tracked_chunk(struct bio *bio)
245 {
246         struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
247         INIT_HLIST_NODE(&c->node);
248 }
249
250 static bool is_bio_tracked(struct bio *bio)
251 {
252         struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
253         return !hlist_unhashed(&c->node);
254 }
255
256 static void track_chunk(struct dm_snapshot *s, struct bio *bio, chunk_t chunk)
257 {
258         struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
259
260         c->chunk = chunk;
261
262         spin_lock_irq(&s->tracked_chunk_lock);
263         hlist_add_head(&c->node,
264                        &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]);
265         spin_unlock_irq(&s->tracked_chunk_lock);
266 }
267
268 static void stop_tracking_chunk(struct dm_snapshot *s, struct bio *bio)
269 {
270         struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
271         unsigned long flags;
272
273         spin_lock_irqsave(&s->tracked_chunk_lock, flags);
274         hlist_del(&c->node);
275         spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
276 }
277
278 static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
279 {
280         struct dm_snap_tracked_chunk *c;
281         int found = 0;
282
283         spin_lock_irq(&s->tracked_chunk_lock);
284
285         hlist_for_each_entry(c,
286             &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) {
287                 if (c->chunk == chunk) {
288                         found = 1;
289                         break;
290                 }
291         }
292
293         spin_unlock_irq(&s->tracked_chunk_lock);
294
295         return found;
296 }
297
298 /*
299  * This conflicting I/O is extremely improbable in the caller,
300  * so msleep(1) is sufficient and there is no need for a wait queue.
301  */
302 static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk)
303 {
304         while (__chunk_is_tracked(s, chunk))
305                 msleep(1);
306 }
307
308 /*
309  * One of these per registered origin, held in the snapshot_origins hash
310  */
311 struct origin {
312         /* The origin device */
313         struct block_device *bdev;
314
315         struct list_head hash_list;
316
317         /* List of snapshots for this origin */
318         struct list_head snapshots;
319 };
320
321 /*
322  * This structure is allocated for each origin target
323  */
324 struct dm_origin {
325         struct dm_dev *dev;
326         struct dm_target *ti;
327         unsigned split_boundary;
328         struct list_head hash_list;
329 };
330
331 /*
332  * Size of the hash table for origin volumes. If we make this
333  * the size of the minors list then it should be nearly perfect
334  */
335 #define ORIGIN_HASH_SIZE 256
336 #define ORIGIN_MASK      0xFF
337 static struct list_head *_origins;
338 static struct list_head *_dm_origins;
339 static struct rw_semaphore _origins_lock;
340
341 static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done);
342 static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock);
343 static uint64_t _pending_exceptions_done_count;
344
345 static int init_origin_hash(void)
346 {
347         int i;
348
349         _origins = kmalloc_array(ORIGIN_HASH_SIZE, sizeof(struct list_head),
350                                  GFP_KERNEL);
351         if (!_origins) {
352                 DMERR("unable to allocate memory for _origins");
353                 return -ENOMEM;
354         }
355         for (i = 0; i < ORIGIN_HASH_SIZE; i++)
356                 INIT_LIST_HEAD(_origins + i);
357
358         _dm_origins = kmalloc_array(ORIGIN_HASH_SIZE,
359                                     sizeof(struct list_head),
360                                     GFP_KERNEL);
361         if (!_dm_origins) {
362                 DMERR("unable to allocate memory for _dm_origins");
363                 kfree(_origins);
364                 return -ENOMEM;
365         }
366         for (i = 0; i < ORIGIN_HASH_SIZE; i++)
367                 INIT_LIST_HEAD(_dm_origins + i);
368
369         init_rwsem(&_origins_lock);
370
371         return 0;
372 }
373
374 static void exit_origin_hash(void)
375 {
376         kfree(_origins);
377         kfree(_dm_origins);
378 }
379
380 static unsigned origin_hash(struct block_device *bdev)
381 {
382         return bdev->bd_dev & ORIGIN_MASK;
383 }
384
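/*
 * Find the origin structure for a given origin block device.
 * The caller is expected to hold _origins_lock.
 */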
385 static struct origin *__lookup_origin(struct block_device *origin)
386 {
387         struct list_head *ol;
388         struct origin *o;
389
390         ol = &_origins[origin_hash(origin)];
391         list_for_each_entry (o, ol, hash_list)
392                 if (bdev_equal(o->bdev, origin))
393                         return o;
394
395         return NULL;
396 }
397
398 static void __insert_origin(struct origin *o)
399 {
400         struct list_head *sl = &_origins[origin_hash(o->bdev)];
401         list_add_tail(&o->hash_list, sl);
402 }
403
404 static struct dm_origin *__lookup_dm_origin(struct block_device *origin)
405 {
406         struct list_head *ol;
407         struct dm_origin *o;
408
409         ol = &_dm_origins[origin_hash(origin)];
410         list_for_each_entry (o, ol, hash_list)
411                 if (bdev_equal(o->dev->bdev, origin))
412                         return o;
413
414         return NULL;
415 }
416
417 static void __insert_dm_origin(struct dm_origin *o)
418 {
419         struct list_head *sl = &_dm_origins[origin_hash(o->dev->bdev)];
420         list_add_tail(&o->hash_list, sl);
421 }
422
423 static void __remove_dm_origin(struct dm_origin *o)
424 {
425         list_del(&o->hash_list);
426 }
427
428 /*
429  * _origins_lock must be held when calling this function.
430  * Returns number of snapshots registered using the supplied cow device, plus:
431  * snap_src - a snapshot suitable for use as a source of exception handover
432  * snap_dest - a snapshot capable of receiving exception handover.
433  * snap_merge - an existing snapshot-merge target linked to the same origin.
434  *   There can be at most one snapshot-merge target. The parameter is optional.
435  *
436  * Possible return values and states of snap_src and snap_dest.
437  *   0: NULL, NULL  - first new snapshot
438  *   1: snap_src, NULL - normal snapshot
439  *   2: snap_src, snap_dest  - waiting for handover
440  *   2: snap_src, NULL - handed over, waiting for old to be deleted
441  *   1: NULL, snap_dest - source got destroyed without handover
442  */
443 static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
444                                         struct dm_snapshot **snap_src,
445                                         struct dm_snapshot **snap_dest,
446                                         struct dm_snapshot **snap_merge)
447 {
448         struct dm_snapshot *s;
449         struct origin *o;
450         int count = 0;
451         int active;
452
453         o = __lookup_origin(snap->origin->bdev);
454         if (!o)
455                 goto out;
456
457         list_for_each_entry(s, &o->snapshots, list) {
458                 if (dm_target_is_snapshot_merge(s->ti) && snap_merge)
459                         *snap_merge = s;
460                 if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
461                         continue;
462
463                 down_read(&s->lock);
464                 active = s->active;
465                 up_read(&s->lock);
466
467                 if (active) {
468                         if (snap_src)
469                                 *snap_src = s;
470                 } else if (snap_dest)
471                         *snap_dest = s;
472
473                 count++;
474         }
475
476 out:
477         return count;
478 }
479
480 /*
481  * On success, returns 1 if this snapshot is a handover destination,
482  * otherwise returns 0.
483  */
484 static int __validate_exception_handover(struct dm_snapshot *snap)
485 {
486         struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
487         struct dm_snapshot *snap_merge = NULL;
488
489         /* Does snapshot need exceptions handed over to it? */
490         if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest,
491                                           &snap_merge) == 2) ||
492             snap_dest) {
493                 snap->ti->error = "Snapshot cow pairing for exception "
494                                   "table handover failed";
495                 return -EINVAL;
496         }
497
498         /*
499          * If no snap_src was found, snap cannot become a handover
500          * destination.
501          */
502         if (!snap_src)
503                 return 0;
504
505         /*
506          * Non-snapshot-merge handover?
507          */
508         if (!dm_target_is_snapshot_merge(snap->ti))
509                 return 1;
510
511         /*
512          * Do not allow more than one merging snapshot.
513          */
514         if (snap_merge) {
515                 snap->ti->error = "A snapshot is already merging.";
516                 return -EINVAL;
517         }
518
519         if (!snap_src->store->type->prepare_merge ||
520             !snap_src->store->type->commit_merge) {
521                 snap->ti->error = "Snapshot exception store does not "
522                                   "support snapshot-merge.";
523                 return -EINVAL;
524         }
525
526         return 1;
527 }
528
529 static void __insert_snapshot(struct origin *o, struct dm_snapshot *s)
530 {
531         struct dm_snapshot *l;
532
533         /* Sort the list by chunk size, largest first, smallest last */
534         list_for_each_entry(l, &o->snapshots, list)
535                 if (l->store->chunk_size < s->store->chunk_size)
536                         break;
537         list_add_tail(&s->list, &l->list);
538 }
539
540 /*
541  * Make a note of the snapshot and its origin so we can look it
542  * up when the origin has a write on it.
543  *
544  * Also validate snapshot exception store handovers.
545  * On success, returns 1 if this registration is a handover destination,
546  * otherwise returns 0.
547  */
548 static int register_snapshot(struct dm_snapshot *snap)
549 {
550         struct origin *o, *new_o = NULL;
551         struct block_device *bdev = snap->origin->bdev;
552         int r = 0;
553
554         new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
555         if (!new_o)
556                 return -ENOMEM;
557
558         down_write(&_origins_lock);
559
560         r = __validate_exception_handover(snap);
561         if (r < 0) {
562                 kfree(new_o);
563                 goto out;
564         }
565
566         o = __lookup_origin(bdev);
567         if (o)
568                 kfree(new_o);
569         else {
570                 /* New origin */
571                 o = new_o;
572
573                 /* Initialise the struct */
574                 INIT_LIST_HEAD(&o->snapshots);
575                 o->bdev = bdev;
576
577                 __insert_origin(o);
578         }
579
580         __insert_snapshot(o, snap);
581
582 out:
583         up_write(&_origins_lock);
584
585         return r;
586 }
587
588 /*
589  * Move snapshot to correct place in list according to chunk size.
590  */
591 static void reregister_snapshot(struct dm_snapshot *s)
592 {
593         struct block_device *bdev = s->origin->bdev;
594
595         down_write(&_origins_lock);
596
597         list_del(&s->list);
598         __insert_snapshot(__lookup_origin(bdev), s);
599
600         up_write(&_origins_lock);
601 }
602
603 static void unregister_snapshot(struct dm_snapshot *s)
604 {
605         struct origin *o;
606
607         down_write(&_origins_lock);
608         o = __lookup_origin(s->origin->bdev);
609
610         list_del(&s->list);
611         if (o && list_empty(&o->snapshots)) {
612                 list_del(&o->hash_list);
613                 kfree(o);
614         }
615
616         up_write(&_origins_lock);
617 }
618
619 /*
620  * Implementation of the exception hash tables.
621  * The lowest hash_shift bits of the chunk number are ignored, allowing
622  * some consecutive chunks to be grouped together.
623  */
624 static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk);
625
626 /* Lock to protect access to the completed and pending exception hash tables. */
627 struct dm_exception_table_lock {
628         struct hlist_bl_head *complete_slot;
629         struct hlist_bl_head *pending_slot;
630 };
631
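/*
 * The exception tables use per-bucket bit spinlocks (hlist_bl).  For a
 * given chunk the completed and pending buckets are always locked
 * together, and always in the same order, so checks and updates that
 * span both tables stay consistent.
 */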
632 static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
633                                          struct dm_exception_table_lock *lock)
634 {
635         struct dm_exception_table *complete = &s->complete;
636         struct dm_exception_table *pending = &s->pending;
637
638         lock->complete_slot = &complete->table[exception_hash(complete, chunk)];
639         lock->pending_slot = &pending->table[exception_hash(pending, chunk)];
640 }
641
642 static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
643 {
644         hlist_bl_lock(lock->complete_slot);
645         hlist_bl_lock(lock->pending_slot);
646 }
647
648 static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
649 {
650         hlist_bl_unlock(lock->pending_slot);
651         hlist_bl_unlock(lock->complete_slot);
652 }
653
654 static int dm_exception_table_init(struct dm_exception_table *et,
655                                    uint32_t size, unsigned hash_shift)
656 {
657         unsigned int i;
658
659         et->hash_shift = hash_shift;
660         et->hash_mask = size - 1;
661         et->table = kvmalloc_array(size, sizeof(struct hlist_bl_head),
662                                    GFP_KERNEL);
663         if (!et->table)
664                 return -ENOMEM;
665
666         for (i = 0; i < size; i++)
667                 INIT_HLIST_BL_HEAD(et->table + i);
668
669         return 0;
670 }
671
672 static void dm_exception_table_exit(struct dm_exception_table *et,
673                                     struct kmem_cache *mem)
674 {
675         struct hlist_bl_head *slot;
676         struct dm_exception *ex;
677         struct hlist_bl_node *pos, *n;
678         int i, size;
679
680         size = et->hash_mask + 1;
681         for (i = 0; i < size; i++) {
682                 slot = et->table + i;
683
684                 hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list)
685                         kmem_cache_free(mem, ex);
686         }
687
688         kvfree(et->table);
689 }
690
691 static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
692 {
693         return (chunk >> et->hash_shift) & et->hash_mask;
694 }
695
696 static void dm_remove_exception(struct dm_exception *e)
697 {
698         hlist_bl_del(&e->hash_list);
699 }
700
701 /*
702  * Return the exception data for a sector, or NULL if not
703  * remapped.
704  */
705 static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
706                                                 chunk_t chunk)
707 {
708         struct hlist_bl_head *slot;
709         struct hlist_bl_node *pos;
710         struct dm_exception *e;
711
712         slot = &et->table[exception_hash(et, chunk)];
713         hlist_bl_for_each_entry(e, pos, slot, hash_list)
714                 if (chunk >= e->old_chunk &&
715                     chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
716                         return e;
717
718         return NULL;
719 }
720
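/*
 * Completed exceptions come from exception_cache.  If a GFP_NOIO
 * allocation fails, retry with GFP_ATOMIC before giving up.
 */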
721 static struct dm_exception *alloc_completed_exception(gfp_t gfp)
722 {
723         struct dm_exception *e;
724
725         e = kmem_cache_alloc(exception_cache, gfp);
726         if (!e && gfp == GFP_NOIO)
727                 e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
728
729         return e;
730 }
731
732 static void free_completed_exception(struct dm_exception *e)
733 {
734         kmem_cache_free(exception_cache, e);
735 }
736
737 static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s)
738 {
739         struct dm_snap_pending_exception *pe = mempool_alloc(&s->pending_pool,
740                                                              GFP_NOIO);
741
742         atomic_inc(&s->pending_exceptions_count);
743         pe->snap = s;
744
745         return pe;
746 }
747
748 static void free_pending_exception(struct dm_snap_pending_exception *pe)
749 {
750         struct dm_snapshot *s = pe->snap;
751
752         mempool_free(pe, &s->pending_pool);
753         smp_mb__before_atomic();
754         atomic_dec(&s->pending_exceptions_count);
755 }
756
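/*
 * Insert a completed exception into its hash bucket.  Buckets are kept
 * sorted by old_chunk and, if the table was initialised with a non-zero
 * hash_shift, exceptions for adjacent chunks are coalesced into a single
 * entry using the consecutive chunk count.
 */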
757 static void dm_insert_exception(struct dm_exception_table *eh,
758                                 struct dm_exception *new_e)
759 {
760         struct hlist_bl_head *l;
761         struct hlist_bl_node *pos;
762         struct dm_exception *e = NULL;
763
764         l = &eh->table[exception_hash(eh, new_e->old_chunk)];
765
766         /* Add immediately if this table doesn't support consecutive chunks */
767         if (!eh->hash_shift)
768                 goto out;
769
770         /* List is ordered by old_chunk */
771         hlist_bl_for_each_entry(e, pos, l, hash_list) {
772                 /* Insert after an existing chunk? */
773                 if (new_e->old_chunk == (e->old_chunk +
774                                          dm_consecutive_chunk_count(e) + 1) &&
775                     new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
776                                          dm_consecutive_chunk_count(e) + 1)) {
777                         dm_consecutive_chunk_count_inc(e);
778                         free_completed_exception(new_e);
779                         return;
780                 }
781
782                 /* Insert before an existing chunk? */
783                 if (new_e->old_chunk == (e->old_chunk - 1) &&
784                     new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) {
785                         dm_consecutive_chunk_count_inc(e);
786                         e->old_chunk--;
787                         e->new_chunk--;
788                         free_completed_exception(new_e);
789                         return;
790                 }
791
792                 if (new_e->old_chunk < e->old_chunk)
793                         break;
794         }
795
796 out:
797         if (!e) {
798                 /*
799                  * Either the table doesn't support consecutive chunks or slot
800                  * l is empty.
801                  */
802                 hlist_bl_add_head(&new_e->hash_list, l);
803         } else if (new_e->old_chunk < e->old_chunk) {
804                 /* Add before an existing exception */
805                 hlist_bl_add_before(&new_e->hash_list, &e->hash_list);
806         } else {
807                 /* Add to l's tail: e is the last exception in this slot */
808                 hlist_bl_add_behind(&new_e->hash_list, &e->hash_list);
809         }
810 }
811
812 /*
813  * Callback used by the exception stores to load exceptions when
814  * initialising.
815  */
816 static int dm_add_exception(void *context, chunk_t old, chunk_t new)
817 {
818         struct dm_exception_table_lock lock;
819         struct dm_snapshot *s = context;
820         struct dm_exception *e;
821
822         e = alloc_completed_exception(GFP_KERNEL);
823         if (!e)
824                 return -ENOMEM;
825
826         e->old_chunk = old;
827
828         /* Consecutive_count is implicitly initialised to zero */
829         e->new_chunk = new;
830
831         /*
832          * Although there is no need to lock access to the exception tables
833          * here, if we don't then hlist_bl_add_head(), called by
834          * dm_insert_exception(), will complain about accessing the
835          * corresponding list without locking it first.
836          */
837         dm_exception_table_lock_init(s, old, &lock);
838
839         dm_exception_table_lock(&lock);
840         dm_insert_exception(&s->complete, e);
841         dm_exception_table_unlock(&lock);
842
843         return 0;
844 }
845
846 /*
847  * Return a minimum chunk size of all snapshots that have the specified origin.
848  * Return zero if the origin has no snapshots.
849  */
850 static uint32_t __minimum_chunk_size(struct origin *o)
851 {
852         struct dm_snapshot *snap;
853         unsigned chunk_size = rounddown_pow_of_two(UINT_MAX);
854
855         if (o)
856                 list_for_each_entry(snap, &o->snapshots, list)
857                         chunk_size = min_not_zero(chunk_size,
858                                                   snap->store->chunk_size);
859
860         return (uint32_t) chunk_size;
861 }
862
863 /*
864  * Hard coded magic.
865  */
866 static int calc_max_buckets(void)
867 {
868         /* use a fixed size of 2MB */
869         unsigned long mem = 2 * 1024 * 1024;
870         mem /= sizeof(struct hlist_bl_head);
871
872         return mem;
873 }
874
875 /*
876  * Allocate room for a suitable hash table.
877  */
878 static int init_hash_tables(struct dm_snapshot *s)
879 {
880         sector_t hash_size, cow_dev_size, max_buckets;
881
882         /*
883          * Calculate based on the size of the original volume or
884          * the COW volume...
885          */
886         cow_dev_size = get_dev_size(s->cow->bdev);
887         max_buckets = calc_max_buckets();
888
889         hash_size = cow_dev_size >> s->store->chunk_shift;
890         hash_size = min(hash_size, max_buckets);
891
892         if (hash_size < 64)
893                 hash_size = 64;
894         hash_size = rounddown_pow_of_two(hash_size);
895         if (dm_exception_table_init(&s->complete, hash_size,
896                                     DM_CHUNK_CONSECUTIVE_BITS))
897                 return -ENOMEM;
898
899         /*
900          * Allocate hash table for in-flight exceptions
901          * Make this smaller than the real hash table
902          */
903         hash_size >>= 3;
904         if (hash_size < 64)
905                 hash_size = 64;
906
907         if (dm_exception_table_init(&s->pending, hash_size, 0)) {
908                 dm_exception_table_exit(&s->complete, exception_cache);
909                 return -ENOMEM;
910         }
911
912         return 0;
913 }
914
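/* Clear RUNNING_MERGE and wake up anyone waiting in stop_merge(). */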
915 static void merge_shutdown(struct dm_snapshot *s)
916 {
917         clear_bit_unlock(RUNNING_MERGE, &s->state_bits);
918         smp_mb__after_atomic();
919         wake_up_bit(&s->state_bits, RUNNING_MERGE);
920 }
921
922 static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s)
923 {
924         s->first_merging_chunk = 0;
925         s->num_merging_chunks = 0;
926
927         return bio_list_get(&s->bios_queued_during_merge);
928 }
929
930 /*
931  * Remove one chunk from the index of completed exceptions.
932  */
933 static int __remove_single_exception_chunk(struct dm_snapshot *s,
934                                            chunk_t old_chunk)
935 {
936         struct dm_exception *e;
937
938         e = dm_lookup_exception(&s->complete, old_chunk);
939         if (!e) {
940                 DMERR("Corruption detected: exception for block %llu is "
941                       "on disk but not in memory",
942                       (unsigned long long)old_chunk);
943                 return -EINVAL;
944         }
945
946         /*
947          * If this is the only chunk using this exception, remove exception.
948          */
949         if (!dm_consecutive_chunk_count(e)) {
950                 dm_remove_exception(e);
951                 free_completed_exception(e);
952                 return 0;
953         }
954
955         /*
956          * The chunk may be either at the beginning or the end of a
957          * group of consecutive chunks - never in the middle.  We are
958          * removing chunks in the opposite order to that in which they
959          * were added, so this should always be true.
960          * Decrement the consecutive chunk counter and adjust the
961          * starting point if necessary.
962          */
963         if (old_chunk == e->old_chunk) {
964                 e->old_chunk++;
965                 e->new_chunk++;
966         } else if (old_chunk != e->old_chunk +
967                    dm_consecutive_chunk_count(e)) {
968                 DMERR("Attempt to merge block %llu from the "
969                       "middle of a chunk range [%llu - %llu]",
970                       (unsigned long long)old_chunk,
971                       (unsigned long long)e->old_chunk,
972                       (unsigned long long)
973                       e->old_chunk + dm_consecutive_chunk_count(e));
974                 return -EINVAL;
975         }
976
977         dm_consecutive_chunk_count_dec(e);
978
979         return 0;
980 }
981
982 static void flush_bios(struct bio *bio);
983
984 static int remove_single_exception_chunk(struct dm_snapshot *s)
985 {
986         struct bio *b = NULL;
987         int r;
988         chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
989
990         down_write(&s->lock);
991
992         /*
993          * Process chunks (and associated exceptions) in reverse order
994          * so that dm_consecutive_chunk_count_dec() accounting works.
995          */
996         do {
997                 r = __remove_single_exception_chunk(s, old_chunk);
998                 if (r)
999                         goto out;
1000         } while (old_chunk-- > s->first_merging_chunk);
1001
1002         b = __release_queued_bios_after_merge(s);
1003
1004 out:
1005         up_write(&s->lock);
1006         if (b)
1007                 flush_bios(b);
1008
1009         return r;
1010 }
1011
1012 static int origin_write_extent(struct dm_snapshot *merging_snap,
1013                                sector_t sector, unsigned chunk_size);
1014
1015 static void merge_callback(int read_err, unsigned long write_err,
1016                            void *context);
1017
1018 static uint64_t read_pending_exceptions_done_count(void)
1019 {
1020         uint64_t pending_exceptions_done;
1021
1022         spin_lock(&_pending_exceptions_done_spinlock);
1023         pending_exceptions_done = _pending_exceptions_done_count;
1024         spin_unlock(&_pending_exceptions_done_spinlock);
1025
1026         return pending_exceptions_done;
1027 }
1028
1029 static void increment_pending_exceptions_done_count(void)
1030 {
1031         spin_lock(&_pending_exceptions_done_spinlock);
1032         _pending_exceptions_done_count++;
1033         spin_unlock(&_pending_exceptions_done_spinlock);
1034
1035         wake_up_all(&_pending_exceptions_done);
1036 }
1037
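/*
 * Copy the next run of contiguous exceptions back to the origin using a
 * single kcopyd request.  Merging continues from merge_callback() until
 * the exception store has nothing left to merge or an error occurs.
 */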
1038 static void snapshot_merge_next_chunks(struct dm_snapshot *s)
1039 {
1040         int i, linear_chunks;
1041         chunk_t old_chunk, new_chunk;
1042         struct dm_io_region src, dest;
1043         sector_t io_size;
1044         uint64_t previous_count;
1045
1046         BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits));
1047         if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits)))
1048                 goto shut;
1049
1050         /*
1051          * valid flag never changes during merge, so no lock required.
1052          */
1053         if (!s->valid) {
1054                 DMERR("Snapshot is invalid: can't merge");
1055                 goto shut;
1056         }
1057
1058         linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk,
1059                                                       &new_chunk);
1060         if (linear_chunks <= 0) {
1061                 if (linear_chunks < 0) {
1062                         DMERR("Read error in exception store: "
1063                               "shutting down merge");
1064                         down_write(&s->lock);
1065                         s->merge_failed = true;
1066                         up_write(&s->lock);
1067                 }
1068                 goto shut;
1069         }
1070
1071         /* Adjust old_chunk and new_chunk to reflect start of linear region */
1072         old_chunk = old_chunk + 1 - linear_chunks;
1073         new_chunk = new_chunk + 1 - linear_chunks;
1074
1075         /*
1076          * Use one (potentially large) I/O to copy all 'linear_chunks'
1077          * from the exception store to the origin
1078          */
1079         io_size = linear_chunks * s->store->chunk_size;
1080
1081         dest.bdev = s->origin->bdev;
1082         dest.sector = chunk_to_sector(s->store, old_chunk);
1083         dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector);
1084
1085         src.bdev = s->cow->bdev;
1086         src.sector = chunk_to_sector(s->store, new_chunk);
1087         src.count = dest.count;
1088
1089         /*
1090          * Reallocate any exceptions needed in other snapshots then
1091          * wait for the pending exceptions to complete.
1092          * Each time any pending exception (globally on the system)
1093          * completes we are woken and repeat the process to find out
1094          * if we can proceed.  While this may not seem a particularly
1095          * efficient algorithm, it is not expected to have any
1096          * significant impact on performance.
1097          */
1098         previous_count = read_pending_exceptions_done_count();
1099         while (origin_write_extent(s, dest.sector, io_size)) {
1100                 wait_event(_pending_exceptions_done,
1101                            (read_pending_exceptions_done_count() !=
1102                             previous_count));
1103                 /* Retry after the wait, until all exceptions are done. */
1104                 previous_count = read_pending_exceptions_done_count();
1105         }
1106
1107         down_write(&s->lock);
1108         s->first_merging_chunk = old_chunk;
1109         s->num_merging_chunks = linear_chunks;
1110         up_write(&s->lock);
1111
1112         /* Wait until writes to all 'linear_chunks' drain */
1113         for (i = 0; i < linear_chunks; i++)
1114                 __check_for_conflicting_io(s, old_chunk + i);
1115
1116         dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s);
1117         return;
1118
1119 shut:
1120         merge_shutdown(s);
1121 }
1122
1123 static void error_bios(struct bio *bio);
1124
1125 static void merge_callback(int read_err, unsigned long write_err, void *context)
1126 {
1127         struct dm_snapshot *s = context;
1128         struct bio *b = NULL;
1129
1130         if (read_err || write_err) {
1131                 if (read_err)
1132                         DMERR("Read error: shutting down merge.");
1133                 else
1134                         DMERR("Write error: shutting down merge.");
1135                 goto shut;
1136         }
1137
1138         if (blkdev_issue_flush(s->origin->bdev) < 0) {
1139                 DMERR("Flush after merge failed: shutting down merge");
1140                 goto shut;
1141         }
1142
1143         if (s->store->type->commit_merge(s->store,
1144                                          s->num_merging_chunks) < 0) {
1145                 DMERR("Write error in exception store: shutting down merge");
1146                 goto shut;
1147         }
1148
1149         if (remove_single_exception_chunk(s) < 0)
1150                 goto shut;
1151
1152         snapshot_merge_next_chunks(s);
1153
1154         return;
1155
1156 shut:
1157         down_write(&s->lock);
1158         s->merge_failed = true;
1159         b = __release_queued_bios_after_merge(s);
1160         up_write(&s->lock);
1161         error_bios(b);
1162
1163         merge_shutdown(s);
1164 }
1165
1166 static void start_merge(struct dm_snapshot *s)
1167 {
1168         if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits))
1169                 snapshot_merge_next_chunks(s);
1170 }
1171
1172 /*
1173  * Stop the merging process and wait until it finishes.
1174  */
1175 static void stop_merge(struct dm_snapshot *s)
1176 {
1177         set_bit(SHUTDOWN_MERGE, &s->state_bits);
1178         wait_on_bit(&s->state_bits, RUNNING_MERGE, TASK_UNINTERRUPTIBLE);
1179         clear_bit(SHUTDOWN_MERGE, &s->state_bits);
1180 }
1181
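/*
 * Parse the optional feature arguments of the snapshot target:
 * discard_zeroes_cow and discard_passdown_origin (the latter requires
 * the former).
 */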
1182 static int parse_snapshot_features(struct dm_arg_set *as, struct dm_snapshot *s,
1183                                    struct dm_target *ti)
1184 {
1185         int r;
1186         unsigned argc;
1187         const char *arg_name;
1188
1189         static const struct dm_arg _args[] = {
1190                 {0, 2, "Invalid number of feature arguments"},
1191         };
1192
1193         /*
1194          * No feature arguments supplied.
1195          */
1196         if (!as->argc)
1197                 return 0;
1198
1199         r = dm_read_arg_group(_args, as, &argc, &ti->error);
1200         if (r)
1201                 return -EINVAL;
1202
1203         while (argc && !r) {
1204                 arg_name = dm_shift_arg(as);
1205                 argc--;
1206
1207                 if (!strcasecmp(arg_name, "discard_zeroes_cow"))
1208                         s->discard_zeroes_cow = true;
1209
1210                 else if (!strcasecmp(arg_name, "discard_passdown_origin"))
1211                         s->discard_passdown_origin = true;
1212
1213                 else {
1214                         ti->error = "Unrecognised feature requested";
1215                         r = -EINVAL;
1216                         break;
1217                 }
1218         }
1219
1220         if (!s->discard_zeroes_cow && s->discard_passdown_origin) {
1221                 /*
1222                  * TODO: really these are disjoint... but ti->num_discard_bios
1223                  * and dm_bio_get_target_bio_nr() require rigid constraints.
1224                  */
1225                 ti->error = "discard_passdown_origin feature depends on discard_zeroes_cow";
1226                 r = -EINVAL;
1227         }
1228
1229         return r;
1230 }
1231
1232 /*
1233  * Construct a snapshot mapping:
1234  * <origin_dev> <COW-dev> <p|po|n> <chunk-size> [<# feature args> [<arg>]*]
1235  */
1236 static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1237 {
1238         struct dm_snapshot *s;
1239         struct dm_arg_set as;
1240         int i;
1241         int r = -EINVAL;
1242         char *origin_path, *cow_path;
1243         dev_t origin_dev, cow_dev;
1244         unsigned args_used, num_flush_bios = 1;
1245         fmode_t origin_mode = FMODE_READ;
1246
1247         if (argc < 4) {
1248                 ti->error = "requires 4 or more arguments";
1249                 r = -EINVAL;
1250                 goto bad;
1251         }
1252
1253         if (dm_target_is_snapshot_merge(ti)) {
1254                 num_flush_bios = 2;
1255                 origin_mode = FMODE_WRITE;
1256         }
1257
1258         s = kzalloc(sizeof(*s), GFP_KERNEL);
1259         if (!s) {
1260                 ti->error = "Cannot allocate private snapshot structure";
1261                 r = -ENOMEM;
1262                 goto bad;
1263         }
1264
1265         as.argc = argc;
1266         as.argv = argv;
1267         dm_consume_args(&as, 4);
1268         r = parse_snapshot_features(&as, s, ti);
1269         if (r)
1270                 goto bad_features;
1271
1272         origin_path = argv[0];
1273         argv++;
1274         argc--;
1275
1276         r = dm_get_device(ti, origin_path, origin_mode, &s->origin);
1277         if (r) {
1278                 ti->error = "Cannot get origin device";
1279                 goto bad_origin;
1280         }
1281         origin_dev = s->origin->bdev->bd_dev;
1282
1283         cow_path = argv[0];
1284         argv++;
1285         argc--;
1286
1287         cow_dev = dm_get_dev_t(cow_path);
1288         if (cow_dev && cow_dev == origin_dev) {
1289                 ti->error = "COW device cannot be the same as origin device";
1290                 r = -EINVAL;
1291                 goto bad_cow;
1292         }
1293
1294         r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow);
1295         if (r) {
1296                 ti->error = "Cannot get COW device";
1297                 goto bad_cow;
1298         }
1299
1300         r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store);
1301         if (r) {
1302                 ti->error = "Couldn't create exception store";
1303                 r = -EINVAL;
1304                 goto bad_store;
1305         }
1306
1307         argv += args_used;
1308         argc -= args_used;
1309
1310         s->ti = ti;
1311         s->valid = 1;
1312         s->snapshot_overflowed = 0;
1313         s->active = 0;
1314         atomic_set(&s->pending_exceptions_count, 0);
1315         spin_lock_init(&s->pe_allocation_lock);
1316         s->exception_start_sequence = 0;
1317         s->exception_complete_sequence = 0;
1318         s->out_of_order_tree = RB_ROOT;
1319         init_rwsem(&s->lock);
1320         INIT_LIST_HEAD(&s->list);
1321         spin_lock_init(&s->pe_lock);
1322         s->state_bits = 0;
1323         s->merge_failed = false;
1324         s->first_merging_chunk = 0;
1325         s->num_merging_chunks = 0;
1326         bio_list_init(&s->bios_queued_during_merge);
1327
1328         /* Allocate hash table for COW data */
1329         if (init_hash_tables(s)) {
1330                 ti->error = "Unable to allocate hash table space";
1331                 r = -ENOMEM;
1332                 goto bad_hash_tables;
1333         }
1334
1335         init_waitqueue_head(&s->in_progress_wait);
1336
1337         s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1338         if (IS_ERR(s->kcopyd_client)) {
1339                 r = PTR_ERR(s->kcopyd_client);
1340                 ti->error = "Could not create kcopyd client";
1341                 goto bad_kcopyd;
1342         }
1343
1344         r = mempool_init_slab_pool(&s->pending_pool, MIN_IOS, pending_cache);
1345         if (r) {
1346                 ti->error = "Could not allocate mempool for pending exceptions";
1347                 goto bad_pending_pool;
1348         }
1349
1350         for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
1351                 INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);
1352
1353         spin_lock_init(&s->tracked_chunk_lock);
1354
1355         ti->private = s;
1356         ti->num_flush_bios = num_flush_bios;
1357         if (s->discard_zeroes_cow)
1358                 ti->num_discard_bios = (s->discard_passdown_origin ? 2 : 1);
1359         ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk);
1360
1361         /* Add snapshot to the list of snapshots for this origin */
1362         /* Exceptions aren't triggered till snapshot_resume() is called */
1363         r = register_snapshot(s);
1364         if (r == -ENOMEM) {
1365                 ti->error = "Snapshot origin struct allocation failed";
1366                 goto bad_load_and_register;
1367         } else if (r < 0) {
1368                 /* invalid handover, register_snapshot has set ti->error */
1369                 goto bad_load_and_register;
1370         }
1371
1372         /*
1373          * Metadata must only be loaded into one table at once, so skip this
1374          * if metadata will be handed over during resume.
1375          * Chunk size will be set during the handover - set it to zero to
1376          * ensure it's ignored.
1377          */
1378         if (r > 0) {
1379                 s->store->chunk_size = 0;
1380                 return 0;
1381         }
1382
1383         r = s->store->type->read_metadata(s->store, dm_add_exception,
1384                                           (void *)s);
1385         if (r < 0) {
1386                 ti->error = "Failed to read snapshot metadata";
1387                 goto bad_read_metadata;
1388         } else if (r > 0) {
1389                 s->valid = 0;
1390                 DMWARN("Snapshot is marked invalid.");
1391         }
1392
1393         if (!s->store->chunk_size) {
1394                 ti->error = "Chunk size not set";
1395                 r = -EINVAL;
1396                 goto bad_read_metadata;
1397         }
1398
1399         r = dm_set_target_max_io_len(ti, s->store->chunk_size);
1400         if (r)
1401                 goto bad_read_metadata;
1402
1403         return 0;
1404
1405 bad_read_metadata:
1406         unregister_snapshot(s);
1407 bad_load_and_register:
1408         mempool_exit(&s->pending_pool);
1409 bad_pending_pool:
1410         dm_kcopyd_client_destroy(s->kcopyd_client);
1411 bad_kcopyd:
1412         dm_exception_table_exit(&s->pending, pending_cache);
1413         dm_exception_table_exit(&s->complete, exception_cache);
1414 bad_hash_tables:
1415         dm_exception_store_destroy(s->store);
1416 bad_store:
1417         dm_put_device(ti, s->cow);
1418 bad_cow:
1419         dm_put_device(ti, s->origin);
1420 bad_origin:
1421 bad_features:
1422         kfree(s);
1423 bad:
1424         return r;
1425 }
1426
1427 static void __free_exceptions(struct dm_snapshot *s)
1428 {
1429         dm_kcopyd_client_destroy(s->kcopyd_client);
1430         s->kcopyd_client = NULL;
1431
1432         dm_exception_table_exit(&s->pending, pending_cache);
1433         dm_exception_table_exit(&s->complete, exception_cache);
1434 }
1435
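/*
 * Hand the exception table and exception store over from an old snapshot
 * target (snap_src) to its replacement (snap_dest), then invalidate the
 * source so it receives no further I/O.
 */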
1436 static void __handover_exceptions(struct dm_snapshot *snap_src,
1437                                   struct dm_snapshot *snap_dest)
1438 {
1439         union {
1440                 struct dm_exception_table table_swap;
1441                 struct dm_exception_store *store_swap;
1442         } u;
1443
1444         /*
1445          * Swap all snapshot context information between the two instances.
1446          */
1447         u.table_swap = snap_dest->complete;
1448         snap_dest->complete = snap_src->complete;
1449         snap_src->complete = u.table_swap;
1450
1451         u.store_swap = snap_dest->store;
1452         snap_dest->store = snap_src->store;
1453         snap_dest->store->userspace_supports_overflow = u.store_swap->userspace_supports_overflow;
1454         snap_src->store = u.store_swap;
1455
1456         snap_dest->store->snap = snap_dest;
1457         snap_src->store->snap = snap_src;
1458
1459         snap_dest->ti->max_io_len = snap_dest->store->chunk_size;
1460         snap_dest->valid = snap_src->valid;
1461         snap_dest->snapshot_overflowed = snap_src->snapshot_overflowed;
1462
1463         /*
1464          * Set source invalid to ensure it receives no further I/O.
1465          */
1466         snap_src->valid = 0;
1467 }
1468
1469 static void snapshot_dtr(struct dm_target *ti)
1470 {
1471 #ifdef CONFIG_DM_DEBUG
1472         int i;
1473 #endif
1474         struct dm_snapshot *s = ti->private;
1475         struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1476
1477         down_read(&_origins_lock);
1478         /* Check whether exception handover must be cancelled */
1479         (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1480         if (snap_src && snap_dest && (s == snap_src)) {
1481                 down_write(&snap_dest->lock);
1482                 snap_dest->valid = 0;
1483                 up_write(&snap_dest->lock);
1484                 DMERR("Cancelling snapshot handover.");
1485         }
1486         up_read(&_origins_lock);
1487
1488         if (dm_target_is_snapshot_merge(ti))
1489                 stop_merge(s);
1490
1491         /* Prevent further origin writes from using this snapshot. */
1492         /* After this returns there can be no new kcopyd jobs. */
1493         unregister_snapshot(s);
1494
1495         while (atomic_read(&s->pending_exceptions_count))
1496                 msleep(1);
1497         /*
1498          * Ensure instructions in mempool_exit aren't reordered
1499          * before atomic_read.
1500          */
1501         smp_mb();
1502
1503 #ifdef CONFIG_DM_DEBUG
1504         for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
1505                 BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
1506 #endif
1507
1508         __free_exceptions(s);
1509
1510         mempool_exit(&s->pending_pool);
1511
1512         dm_exception_store_destroy(s->store);
1513
1514         dm_put_device(ti, s->cow);
1515
1516         dm_put_device(ti, s->origin);
1517
1518         WARN_ON(s->in_progress);
1519
1520         kfree(s);
1521 }
1522
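/*
 * in_progress counts the chunk copies currently handed to kcopyd.  Once
 * it exceeds cow_threshold, wait_for_in_progress() throttles submitters
 * until enough copies have completed.
 */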
1523 static void account_start_copy(struct dm_snapshot *s)
1524 {
1525         spin_lock(&s->in_progress_wait.lock);
1526         s->in_progress++;
1527         spin_unlock(&s->in_progress_wait.lock);
1528 }
1529
1530 static void account_end_copy(struct dm_snapshot *s)
1531 {
1532         spin_lock(&s->in_progress_wait.lock);
1533         BUG_ON(!s->in_progress);
1534         s->in_progress--;
1535         if (likely(s->in_progress <= cow_threshold) &&
1536             unlikely(waitqueue_active(&s->in_progress_wait)))
1537                 wake_up_locked(&s->in_progress_wait);
1538         spin_unlock(&s->in_progress_wait.lock);
1539 }
1540
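/*
 * Returns false if the caller slept and should retry, true if it may
 * proceed immediately.  If unlock_origins is set, _origins_lock is
 * dropped before sleeping.
 */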
1541 static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins)
1542 {
1543         if (unlikely(s->in_progress > cow_threshold)) {
1544                 spin_lock(&s->in_progress_wait.lock);
1545                 if (likely(s->in_progress > cow_threshold)) {
1546                         /*
1547                          * NOTE: this throttle doesn't account for whether
1548                          * the caller is servicing an IO that will trigger a COW
1549                          * so excess throttling may result for chunks not required
1550                          * to be COW'd.  But if cow_threshold was reached, extra
1551                          * throttling is unlikely to negatively impact performance.
1552                          */
1553                         DECLARE_WAITQUEUE(wait, current);
1554                         __add_wait_queue(&s->in_progress_wait, &wait);
1555                         __set_current_state(TASK_UNINTERRUPTIBLE);
1556                         spin_unlock(&s->in_progress_wait.lock);
1557                         if (unlock_origins)
1558                                 up_read(&_origins_lock);
1559                         io_schedule();
1560                         remove_wait_queue(&s->in_progress_wait, &wait);
1561                         return false;
1562                 }
1563                 spin_unlock(&s->in_progress_wait.lock);
1564         }
1565         return true;
1566 }
1567
1568 /*
1569  * Flush a list of buffers.
1570  */
1571 static void flush_bios(struct bio *bio)
1572 {
1573         struct bio *n;
1574
1575         while (bio) {
1576                 n = bio->bi_next;
1577                 bio->bi_next = NULL;
1578                 submit_bio_noacct(bio);
1579                 bio = n;
1580         }
1581 }
1582
1583 static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit);
1584
1585 /*
1586  * Flush a list of buffers.
1587  */
1588 static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
1589 {
1590         struct bio *n;
1591         int r;
1592
1593         while (bio) {
1594                 n = bio->bi_next;
1595                 bio->bi_next = NULL;
1596                 r = do_origin(s->origin, bio, false);
1597                 if (r == DM_MAPIO_REMAPPED)
1598                         submit_bio_noacct(bio);
1599                 bio = n;
1600         }
1601 }
1602
1603 /*
1604  * Error a list of buffers.
1605  */
1606 static void error_bios(struct bio *bio)
1607 {
1608         struct bio *n;
1609
1610         while (bio) {
1611                 n = bio->bi_next;
1612                 bio->bi_next = NULL;
1613                 bio_io_error(bio);
1614                 bio = n;
1615         }
1616 }
1617
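/*
 * Mark the snapshot invalid, tell the exception store to drop it (if
 * supported) and raise a table event so userspace can notice.  Callers
 * hold s->lock; invalidate_snapshot() below is the locked wrapper.
 */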
1618 static void __invalidate_snapshot(struct dm_snapshot *s, int err)
1619 {
1620         if (!s->valid)
1621                 return;
1622
1623         if (err == -EIO)
1624                 DMERR("Invalidating snapshot: Error reading/writing.");
1625         else if (err == -ENOMEM)
1626                 DMERR("Invalidating snapshot: Unable to allocate exception.");
1627
1628         if (s->store->type->drop_snapshot)
1629                 s->store->type->drop_snapshot(s->store);
1630
1631         s->valid = 0;
1632
1633         dm_table_event(s->ti->table);
1634 }
1635
1636 static void invalidate_snapshot(struct dm_snapshot *s, int err)
1637 {
1638         down_write(&s->lock);
1639         __invalidate_snapshot(s, err);
1640         up_write(&s->lock);
1641 }
1642
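/*
 * Completion callback for a pending exception, run after the chunk has
 * been copied (or the copy failed).  On success the completed exception
 * is inserted into the table so snapshot reads of this chunk go to the
 * COW device; on failure the snapshot is invalidated.  Either way the
 * bios queued on the pending exception are released.
 */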
1643 static void pending_complete(void *context, int success)
1644 {
1645         struct dm_snap_pending_exception *pe = context;
1646         struct dm_exception *e;
1647         struct dm_snapshot *s = pe->snap;
1648         struct bio *origin_bios = NULL;
1649         struct bio *snapshot_bios = NULL;
1650         struct bio *full_bio = NULL;
1651         struct dm_exception_table_lock lock;
1652         int error = 0;
1653
1654         dm_exception_table_lock_init(s, pe->e.old_chunk, &lock);
1655
1656         if (!success) {
1657                 /* Read/write error - snapshot is unusable */
1658                 invalidate_snapshot(s, -EIO);
1659                 error = 1;
1660
1661                 dm_exception_table_lock(&lock);
1662                 goto out;
1663         }
1664
1665         e = alloc_completed_exception(GFP_NOIO);
1666         if (!e) {
1667                 invalidate_snapshot(s, -ENOMEM);
1668                 error = 1;
1669
1670                 dm_exception_table_lock(&lock);
1671                 goto out;
1672         }
1673         *e = pe->e;
1674
1675         down_read(&s->lock);
1676         dm_exception_table_lock(&lock);
1677         if (!s->valid) {
1678                 up_read(&s->lock);
1679                 free_completed_exception(e);
1680                 error = 1;
1681
1682                 goto out;
1683         }
1684
1685         /*
1686          * Add a proper exception. After inserting the completed exception all
1687          * subsequent snapshot reads to this chunk will be redirected to the
1688          * COW device.  This ensures that we do not starve. Moreover, as long
1689          * as the pending exception exists, neither origin writes nor snapshot
1690          * merging can overwrite the chunk in origin.
1691          */
1692         dm_insert_exception(&s->complete, e);
1693         up_read(&s->lock);
1694
1695         /* Wait for conflicting reads to drain */
1696         if (__chunk_is_tracked(s, pe->e.old_chunk)) {
1697                 dm_exception_table_unlock(&lock);
1698                 __check_for_conflicting_io(s, pe->e.old_chunk);
1699                 dm_exception_table_lock(&lock);
1700         }
1701
1702 out:
1703         /* Remove the in-flight exception from the list */
1704         dm_remove_exception(&pe->e);
1705
1706         dm_exception_table_unlock(&lock);
1707
1708         snapshot_bios = bio_list_get(&pe->snapshot_bios);
1709         origin_bios = bio_list_get(&pe->origin_bios);
1710         full_bio = pe->full_bio;
1711         if (full_bio)
1712                 full_bio->bi_end_io = pe->full_bio_end_io;
1713         increment_pending_exceptions_done_count();
1714
1715         /* Submit any pending write bios */
1716         if (error) {
1717                 if (full_bio)
1718                         bio_io_error(full_bio);
1719                 error_bios(snapshot_bios);
1720         } else {
1721                 if (full_bio)
1722                         bio_endio(full_bio);
1723                 flush_bios(snapshot_bios);
1724         }
1725
1726         retry_origin_bios(s, origin_bios);
1727
1728         free_pending_exception(pe);
1729 }
1730
1731 static void complete_exception(struct dm_snap_pending_exception *pe)
1732 {
1733         struct dm_snapshot *s = pe->snap;
1734
1735         /* Update the metadata if we are persistent */
1736         s->store->type->commit_exception(s->store, &pe->e, !pe->copy_error,
1737                                          pending_complete, pe);
1738 }
1739
1740 /*
1741  * Called when the copy I/O has finished.  kcopyd actually runs
1742  * this code so don't block.
1743  */
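/*
 * Exceptions must be committed in allocation order (exception_sequence).
 * Completions that arrive early are parked in s->out_of_order_tree,
 * keyed by sequence number, and drained as soon as the missing
 * predecessor completes.
 */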
1744 static void copy_callback(int read_err, unsigned long write_err, void *context)
1745 {
1746         struct dm_snap_pending_exception *pe = context;
1747         struct dm_snapshot *s = pe->snap;
1748
1749         pe->copy_error = read_err || write_err;
1750
1751         if (pe->exception_sequence == s->exception_complete_sequence) {
1752                 struct rb_node *next;
1753
1754                 s->exception_complete_sequence++;
1755                 complete_exception(pe);
1756
1757                 next = rb_first(&s->out_of_order_tree);
1758                 while (next) {
1759                         pe = rb_entry(next, struct dm_snap_pending_exception,
1760                                         out_of_order_node);
1761                         if (pe->exception_sequence != s->exception_complete_sequence)
1762                                 break;
1763                         next = rb_next(next);
1764                         s->exception_complete_sequence++;
1765                         rb_erase(&pe->out_of_order_node, &s->out_of_order_tree);
1766                         complete_exception(pe);
1767                         cond_resched();
1768                 }
1769         } else {
1770                 struct rb_node *parent = NULL;
1771                 struct rb_node **p = &s->out_of_order_tree.rb_node;
1772                 struct dm_snap_pending_exception *pe2;
1773
1774                 while (*p) {
1775                         pe2 = rb_entry(*p, struct dm_snap_pending_exception, out_of_order_node);
1776                         parent = *p;
1777
1778                         BUG_ON(pe->exception_sequence == pe2->exception_sequence);
1779                         if (pe->exception_sequence < pe2->exception_sequence)
1780                                 p = &((*p)->rb_left);
1781                         else
1782                                 p = &((*p)->rb_right);
1783                 }
1784
1785                 rb_link_node(&pe->out_of_order_node, parent, p);
1786                 rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree);
1787         }
1788         account_end_copy(s);
1789 }
1790
1791 /*
1792  * Dispatches the copy operation to kcopyd.
1793  */
1794 static void start_copy(struct dm_snap_pending_exception *pe)
1795 {
1796         struct dm_snapshot *s = pe->snap;
1797         struct dm_io_region src, dest;
1798         struct block_device *bdev = s->origin->bdev;
1799         sector_t dev_size;
1800
1801         dev_size = get_dev_size(bdev);
1802
1803         src.bdev = bdev;
1804         src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
1805         src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector);
1806
1807         dest.bdev = s->cow->bdev;
1808         dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
1809         dest.count = src.count;
1810
1811         /* Hand over to kcopyd */
1812         account_start_copy(s);
1813         dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
1814 }
1815
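/*
 * Full-bio optimisation: when a write covers an entire chunk there is no
 * need to copy the old data first, so the bio itself is written to the
 * COW device and its completion is fed back into the kcopyd callback
 * machinery via dm_kcopyd_do_callback().
 */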
1816 static void full_bio_end_io(struct bio *bio)
1817 {
1818         void *callback_data = bio->bi_private;
1819
1820         dm_kcopyd_do_callback(callback_data, 0, bio->bi_status ? 1 : 0);
1821 }
1822
1823 static void start_full_bio(struct dm_snap_pending_exception *pe,
1824                            struct bio *bio)
1825 {
1826         struct dm_snapshot *s = pe->snap;
1827         void *callback_data;
1828
1829         pe->full_bio = bio;
1830         pe->full_bio_end_io = bio->bi_end_io;
1831
1832         account_start_copy(s);
1833         callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
1834                                                    copy_callback, pe);
1835
1836         bio->bi_end_io = full_bio_end_io;
1837         bio->bi_private = callback_data;
1838
1839         submit_bio_noacct(bio);
1840 }
1841
1842 static struct dm_snap_pending_exception *
1843 __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
1844 {
1845         struct dm_exception *e = dm_lookup_exception(&s->pending, chunk);
1846
1847         if (!e)
1848                 return NULL;
1849
1850         return container_of(e, struct dm_snap_pending_exception, e);
1851 }
1852
1853 /*
1854  * Inserts a pending exception into the pending table.
1855  *
1856  * NOTE: a write lock must be held on the chunk's pending exception table slot
1857  * before calling this.
1858  */
1859 static struct dm_snap_pending_exception *
1860 __insert_pending_exception(struct dm_snapshot *s,
1861                            struct dm_snap_pending_exception *pe, chunk_t chunk)
1862 {
1863         pe->e.old_chunk = chunk;
1864         bio_list_init(&pe->origin_bios);
1865         bio_list_init(&pe->snapshot_bios);
1866         pe->started = 0;
1867         pe->full_bio = NULL;
1868
1869         spin_lock(&s->pe_allocation_lock);
1870         if (s->store->type->prepare_exception(s->store, &pe->e)) {
1871                 spin_unlock(&s->pe_allocation_lock);
1872                 free_pending_exception(pe);
1873                 return NULL;
1874         }
1875
1876         pe->exception_sequence = s->exception_start_sequence++;
1877         spin_unlock(&s->pe_allocation_lock);
1878
1879         dm_insert_exception(&s->pending, &pe->e);
1880
1881         return pe;
1882 }
1883
1884 /*
1885  * Looks to see if this snapshot already has a pending exception
1886  * for this chunk, otherwise it allocates a new one and inserts
1887  * it into the pending table.
1888  *
1889  * NOTE: a write lock must be held on the chunk's pending exception table slot
1890  * before calling this.
1891  */
1892 static struct dm_snap_pending_exception *
1893 __find_pending_exception(struct dm_snapshot *s,
1894                          struct dm_snap_pending_exception *pe, chunk_t chunk)
1895 {
1896         struct dm_snap_pending_exception *pe2;
1897
1898         pe2 = __lookup_pending_exception(s, chunk);
1899         if (pe2) {
1900                 free_pending_exception(pe);
1901                 return pe2;
1902         }
1903
1904         return __insert_pending_exception(s, pe, chunk);
1905 }
1906
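/*
 * Redirect a bio to the COW device, at the sector corresponding to the
 * exception's new chunk, preserving the offset within the chunk.
 */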
1907 static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
1908                             struct bio *bio, chunk_t chunk)
1909 {
1910         bio_set_dev(bio, s->cow->bdev);
1911         bio->bi_iter.bi_sector =
1912                 chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) +
1913                                 (chunk - e->old_chunk)) +
1914                 (bio->bi_iter.bi_sector & s->store->chunk_mask);
1915 }
1916
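/*
 * Discard support (discard_zeroes_cow): a discard that hits a chunk
 * which already has an exception is turned into a kcopyd zeroing of the
 * corresponding area of the COW device.
 */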
1917 static void zero_callback(int read_err, unsigned long write_err, void *context)
1918 {
1919         struct bio *bio = context;
1920         struct dm_snapshot *s = bio->bi_private;
1921
1922         account_end_copy(s);
1923         bio->bi_status = write_err ? BLK_STS_IOERR : 0;
1924         bio_endio(bio);
1925 }
1926
1927 static void zero_exception(struct dm_snapshot *s, struct dm_exception *e,
1928                            struct bio *bio, chunk_t chunk)
1929 {
1930         struct dm_io_region dest;
1931
1932         dest.bdev = s->cow->bdev;
1933         dest.sector = bio->bi_iter.bi_sector;
1934         dest.count = s->store->chunk_size;
1935
1936         account_start_copy(s);
1937         WARN_ON_ONCE(bio->bi_private);
1938         bio->bi_private = s;
1939         dm_kcopyd_zero(s->kcopyd_client, 1, &dest, 0, zero_callback, bio);
1940 }
1941
1942 static bool io_overlaps_chunk(struct dm_snapshot *s, struct bio *bio)
1943 {
1944         return bio->bi_iter.bi_size ==
1945                 (s->store->chunk_size << SECTOR_SHIFT);
1946 }
1947
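/*
 * Map function for the "snapshot" target.  Reads of unremapped chunks
 * are passed through to the origin; reads of remapped chunks and all
 * writes go to the COW device.  A write to an unremapped chunk first
 * allocates a pending exception and starts the copy-out (or writes the
 * chunk directly if the bio covers all of it).
 */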
1948 static int snapshot_map(struct dm_target *ti, struct bio *bio)
1949 {
1950         struct dm_exception *e;
1951         struct dm_snapshot *s = ti->private;
1952         int r = DM_MAPIO_REMAPPED;
1953         chunk_t chunk;
1954         struct dm_snap_pending_exception *pe = NULL;
1955         struct dm_exception_table_lock lock;
1956
1957         init_tracked_chunk(bio);
1958
1959         if (bio->bi_opf & REQ_PREFLUSH) {
1960                 bio_set_dev(bio, s->cow->bdev);
1961                 return DM_MAPIO_REMAPPED;
1962         }
1963
1964         chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
1965         dm_exception_table_lock_init(s, chunk, &lock);
1966
1967         /* Full snapshots are not usable */
1968         /* To get here the table must be live so s->active is always set. */
1969         if (!s->valid)
1970                 return DM_MAPIO_KILL;
1971
1972         if (bio_data_dir(bio) == WRITE) {
1973                 while (unlikely(!wait_for_in_progress(s, false)))
1974                         ; /* wait_for_in_progress() has slept */
1975         }
1976
1977         down_read(&s->lock);
1978         dm_exception_table_lock(&lock);
1979
1980         if (!s->valid || (unlikely(s->snapshot_overflowed) &&
1981             bio_data_dir(bio) == WRITE)) {
1982                 r = DM_MAPIO_KILL;
1983                 goto out_unlock;
1984         }
1985
1986         if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
1987                 if (s->discard_passdown_origin && dm_bio_get_target_bio_nr(bio)) {
1988                         /*
1989                          * passdown discard to origin (without triggering
1990                          * snapshot exceptions via do_origin; doing so would
1991                          * defeat the goal of freeing space in origin that is
1992                          * implied by the "discard_passdown_origin" feature)
1993                          */
1994                         bio_set_dev(bio, s->origin->bdev);
1995                         track_chunk(s, bio, chunk);
1996                         goto out_unlock;
1997                 }
1998                 /* discard to snapshot (target_bio_nr == 0) zeroes exceptions */
1999         }
2000
2001         /* If the block is already remapped - use that, else remap it */
2002         e = dm_lookup_exception(&s->complete, chunk);
2003         if (e) {
2004                 remap_exception(s, e, bio, chunk);
2005                 if (unlikely(bio_op(bio) == REQ_OP_DISCARD) &&
2006                     io_overlaps_chunk(s, bio)) {
2007                         dm_exception_table_unlock(&lock);
2008                         up_read(&s->lock);
2009                         zero_exception(s, e, bio, chunk);
2010                         r = DM_MAPIO_SUBMITTED; /* discard is not issued */
2011                         goto out;
2012                 }
2013                 goto out_unlock;
2014         }
2015
2016         if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
2017                 /*
2018                  * If no exception exists, complete the discard immediately;
2019                  * otherwise it would trigger a copy-out.
2020                  */
2021                 bio_endio(bio);
2022                 r = DM_MAPIO_SUBMITTED;
2023                 goto out_unlock;
2024         }
2025
2026         /*
2027          * Write to snapshot - higher level takes care of RW/RO
2028          * flags so we should only get this if we are
2029          * writeable.
2030          */
2031         if (bio_data_dir(bio) == WRITE) {
2032                 pe = __lookup_pending_exception(s, chunk);
2033                 if (!pe) {
2034                         dm_exception_table_unlock(&lock);
2035                         pe = alloc_pending_exception(s);
2036                         dm_exception_table_lock(&lock);
2037
2038                         e = dm_lookup_exception(&s->complete, chunk);
2039                         if (e) {
2040                                 free_pending_exception(pe);
2041                                 remap_exception(s, e, bio, chunk);
2042                                 goto out_unlock;
2043                         }
2044
2045                         pe = __find_pending_exception(s, pe, chunk);
2046                         if (!pe) {
2047                                 dm_exception_table_unlock(&lock);
2048                                 up_read(&s->lock);
2049
2050                                 down_write(&s->lock);
2051
2052                                 if (s->store->userspace_supports_overflow) {
2053                                         if (s->valid && !s->snapshot_overflowed) {
2054                                                 s->snapshot_overflowed = 1;
2055                                                 DMERR("Snapshot overflowed: Unable to allocate exception.");
2056                                         }
2057                                 } else
2058                                         __invalidate_snapshot(s, -ENOMEM);
2059                                 up_write(&s->lock);
2060
2061                                 r = DM_MAPIO_KILL;
2062                                 goto out;
2063                         }
2064                 }
2065
2066                 remap_exception(s, &pe->e, bio, chunk);
2067
2068                 r = DM_MAPIO_SUBMITTED;
2069
2070                 if (!pe->started && io_overlaps_chunk(s, bio)) {
2071                         pe->started = 1;
2072
2073                         dm_exception_table_unlock(&lock);
2074                         up_read(&s->lock);
2075
2076                         start_full_bio(pe, bio);
2077                         goto out;
2078                 }
2079
2080                 bio_list_add(&pe->snapshot_bios, bio);
2081
2082                 if (!pe->started) {
2083                         /* this is protected by the exception table lock */
2084                         pe->started = 1;
2085
2086                         dm_exception_table_unlock(&lock);
2087                         up_read(&s->lock);
2088
2089                         start_copy(pe);
2090                         goto out;
2091                 }
2092         } else {
2093                 bio_set_dev(bio, s->origin->bdev);
2094                 track_chunk(s, bio, chunk);
2095         }
2096
2097 out_unlock:
2098         dm_exception_table_unlock(&lock);
2099         up_read(&s->lock);
2100 out:
2101         return r;
2102 }
2103
2104 /*
2105  * A snapshot-merge target behaves like a combination of a snapshot
2106  * target and a snapshot-origin target.  It only generates new
2107  * exceptions in other snapshots and not in the one that is being
2108  * merged.
2109  *
2110  * For each chunk, if there is an existing exception, it is used to
2111  * redirect I/O to the cow device.  Otherwise I/O is sent to the origin,
2112  * which in turn might generate exceptions in other snapshots.
2113  * If merging is currently taking place on the chunk in question, the
2114  * I/O is deferred by adding it to s->bios_queued_during_merge.
2115  */
2116 static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
2117 {
2118         struct dm_exception *e;
2119         struct dm_snapshot *s = ti->private;
2120         int r = DM_MAPIO_REMAPPED;
2121         chunk_t chunk;
2122
2123         init_tracked_chunk(bio);
2124
2125         if (bio->bi_opf & REQ_PREFLUSH) {
2126                 if (!dm_bio_get_target_bio_nr(bio))
2127                         bio_set_dev(bio, s->origin->bdev);
2128                 else
2129                         bio_set_dev(bio, s->cow->bdev);
2130                 return DM_MAPIO_REMAPPED;
2131         }
2132
2133         if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
2134                 /* Once merging has started, discards no longer effect change */
2135                 bio_endio(bio);
2136                 return DM_MAPIO_SUBMITTED;
2137         }
2138
2139         chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
2140
2141         down_write(&s->lock);
2142
2143         /* Full merging snapshots are redirected to the origin */
2144         if (!s->valid)
2145                 goto redirect_to_origin;
2146
2147         /* If the block is already remapped - use that */
2148         e = dm_lookup_exception(&s->complete, chunk);
2149         if (e) {
2150                 /* Queue writes overlapping with chunks being merged */
2151                 if (bio_data_dir(bio) == WRITE &&
2152                     chunk >= s->first_merging_chunk &&
2153                     chunk < (s->first_merging_chunk +
2154                              s->num_merging_chunks)) {
2155                         bio_set_dev(bio, s->origin->bdev);
2156                         bio_list_add(&s->bios_queued_during_merge, bio);
2157                         r = DM_MAPIO_SUBMITTED;
2158                         goto out_unlock;
2159                 }
2160
2161                 remap_exception(s, e, bio, chunk);
2162
2163                 if (bio_data_dir(bio) == WRITE)
2164                         track_chunk(s, bio, chunk);
2165                 goto out_unlock;
2166         }
2167
2168 redirect_to_origin:
2169         bio_set_dev(bio, s->origin->bdev);
2170
2171         if (bio_data_dir(bio) == WRITE) {
2172                 up_write(&s->lock);
2173                 return do_origin(s->origin, bio, false);
2174         }
2175
2176 out_unlock:
2177         up_write(&s->lock);
2178
2179         return r;
2180 }
2181
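/* Drop the chunk-tracking reference taken by track_chunk(), if any. */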
2182 static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
2183                 blk_status_t *error)
2184 {
2185         struct dm_snapshot *s = ti->private;
2186
2187         if (is_bio_tracked(bio))
2188                 stop_tracking_chunk(s, bio);
2189
2190         return DM_ENDIO_DONE;
2191 }
2192
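/* Stop any merge in progress before the table is suspended. */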
2193 static void snapshot_merge_presuspend(struct dm_target *ti)
2194 {
2195         struct dm_snapshot *s = ti->private;
2196
2197         stop_merge(s);
2198 }
2199
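/*
 * Guard the exception handover: the handover source may not be resumed
 * before the handover completes, and the destination may only resume
 * once the source has been suspended.
 */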
2200 static int snapshot_preresume(struct dm_target *ti)
2201 {
2202         int r = 0;
2203         struct dm_snapshot *s = ti->private;
2204         struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
2205
2206         down_read(&_origins_lock);
2207         (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
2208         if (snap_src && snap_dest) {
2209                 down_read(&snap_src->lock);
2210                 if (s == snap_src) {
2211                         DMERR("Unable to resume snapshot source until "
2212                               "handover completes.");
2213                         r = -EINVAL;
2214                 } else if (!dm_suspended(snap_src->ti)) {
2215                         DMERR("Unable to perform snapshot handover until "
2216                               "source is suspended.");
2217                         r = -EINVAL;
2218                 }
2219                 up_read(&snap_src->lock);
2220         }
2221         up_read(&_origins_lock);
2222
2223         return r;
2224 }
2225
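/*
 * Resume the snapshot.  If another snapshot of the same origin shares
 * our COW device, take over its exception tables (handover), briefly
 * suspending the origin device (and pausing any running merge) where
 * necessary so that no new exceptions are created while the tables are
 * swapped.
 */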
2226 static void snapshot_resume(struct dm_target *ti)
2227 {
2228         struct dm_snapshot *s = ti->private;
2229         struct dm_snapshot *snap_src = NULL, *snap_dest = NULL, *snap_merging = NULL;
2230         struct dm_origin *o;
2231         struct mapped_device *origin_md = NULL;
2232         bool must_restart_merging = false;
2233
2234         down_read(&_origins_lock);
2235
2236         o = __lookup_dm_origin(s->origin->bdev);
2237         if (o)
2238                 origin_md = dm_table_get_md(o->ti->table);
2239         if (!origin_md) {
2240                 (void) __find_snapshots_sharing_cow(s, NULL, NULL, &snap_merging);
2241                 if (snap_merging)
2242                         origin_md = dm_table_get_md(snap_merging->ti->table);
2243         }
2244         if (origin_md == dm_table_get_md(ti->table))
2245                 origin_md = NULL;
2246         if (origin_md) {
2247                 if (dm_hold(origin_md))
2248                         origin_md = NULL;
2249         }
2250
2251         up_read(&_origins_lock);
2252
2253         if (origin_md) {
2254                 dm_internal_suspend_fast(origin_md);
2255                 if (snap_merging && test_bit(RUNNING_MERGE, &snap_merging->state_bits)) {
2256                         must_restart_merging = true;
2257                         stop_merge(snap_merging);
2258                 }
2259         }
2260
2261         down_read(&_origins_lock);
2262
2263         (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
2264         if (snap_src && snap_dest) {
2265                 down_write(&snap_src->lock);
2266                 down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
2267                 __handover_exceptions(snap_src, snap_dest);
2268                 up_write(&snap_dest->lock);
2269                 up_write(&snap_src->lock);
2270         }
2271
2272         up_read(&_origins_lock);
2273
2274         if (origin_md) {
2275                 if (must_restart_merging)
2276                         start_merge(snap_merging);
2277                 dm_internal_resume_fast(origin_md);
2278                 dm_put(origin_md);
2279         }
2280
2281         /* Now we have correct chunk size, reregister */
2282         reregister_snapshot(s);
2283
2284         down_write(&s->lock);
2285         s->active = 1;
2286         up_write(&s->lock);
2287 }
2288
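/* Smallest chunk size (in sectors) of the snapshots attached to this origin. */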
2289 static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
2290 {
2291         uint32_t min_chunksize;
2292
2293         down_read(&_origins_lock);
2294         min_chunksize = __minimum_chunk_size(__lookup_origin(bdev));
2295         up_read(&_origins_lock);
2296
2297         return min_chunksize;
2298 }
2299
2300 static void snapshot_merge_resume(struct dm_target *ti)
2301 {
2302         struct dm_snapshot *s = ti->private;
2303
2304         /*
2305          * Handover exceptions from existing snapshot.
2306          */
2307         snapshot_resume(ti);
2308
2309         /*
2310          * snapshot-merge acts as an origin, so set ti->max_io_len
2311          */
2312         ti->max_io_len = get_origin_minimum_chunksize(s->origin->bdev);
2313
2314         start_merge(s);
2315 }
2316
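/*
 * STATUSTYPE_INFO reports "<sectors_allocated>/<total_sectors> <metadata_sectors>"
 * (or Invalid / Merge failed / Overflow).  STATUSTYPE_TABLE reproduces
 * the constructor arguments, including any discard feature flags.
 */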
2317 static void snapshot_status(struct dm_target *ti, status_type_t type,
2318                             unsigned status_flags, char *result, unsigned maxlen)
2319 {
2320         unsigned sz = 0;
2321         struct dm_snapshot *snap = ti->private;
2322         unsigned num_features;
2323
2324         switch (type) {
2325         case STATUSTYPE_INFO:
2326
2327                 down_write(&snap->lock);
2328
2329                 if (!snap->valid)
2330                         DMEMIT("Invalid");
2331                 else if (snap->merge_failed)
2332                         DMEMIT("Merge failed");
2333                 else if (snap->snapshot_overflowed)
2334                         DMEMIT("Overflow");
2335                 else {
2336                         if (snap->store->type->usage) {
2337                                 sector_t total_sectors, sectors_allocated,
2338                                          metadata_sectors;
2339                                 snap->store->type->usage(snap->store,
2340                                                          &total_sectors,
2341                                                          &sectors_allocated,
2342                                                          &metadata_sectors);
2343                                 DMEMIT("%llu/%llu %llu",
2344                                        (unsigned long long)sectors_allocated,
2345                                        (unsigned long long)total_sectors,
2346                                        (unsigned long long)metadata_sectors);
2347                         }
2348                         else
2349                                 DMEMIT("Unknown");
2350                 }
2351
2352                 up_write(&snap->lock);
2353
2354                 break;
2355
2356         case STATUSTYPE_TABLE:
2357                 /*
2358                  * Emit the table line: the origin and COW device
2359                  * names, the exception store's own arguments and
2360                  * any optional discard feature flags.
2361                  */
2362                 DMEMIT("%s %s", snap->origin->name, snap->cow->name);
2363                 sz += snap->store->type->status(snap->store, type, result + sz,
2364                                                 maxlen - sz);
2365                 num_features = snap->discard_zeroes_cow + snap->discard_passdown_origin;
2366                 if (num_features) {
2367                         DMEMIT(" %u", num_features);
2368                         if (snap->discard_zeroes_cow)
2369                                 DMEMIT(" discard_zeroes_cow");
2370                         if (snap->discard_passdown_origin)
2371                                 DMEMIT(" discard_passdown_origin");
2372                 }
2373                 break;
2374
2375         case STATUSTYPE_IMA:
2376                 DMEMIT_TARGET_NAME_VERSION(ti->type);
2377                 DMEMIT(",snap_origin_name=%s", snap->origin->name);
2378                 DMEMIT(",snap_cow_name=%s", snap->cow->name);
2379                 DMEMIT(",snap_valid=%c", snap->valid ? 'y' : 'n');
2380                 DMEMIT(",snap_merge_failed=%c", snap->merge_failed ? 'y' : 'n');
2381                 DMEMIT(",snapshot_overflowed=%c", snap->snapshot_overflowed ? 'y' : 'n');
2382                 DMEMIT(";");
2383                 break;
2384         }
2385 }
2386
2387 static int snapshot_iterate_devices(struct dm_target *ti,
2388                                     iterate_devices_callout_fn fn, void *data)
2389 {
2390         struct dm_snapshot *snap = ti->private;
2391         int r;
2392
2393         r = fn(ti, snap->origin, 0, ti->len, data);
2394
2395         if (!r)
2396                 r = fn(ti, snap->cow, 0, get_dev_size(snap->cow->bdev), data);
2397
2398         return r;
2399 }
2400
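/*
 * With discard_zeroes_cow enabled, limit discards to a single chunk and
 * align them to chunk boundaries so every discard maps onto exactly one
 * exception.
 */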
2401 static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits)
2402 {
2403         struct dm_snapshot *snap = ti->private;
2404
2405         if (snap->discard_zeroes_cow) {
2406                 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
2407
2408                 down_read(&_origins_lock);
2409
2410                 (void) __find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, NULL);
2411                 if (snap_src && snap_dest)
2412                         snap = snap_src;
2413
2414                 /* All discards are split on chunk_size boundary */
2415                 limits->discard_granularity = snap->store->chunk_size;
2416                 limits->max_discard_sectors = snap->store->chunk_size;
2417
2418                 up_read(&_origins_lock);
2419         }
2420 }
2421
2422 /*-----------------------------------------------------------------
2423  * Origin methods
2424  *---------------------------------------------------------------*/
2425
2426 /*
2427  * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any
2428  * supplied bio was ignored.  The caller may submit it immediately.
2429  * (No remapping actually occurs as the origin is always a direct linear
2430  * map.)
2431  *
2432  * If further exceptions are required, DM_MAPIO_SUBMITTED is returned
2433  * and any supplied bio is added to a list to be submitted once all
2434  * the necessary exceptions exist.
2435  */
2436 static int __origin_write(struct list_head *snapshots, sector_t sector,
2437                           struct bio *bio)
2438 {
2439         int r = DM_MAPIO_REMAPPED;
2440         struct dm_snapshot *snap;
2441         struct dm_exception *e;
2442         struct dm_snap_pending_exception *pe, *pe2;
2443         struct dm_snap_pending_exception *pe_to_start_now = NULL;
2444         struct dm_snap_pending_exception *pe_to_start_last = NULL;
2445         struct dm_exception_table_lock lock;
2446         chunk_t chunk;
2447
2448         /* Do all the snapshots on this origin */
2449         list_for_each_entry (snap, snapshots, list) {
2450                 /*
2451                  * Don't make new exceptions in a merging snapshot
2452                  * because it has effectively been deleted
2453                  */
2454                 if (dm_target_is_snapshot_merge(snap->ti))
2455                         continue;
2456
2457                 /* Nothing to do if writing beyond end of snapshot */
2458                 if (sector >= dm_table_get_size(snap->ti->table))
2459                         continue;
2460
2461                 /*
2462                  * Remember, different snapshots can have
2463                  * different chunk sizes.
2464                  */
2465                 chunk = sector_to_chunk(snap->store, sector);
2466                 dm_exception_table_lock_init(snap, chunk, &lock);
2467
2468                 down_read(&snap->lock);
2469                 dm_exception_table_lock(&lock);
2470
2471                 /* Only deal with valid and active snapshots */
2472                 if (!snap->valid || !snap->active)
2473                         goto next_snapshot;
2474
2475                 pe = __lookup_pending_exception(snap, chunk);
2476                 if (!pe) {
2477                         /*
2478                          * Check exception table to see if block is already
2479                          * remapped in this snapshot and trigger an exception
2480                          * if not.
2481                          */
2482                         e = dm_lookup_exception(&snap->complete, chunk);
2483                         if (e)
2484                                 goto next_snapshot;
2485
2486                         dm_exception_table_unlock(&lock);
2487                         pe = alloc_pending_exception(snap);
2488                         dm_exception_table_lock(&lock);
2489
2490                         pe2 = __lookup_pending_exception(snap, chunk);
2491
2492                         if (!pe2) {
2493                                 e = dm_lookup_exception(&snap->complete, chunk);
2494                                 if (e) {
2495                                         free_pending_exception(pe);
2496                                         goto next_snapshot;
2497                                 }
2498
2499                                 pe = __insert_pending_exception(snap, pe, chunk);
2500                                 if (!pe) {
2501                                         dm_exception_table_unlock(&lock);
2502                                         up_read(&snap->lock);
2503
2504                                         invalidate_snapshot(snap, -ENOMEM);
2505                                         continue;
2506                                 }
2507                         } else {
2508                                 free_pending_exception(pe);
2509                                 pe = pe2;
2510                         }
2511                 }
2512
2513                 r = DM_MAPIO_SUBMITTED;
2514
2515                 /*
2516                  * If an origin bio was supplied, queue it to wait for the
2517                  * completion of this exception, and start this one last,
2518                  * at the end of the function.
2519                  */
2520                 if (bio) {
2521                         bio_list_add(&pe->origin_bios, bio);
2522                         bio = NULL;
2523
2524                         if (!pe->started) {
2525                                 pe->started = 1;
2526                                 pe_to_start_last = pe;
2527                         }
2528                 }
2529
2530                 if (!pe->started) {
2531                         pe->started = 1;
2532                         pe_to_start_now = pe;
2533                 }
2534
2535 next_snapshot:
2536                 dm_exception_table_unlock(&lock);
2537                 up_read(&snap->lock);
2538
2539                 if (pe_to_start_now) {
2540                         start_copy(pe_to_start_now);
2541                         pe_to_start_now = NULL;
2542                 }
2543         }
2544
2545         /*
2546          * Submit the exception against which the bio is queued last,
2547          * to give the other exceptions a head start.
2548          */
2549         if (pe_to_start_last)
2550                 start_copy(pe_to_start_last);
2551
2552         return r;
2553 }
2554
2555 /*
2556  * Called on a write from the origin driver.
2557  */
2558 static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit)
2559 {
2560         struct origin *o;
2561         int r = DM_MAPIO_REMAPPED;
2562
2563 again:
2564         down_read(&_origins_lock);
2565         o = __lookup_origin(origin->bdev);
2566         if (o) {
2567                 if (limit) {
2568                         struct dm_snapshot *s;
2569                         list_for_each_entry(s, &o->snapshots, list)
2570                                 if (unlikely(!wait_for_in_progress(s, true)))
2571                                         goto again;
2572                 }
2573
2574                 r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
2575         }
2576         up_read(&_origins_lock);
2577
2578         return r;
2579 }
2580
2581 /*
2582  * Trigger exceptions in all non-merging snapshots.
2583  *
2584  * The chunk size of the merging snapshot may be larger than the chunk
2585  * size of some other snapshot so we may need to reallocate multiple
2586  * chunks in other snapshots.
2587  *
2588  * We scan all the overlapping exceptions in the other snapshots.
2589  * Returns 1 if anything was reallocated and must be waited for,
2590  * otherwise returns 0.
2591  *
2592  * size must be a multiple of merging_snap's chunk_size.
2593  */
2594 static int origin_write_extent(struct dm_snapshot *merging_snap,
2595                                sector_t sector, unsigned size)
2596 {
2597         int must_wait = 0;
2598         sector_t n;
2599         struct origin *o;
2600
2601         /*
2602          * The origin's __minimum_chunk_size() got stored in max_io_len
2603          * by snapshot_merge_resume().
2604          */
2605         down_read(&_origins_lock);
2606         o = __lookup_origin(merging_snap->origin->bdev);
2607         for (n = 0; n < size; n += merging_snap->ti->max_io_len)
2608                 if (__origin_write(&o->snapshots, sector + n, NULL) ==
2609                     DM_MAPIO_SUBMITTED)
2610                         must_wait = 1;
2611         up_read(&_origins_lock);
2612
2613         return must_wait;
2614 }
2615
2616 /*
2617  * Origin: maps a linear range of a device, with hooks for snapshotting.
2618  */
2619
2620 /*
2621  * Construct an origin mapping: <dev_path>
2622  * The context for an origin is merely a 'struct dm_dev *'
2623  * pointing to the real device.
2624  */
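/*
 * Example table line, "<start> <length> snapshot-origin <dev_path>"
 * (device name and length below are illustrative only):
 *
 *   dmsetup create base-origin --table "0 2097152 snapshot-origin /dev/vg0/base"
 */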
2625 static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
2626 {
2627         int r;
2628         struct dm_origin *o;
2629
2630         if (argc != 1) {
2631                 ti->error = "origin: incorrect number of arguments";
2632                 return -EINVAL;
2633         }
2634
2635         o = kmalloc(sizeof(struct dm_origin), GFP_KERNEL);
2636         if (!o) {
2637                 ti->error = "Cannot allocate private origin structure";
2638                 r = -ENOMEM;
2639                 goto bad_alloc;
2640         }
2641
2642         r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &o->dev);
2643         if (r) {
2644                 ti->error = "Cannot get target device";
2645                 goto bad_open;
2646         }
2647
2648         o->ti = ti;
2649         ti->private = o;
2650         ti->num_flush_bios = 1;
2651
2652         return 0;
2653
2654 bad_open:
2655         kfree(o);
2656 bad_alloc:
2657         return r;
2658 }
2659
2660 static void origin_dtr(struct dm_target *ti)
2661 {
2662         struct dm_origin *o = ti->private;
2663
2664         dm_put_device(ti, o->dev);
2665         kfree(o);
2666 }
2667
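/*
 * Map function for the "snapshot-origin" target: pass the bio straight
 * through to the underlying device, but split writes at the minimum
 * snapshot chunk boundary and give every snapshot of this origin a
 * chance to copy out the affected chunk first.
 */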
2668 static int origin_map(struct dm_target *ti, struct bio *bio)
2669 {
2670         struct dm_origin *o = ti->private;
2671         unsigned available_sectors;
2672
2673         bio_set_dev(bio, o->dev->bdev);
2674
2675         if (unlikely(bio->bi_opf & REQ_PREFLUSH))
2676                 return DM_MAPIO_REMAPPED;
2677
2678         if (bio_data_dir(bio) != WRITE)
2679                 return DM_MAPIO_REMAPPED;
2680
2681         available_sectors = o->split_boundary -
2682                 ((unsigned)bio->bi_iter.bi_sector & (o->split_boundary - 1));
2683
2684         if (bio_sectors(bio) > available_sectors)
2685                 dm_accept_partial_bio(bio, available_sectors);
2686
2687         /* Only tell snapshots if this is a write */
2688         return do_origin(o->dev, bio, true);
2689 }
2690
2691 /*
2692  * Set the target "max_io_len" field to the minimum of all the snapshots'
2693  * chunk sizes.
2694  */
2695 static void origin_resume(struct dm_target *ti)
2696 {
2697         struct dm_origin *o = ti->private;
2698
2699         o->split_boundary = get_origin_minimum_chunksize(o->dev->bdev);
2700
2701         down_write(&_origins_lock);
2702         __insert_dm_origin(o);
2703         up_write(&_origins_lock);
2704 }
2705
2706 static void origin_postsuspend(struct dm_target *ti)
2707 {
2708         struct dm_origin *o = ti->private;
2709
2710         down_write(&_origins_lock);
2711         __remove_dm_origin(o);
2712         up_write(&_origins_lock);
2713 }
2714
2715 static void origin_status(struct dm_target *ti, status_type_t type,
2716                           unsigned status_flags, char *result, unsigned maxlen)
2717 {
2718         struct dm_origin *o = ti->private;
2719
2720         switch (type) {
2721         case STATUSTYPE_INFO:
2722                 result[0] = '\0';
2723                 break;
2724
2725         case STATUSTYPE_TABLE:
2726                 snprintf(result, maxlen, "%s", o->dev->name);
2727                 break;
2728         case STATUSTYPE_IMA:
2729                 result[0] = '\0';
2730                 break;
2731         }
2732 }
2733
2734 static int origin_iterate_devices(struct dm_target *ti,
2735                                   iterate_devices_callout_fn fn, void *data)
2736 {
2737         struct dm_origin *o = ti->private;
2738
2739         return fn(ti, o->dev, 0, ti->len, data);
2740 }
2741
2742 static struct target_type origin_target = {
2743         .name    = "snapshot-origin",
2744         .version = {1, 9, 0},
2745         .module  = THIS_MODULE,
2746         .ctr     = origin_ctr,
2747         .dtr     = origin_dtr,
2748         .map     = origin_map,
2749         .resume  = origin_resume,
2750         .postsuspend = origin_postsuspend,
2751         .status  = origin_status,
2752         .iterate_devices = origin_iterate_devices,
2753 };
2754
2755 static struct target_type snapshot_target = {
2756         .name    = "snapshot",
2757         .version = {1, 16, 0},
2758         .module  = THIS_MODULE,
2759         .ctr     = snapshot_ctr,
2760         .dtr     = snapshot_dtr,
2761         .map     = snapshot_map,
2762         .end_io  = snapshot_end_io,
2763         .preresume  = snapshot_preresume,
2764         .resume  = snapshot_resume,
2765         .status  = snapshot_status,
2766         .iterate_devices = snapshot_iterate_devices,
2767         .io_hints = snapshot_io_hints,
2768 };
2769
2770 static struct target_type merge_target = {
2771         .name    = dm_snapshot_merge_target_name,
2772         .version = {1, 5, 0},
2773         .module  = THIS_MODULE,
2774         .ctr     = snapshot_ctr,
2775         .dtr     = snapshot_dtr,
2776         .map     = snapshot_merge_map,
2777         .end_io  = snapshot_end_io,
2778         .presuspend = snapshot_merge_presuspend,
2779         .preresume  = snapshot_preresume,
2780         .resume  = snapshot_merge_resume,
2781         .status  = snapshot_status,
2782         .iterate_devices = snapshot_iterate_devices,
2783         .io_hints = snapshot_io_hints,
2784 };
2785
2786 static int __init dm_snapshot_init(void)
2787 {
2788         int r;
2789
2790         r = dm_exception_store_init();
2791         if (r) {
2792                 DMERR("Failed to initialize exception stores");
2793                 return r;
2794         }
2795
2796         r = init_origin_hash();
2797         if (r) {
2798                 DMERR("init_origin_hash failed.");
2799                 goto bad_origin_hash;
2800         }
2801
2802         exception_cache = KMEM_CACHE(dm_exception, 0);
2803         if (!exception_cache) {
2804                 DMERR("Couldn't create exception cache.");
2805                 r = -ENOMEM;
2806                 goto bad_exception_cache;
2807         }
2808
2809         pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0);
2810         if (!pending_cache) {
2811                 DMERR("Couldn't create pending cache.");
2812                 r = -ENOMEM;
2813                 goto bad_pending_cache;
2814         }
2815
2816         r = dm_register_target(&snapshot_target);
2817         if (r < 0) {
2818                 DMERR("snapshot target register failed %d", r);
2819                 goto bad_register_snapshot_target;
2820         }
2821
2822         r = dm_register_target(&origin_target);
2823         if (r < 0) {
2824                 DMERR("Origin target register failed %d", r);
2825                 goto bad_register_origin_target;
2826         }
2827
2828         r = dm_register_target(&merge_target);
2829         if (r < 0) {
2830                 DMERR("Merge target register failed %d", r);
2831                 goto bad_register_merge_target;
2832         }
2833
2834         return 0;
2835
2836 bad_register_merge_target:
2837         dm_unregister_target(&origin_target);
2838 bad_register_origin_target:
2839         dm_unregister_target(&snapshot_target);
2840 bad_register_snapshot_target:
2841         kmem_cache_destroy(pending_cache);
2842 bad_pending_cache:
2843         kmem_cache_destroy(exception_cache);
2844 bad_exception_cache:
2845         exit_origin_hash();
2846 bad_origin_hash:
2847         dm_exception_store_exit();
2848
2849         return r;
2850 }
2851
2852 static void __exit dm_snapshot_exit(void)
2853 {
2854         dm_unregister_target(&snapshot_target);
2855         dm_unregister_target(&origin_target);
2856         dm_unregister_target(&merge_target);
2857
2858         exit_origin_hash();
2859         kmem_cache_destroy(pending_cache);
2860         kmem_cache_destroy(exception_cache);
2861
2862         dm_exception_store_exit();
2863 }
2864
2865 /* Module hooks */
2866 module_init(dm_snapshot_init);
2867 module_exit(dm_snapshot_exit);
2868
2869 MODULE_DESCRIPTION(DM_NAME " snapshot target");
2870 MODULE_AUTHOR("Joe Thornber");
2871 MODULE_LICENSE("GPL");
2872 MODULE_ALIAS("dm-snapshot-origin");
2873 MODULE_ALIAS("dm-snapshot-merge");