drivers/md/dm-mpath.c
1 /*
2  * Copyright (C) 2003 Sistina Software Limited.
3  * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4  *
5  * This file is released under the GPL.
6  */
7
8 #include <linux/device-mapper.h>
9
10 #include "dm-rq.h"
11 #include "dm-bio-record.h"
12 #include "dm-path-selector.h"
13 #include "dm-uevent.h"
14
15 #include <linux/blkdev.h>
16 #include <linux/ctype.h>
17 #include <linux/init.h>
18 #include <linux/mempool.h>
19 #include <linux/module.h>
20 #include <linux/pagemap.h>
21 #include <linux/slab.h>
22 #include <linux/time.h>
23 #include <linux/timer.h>
24 #include <linux/workqueue.h>
25 #include <linux/delay.h>
26 #include <scsi/scsi_dh.h>
27 #include <linux/atomic.h>
28 #include <linux/blk-mq.h>
29
30 #define DM_MSG_PREFIX "multipath"
31 #define DM_PG_INIT_DELAY_MSECS 2000
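/*
 * (unsigned) -1 means no per-table pg_init delay was configured;
 * __pg_init_all_paths() then falls back to DM_PG_INIT_DELAY_MSECS.
 */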
32 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
33 #define QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT 0
34
35 static unsigned long queue_if_no_path_timeout_secs = QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT;
36
37 /* Path properties */
38 struct pgpath {
39         struct list_head list;
40
41         struct priority_group *pg;      /* Owning PG */
42         unsigned fail_count;            /* Cumulative failure count */
43
44         struct dm_path path;
45         struct delayed_work activate_path;
46
47         bool is_active:1;               /* Path status */
48 };
49
50 #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
51
52 /*
53  * Paths are grouped into Priority Groups and numbered from 1 upwards.
54  * Each has a path selector which controls which path gets used.
55  */
56 struct priority_group {
57         struct list_head list;
58
59         struct multipath *m;            /* Owning multipath instance */
60         struct path_selector ps;
61
62         unsigned pg_num;                /* Reference number */
63         unsigned nr_pgpaths;            /* Number of paths in PG */
64         struct list_head pgpaths;
65
66         bool bypassed:1;                /* Temporarily bypass this PG? */
67 };
68
69 /* Multipath context */
70 struct multipath {
71         unsigned long flags;            /* Multipath state flags */
72
73         spinlock_t lock;
74         enum dm_queue_mode queue_mode;
75
76         struct pgpath *current_pgpath;
77         struct priority_group *current_pg;
78         struct priority_group *next_pg; /* Switch to this PG if set */
79
80         atomic_t nr_valid_paths;        /* Total number of usable paths */
81         unsigned nr_priority_groups;
82         struct list_head priority_groups;
83
84         const char *hw_handler_name;
85         char *hw_handler_params;
86         wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
87         unsigned pg_init_retries;       /* Number of times to retry pg_init */
88         unsigned pg_init_delay_msecs;   /* Number of msecs before pg_init retry */
89         atomic_t pg_init_in_progress;   /* Only one pg_init allowed at once */
90         atomic_t pg_init_count;         /* Number of times pg_init called */
91
92         struct mutex work_mutex;
93         struct work_struct trigger_event;
94         struct dm_target *ti;
95
96         struct work_struct process_queued_bios;
97         struct bio_list queued_bios;
98
99         struct timer_list nopath_timer; /* Timeout for queue_if_no_path */
100 };
101
102 /*
103  * Context information attached to each io we process.
104  */
105 struct dm_mpath_io {
106         struct pgpath *pgpath;
107         size_t nr_bytes;
108 };
109
110 typedef int (*action_fn) (struct pgpath *pgpath);
111
112 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
113 static void trigger_event(struct work_struct *work);
114 static void activate_or_offline_path(struct pgpath *pgpath);
115 static void activate_path_work(struct work_struct *work);
116 static void process_queued_bios(struct work_struct *work);
117 static void queue_if_no_path_timeout_work(struct timer_list *t);
118
119 /*-----------------------------------------------
120  * Multipath state flags.
121  *-----------------------------------------------*/
122
123 #define MPATHF_QUEUE_IO 0                       /* Must we queue all I/O? */
124 #define MPATHF_QUEUE_IF_NO_PATH 1               /* Queue I/O if last path fails? */
125 #define MPATHF_SAVED_QUEUE_IF_NO_PATH 2         /* Saved state during suspension */
126 #define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3     /* If there's already a hw_handler present, don't change it. */
127 #define MPATHF_PG_INIT_DISABLED 4               /* pg_init is not currently allowed */
128 #define MPATHF_PG_INIT_REQUIRED 5               /* pg_init needs calling? */
129 #define MPATHF_PG_INIT_DELAY_RETRY 6            /* Delay pg_init retry? */
130
131 /*-----------------------------------------------
132  * Allocation routines
133  *-----------------------------------------------*/
134
135 static struct pgpath *alloc_pgpath(void)
136 {
137         struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
138
139         if (!pgpath)
140                 return NULL;
141
142         pgpath->is_active = true;
143
144         return pgpath;
145 }
146
147 static void free_pgpath(struct pgpath *pgpath)
148 {
149         kfree(pgpath);
150 }
151
152 static struct priority_group *alloc_priority_group(void)
153 {
154         struct priority_group *pg;
155
156         pg = kzalloc(sizeof(*pg), GFP_KERNEL);
157
158         if (pg)
159                 INIT_LIST_HEAD(&pg->pgpaths);
160
161         return pg;
162 }
163
164 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
165 {
166         struct pgpath *pgpath, *tmp;
167
168         list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
169                 list_del(&pgpath->list);
170                 dm_put_device(ti, pgpath->path.dev);
171                 free_pgpath(pgpath);
172         }
173 }
174
175 static void free_priority_group(struct priority_group *pg,
176                                 struct dm_target *ti)
177 {
178         struct path_selector *ps = &pg->ps;
179
180         if (ps->type) {
181                 ps->type->destroy(ps);
182                 dm_put_path_selector(ps->type);
183         }
184
185         free_pgpaths(&pg->pgpaths, ti);
186         kfree(pg);
187 }
188
189 static struct multipath *alloc_multipath(struct dm_target *ti)
190 {
191         struct multipath *m;
192
193         m = kzalloc(sizeof(*m), GFP_KERNEL);
194         if (m) {
195                 INIT_LIST_HEAD(&m->priority_groups);
196                 spin_lock_init(&m->lock);
197                 atomic_set(&m->nr_valid_paths, 0);
198                 INIT_WORK(&m->trigger_event, trigger_event);
199                 mutex_init(&m->work_mutex);
200
201                 m->queue_mode = DM_TYPE_NONE;
202
203                 m->ti = ti;
204                 ti->private = m;
205
206                 timer_setup(&m->nopath_timer, queue_if_no_path_timeout_work, 0);
207         }
208
209         return m;
210 }
211
212 static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
213 {
214         if (m->queue_mode == DM_TYPE_NONE) {
215                 m->queue_mode = DM_TYPE_REQUEST_BASED;
216         } else if (m->queue_mode == DM_TYPE_BIO_BASED) {
217                 INIT_WORK(&m->process_queued_bios, process_queued_bios);
218                 /*
219                  * bio-based doesn't support any direct scsi_dh management;
220                  * it just discovers if a scsi_dh is attached.
221                  */
222                 set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
223         }
224
225         dm_table_set_type(ti->table, m->queue_mode);
226
227         /*
228          * Init fields that are only used when a scsi_dh is attached
229          * - must do this unconditionally (really doesn't hurt non-SCSI uses)
230          */
231         set_bit(MPATHF_QUEUE_IO, &m->flags);
232         atomic_set(&m->pg_init_in_progress, 0);
233         atomic_set(&m->pg_init_count, 0);
234         m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
235         init_waitqueue_head(&m->pg_init_wait);
236
237         return 0;
238 }
239
240 static void free_multipath(struct multipath *m)
241 {
242         struct priority_group *pg, *tmp;
243
244         list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
245                 list_del(&pg->list);
246                 free_priority_group(pg, m->ti);
247         }
248
249         kfree(m->hw_handler_name);
250         kfree(m->hw_handler_params);
251         mutex_destroy(&m->work_mutex);
252         kfree(m);
253 }
254
255 static struct dm_mpath_io *get_mpio(union map_info *info)
256 {
257         return info->ptr;
258 }
259
260 static size_t multipath_per_bio_data_size(void)
261 {
262         return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details);
263 }
264
265 static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio)
266 {
267         return dm_per_bio_data(bio, multipath_per_bio_data_size());
268 }
269
270 static struct dm_bio_details *get_bio_details_from_mpio(struct dm_mpath_io *mpio)
271 {
272         /* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */
273         void *bio_details = mpio + 1;
274         return bio_details;
275 }
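/*
 * Per-bio-data layout assumed by the helpers above
 * (dm_per_bio_data() returns the start of this area):
 *
 *   | struct dm_mpath_io | struct dm_bio_details |
 *   ^ get_mpio_from_bio()  ^ mpio + 1
 */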
276
277 static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p)
278 {
279         struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
280         struct dm_bio_details *bio_details = get_bio_details_from_mpio(mpio);
281
282         mpio->nr_bytes = bio->bi_iter.bi_size;
283         mpio->pgpath = NULL;
284         *mpio_p = mpio;
285
286         dm_bio_record(bio_details, bio);
287 }
288
289 /*-----------------------------------------------
290  * Path selection
291  *-----------------------------------------------*/
292
293 static int __pg_init_all_paths(struct multipath *m)
294 {
295         struct pgpath *pgpath;
296         unsigned long pg_init_delay = 0;
297
298         lockdep_assert_held(&m->lock);
299
300         if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
301                 return 0;
302
303         atomic_inc(&m->pg_init_count);
304         clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
305
306         /* Check here to reset pg_init_required */
307         if (!m->current_pg)
308                 return 0;
309
310         if (test_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags))
311                 pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
312                                                  m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
313         list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
314                 /* Skip failed paths */
315                 if (!pgpath->is_active)
316                         continue;
317                 if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
318                                        pg_init_delay))
319                         atomic_inc(&m->pg_init_in_progress);
320         }
321         return atomic_read(&m->pg_init_in_progress);
322 }
323
324 static int pg_init_all_paths(struct multipath *m)
325 {
326         int ret;
327         unsigned long flags;
328
329         spin_lock_irqsave(&m->lock, flags);
330         ret = __pg_init_all_paths(m);
331         spin_unlock_irqrestore(&m->lock, flags);
332
333         return ret;
334 }
335
336 static void __switch_pg(struct multipath *m, struct priority_group *pg)
337 {
338         m->current_pg = pg;
339
340         /* Must we initialise the PG first, and queue I/O till it's ready? */
341         if (m->hw_handler_name) {
342                 set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
343                 set_bit(MPATHF_QUEUE_IO, &m->flags);
344         } else {
345                 clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
346                 clear_bit(MPATHF_QUEUE_IO, &m->flags);
347         }
348
349         atomic_set(&m->pg_init_count, 0);
350 }
351
352 static struct pgpath *choose_path_in_pg(struct multipath *m,
353                                         struct priority_group *pg,
354                                         size_t nr_bytes)
355 {
356         unsigned long flags;
357         struct dm_path *path;
358         struct pgpath *pgpath;
359
360         path = pg->ps.type->select_path(&pg->ps, nr_bytes);
361         if (!path)
362                 return ERR_PTR(-ENXIO);
363
364         pgpath = path_to_pgpath(path);
365
366         if (unlikely(READ_ONCE(m->current_pg) != pg)) {
367                 /* Only update current_pgpath if pg changed */
368                 spin_lock_irqsave(&m->lock, flags);
369                 m->current_pgpath = pgpath;
370                 __switch_pg(m, pg);
371                 spin_unlock_irqrestore(&m->lock, flags);
372         }
373
374         return pgpath;
375 }
376
377 static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
378 {
379         unsigned long flags;
380         struct priority_group *pg;
381         struct pgpath *pgpath;
382         unsigned bypassed = 1;
383
384         if (!atomic_read(&m->nr_valid_paths)) {
385                 clear_bit(MPATHF_QUEUE_IO, &m->flags);
386                 goto failed;
387         }
388
389         /* Were we instructed to switch PG? */
390         if (READ_ONCE(m->next_pg)) {
391                 spin_lock_irqsave(&m->lock, flags);
392                 pg = m->next_pg;
393                 if (!pg) {
394                         spin_unlock_irqrestore(&m->lock, flags);
395                         goto check_current_pg;
396                 }
397                 m->next_pg = NULL;
398                 spin_unlock_irqrestore(&m->lock, flags);
399                 pgpath = choose_path_in_pg(m, pg, nr_bytes);
400                 if (!IS_ERR_OR_NULL(pgpath))
401                         return pgpath;
402         }
403
404         /* Don't change PG until it has no remaining paths */
405 check_current_pg:
406         pg = READ_ONCE(m->current_pg);
407         if (pg) {
408                 pgpath = choose_path_in_pg(m, pg, nr_bytes);
409                 if (!IS_ERR_OR_NULL(pgpath))
410                         return pgpath;
411         }
412
413         /*
414          * Loop through priority groups until we find a valid path.
415          * First time we skip PGs marked 'bypassed'.
416          * Second time we only try the ones we skipped, but set
417          * pg_init_delay_retry so we do not hammer controllers.
418          */
419         do {
420                 list_for_each_entry(pg, &m->priority_groups, list) {
421                         if (pg->bypassed == !!bypassed)
422                                 continue;
423                         pgpath = choose_path_in_pg(m, pg, nr_bytes);
424                         if (!IS_ERR_OR_NULL(pgpath)) {
425                                 if (!bypassed)
426                                         set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
427                                 return pgpath;
428                         }
429                 }
430         } while (bypassed--);
431
432 failed:
433         spin_lock_irqsave(&m->lock, flags);
434         m->current_pgpath = NULL;
435         m->current_pg = NULL;
436         spin_unlock_irqrestore(&m->lock, flags);
437
438         return NULL;
439 }
440
441 /*
442  * dm_report_EIO() is a macro instead of a function to make pr_debug()
443  * report the function name and line number of the function from which
444  * it has been invoked.
445  */
446 #define dm_report_EIO(m)                                                \
447 do {                                                                    \
448         struct mapped_device *md = dm_table_get_md((m)->ti->table);     \
449                                                                         \
450         pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \
451                  dm_device_name(md),                                    \
452                  test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags),        \
453                  test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags),  \
454                  dm_noflush_suspending((m)->ti));                       \
455 } while (0)
456
457 /*
458  * Check whether bios must be queued in the device-mapper core rather
459  * than here in the target.
460  *
461  * If MPATHF_QUEUE_IF_NO_PATH and MPATHF_SAVED_QUEUE_IF_NO_PATH hold
462  * the same value then we are not between multipath_presuspend()
463  * and multipath_resume() calls and we have no need to check
464  * for the DMF_NOFLUSH_SUSPENDING flag.
465  */
466 static bool __must_push_back(struct multipath *m, unsigned long flags)
467 {
468         return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) !=
469                  test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &flags)) &&
470                 dm_noflush_suspending(m->ti));
471 }
472
473 /*
474  * The following functions use READ_ONCE to take an atomic snapshot of
475  * m->flags and so avoid taking the spinlock.
476  */
477 static bool must_push_back_rq(struct multipath *m)
478 {
479         unsigned long flags = READ_ONCE(m->flags);
480         return test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) || __must_push_back(m, flags);
481 }
482
483 static bool must_push_back_bio(struct multipath *m)
484 {
485         unsigned long flags = READ_ONCE(m->flags);
486         return __must_push_back(m, flags);
487 }
488
489 /*
490  * Map cloned requests (request-based multipath)
491  */
492 static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
493                                    union map_info *map_context,
494                                    struct request **__clone)
495 {
496         struct multipath *m = ti->private;
497         size_t nr_bytes = blk_rq_bytes(rq);
498         struct pgpath *pgpath;
499         struct block_device *bdev;
500         struct dm_mpath_io *mpio = get_mpio(map_context);
501         struct request_queue *q;
502         struct request *clone;
503
504         /* Do we need to select a new pgpath? */
505         pgpath = READ_ONCE(m->current_pgpath);
506         if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
507                 pgpath = choose_pgpath(m, nr_bytes);
508
509         if (!pgpath) {
510                 if (must_push_back_rq(m))
511                         return DM_MAPIO_DELAY_REQUEUE;
512                 dm_report_EIO(m);       /* Failed */
513                 return DM_MAPIO_KILL;
514         } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
515                    test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
516                 pg_init_all_paths(m);
517                 return DM_MAPIO_DELAY_REQUEUE;
518         }
519
520         mpio->pgpath = pgpath;
521         mpio->nr_bytes = nr_bytes;
522
523         bdev = pgpath->path.dev->bdev;
524         q = bdev_get_queue(bdev);
525         clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE,
526                         BLK_MQ_REQ_NOWAIT);
527         if (IS_ERR(clone)) {
528                 /* EBUSY, ENODEV or EWOULDBLOCK: requeue */
529                 if (blk_queue_dying(q)) {
530                         atomic_inc(&m->pg_init_in_progress);
531                         activate_or_offline_path(pgpath);
532                         return DM_MAPIO_DELAY_REQUEUE;
533                 }
534
535                 /*
536                  * blk-mq's SCHED_RESTART can cover this requeue, so we
537                  * needn't deal with it by DELAY_REQUEUE. More importantly,
538                  * we have to return DM_MAPIO_REQUEUE so that blk-mq can
539                  * get the queue busy feedback (via BLK_STS_RESOURCE),
540                  * otherwise I/O merging can suffer.
541                  */
542                 return DM_MAPIO_REQUEUE;
543         }
544         clone->bio = clone->biotail = NULL;
545         clone->rq_disk = bdev->bd_disk;
546         clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
547         *__clone = clone;
548
549         if (pgpath->pg->ps.type->start_io)
550                 pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
551                                               &pgpath->path,
552                                               nr_bytes);
553         return DM_MAPIO_REMAPPED;
554 }
555
556 static void multipath_release_clone(struct request *clone,
557                                     union map_info *map_context)
558 {
559         if (unlikely(map_context)) {
560                 /*
561                  * non-NULL map_context means caller is still map
562                  * method; must undo multipath_clone_and_map()
563                  */
564                 struct dm_mpath_io *mpio = get_mpio(map_context);
565                 struct pgpath *pgpath = mpio->pgpath;
566
567                 if (pgpath && pgpath->pg->ps.type->end_io)
568                         pgpath->pg->ps.type->end_io(&pgpath->pg->ps,
569                                                     &pgpath->path,
570                                                     mpio->nr_bytes);
571         }
572
573         blk_put_request(clone);
574 }
575
576 /*
577  * Map cloned bios (bio-based multipath)
578  */
579
580 static struct pgpath *__map_bio(struct multipath *m, struct bio *bio)
581 {
582         struct pgpath *pgpath;
583         unsigned long flags;
584         bool queue_io;
585
586         /* Do we need to select a new pgpath? */
587         pgpath = READ_ONCE(m->current_pgpath);
588         queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags);
589         if (!pgpath || !queue_io)
590                 pgpath = choose_pgpath(m, bio->bi_iter.bi_size);
591
592         if ((pgpath && queue_io) ||
593             (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) {
594                 /* Queue for the daemon to resubmit */
595                 spin_lock_irqsave(&m->lock, flags);
596                 bio_list_add(&m->queued_bios, bio);
597                 spin_unlock_irqrestore(&m->lock, flags);
598
599                 /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */
600                 if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
601                         pg_init_all_paths(m);
602                 else if (!queue_io)
603                         queue_work(kmultipathd, &m->process_queued_bios);
604
605                 return ERR_PTR(-EAGAIN);
606         }
607
608         return pgpath;
609 }
610
611 static int __multipath_map_bio(struct multipath *m, struct bio *bio,
612                                struct dm_mpath_io *mpio)
613 {
614         struct pgpath *pgpath = __map_bio(m, bio);
615
616         if (IS_ERR(pgpath))
617                 return DM_MAPIO_SUBMITTED;
618
619         if (!pgpath) {
620                 if (must_push_back_bio(m))
621                         return DM_MAPIO_REQUEUE;
622                 dm_report_EIO(m);
623                 return DM_MAPIO_KILL;
624         }
625
626         mpio->pgpath = pgpath;
627
628         bio->bi_status = 0;
629         bio_set_dev(bio, pgpath->path.dev->bdev);
630         bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
631
632         if (pgpath->pg->ps.type->start_io)
633                 pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
634                                               &pgpath->path,
635                                               mpio->nr_bytes);
636         return DM_MAPIO_REMAPPED;
637 }
638
639 static int multipath_map_bio(struct dm_target *ti, struct bio *bio)
640 {
641         struct multipath *m = ti->private;
642         struct dm_mpath_io *mpio = NULL;
643
644         multipath_init_per_bio_data(bio, &mpio);
645         return __multipath_map_bio(m, bio, mpio);
646 }
647
648 static void process_queued_io_list(struct multipath *m)
649 {
650         if (m->queue_mode == DM_TYPE_REQUEST_BASED)
651                 dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table));
652         else if (m->queue_mode == DM_TYPE_BIO_BASED)
653                 queue_work(kmultipathd, &m->process_queued_bios);
654 }
655
656 static void process_queued_bios(struct work_struct *work)
657 {
658         int r;
659         unsigned long flags;
660         struct bio *bio;
661         struct bio_list bios;
662         struct blk_plug plug;
663         struct multipath *m =
664                 container_of(work, struct multipath, process_queued_bios);
665
666         bio_list_init(&bios);
667
668         spin_lock_irqsave(&m->lock, flags);
669
670         if (bio_list_empty(&m->queued_bios)) {
671                 spin_unlock_irqrestore(&m->lock, flags);
672                 return;
673         }
674
675         bio_list_merge(&bios, &m->queued_bios);
676         bio_list_init(&m->queued_bios);
677
678         spin_unlock_irqrestore(&m->lock, flags);
679
680         blk_start_plug(&plug);
681         while ((bio = bio_list_pop(&bios))) {
682                 struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
683                 dm_bio_restore(get_bio_details_from_mpio(mpio), bio);
684                 r = __multipath_map_bio(m, bio, mpio);
685                 switch (r) {
686                 case DM_MAPIO_KILL:
687                         bio->bi_status = BLK_STS_IOERR;
688                         bio_endio(bio);
689                         break;
690                 case DM_MAPIO_REQUEUE:
691                         bio->bi_status = BLK_STS_DM_REQUEUE;
692                         bio_endio(bio);
693                         break;
694                 case DM_MAPIO_REMAPPED:
695                         generic_make_request(bio);
696                         break;
697                 case DM_MAPIO_SUBMITTED:
698                         break;
699                 default:
700                         WARN_ONCE(true, "__multipath_map_bio() returned %d\n", r);
701                 }
702         }
703         blk_finish_plug(&plug);
704 }
705
706 /*
707  * If we run out of usable paths, should we queue I/O or error it?
708  */
709 static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
710                             bool save_old_value)
711 {
712         unsigned long flags;
713
714         spin_lock_irqsave(&m->lock, flags);
715         assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags,
716                    (save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) ||
717                    (!save_old_value && queue_if_no_path));
718         assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path);
719         spin_unlock_irqrestore(&m->lock, flags);
720
721         if (!queue_if_no_path) {
722                 dm_table_run_md_queue_async(m->ti->table);
723                 process_queued_io_list(m);
724         }
725
726         return 0;
727 }
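/*
 * Illustrative usage (not part of this file): userspace normally toggles
 * this via the target message interface handled by multipath_message()
 * further down, e.g.
 *
 *   dmsetup message <mpath-dev> 0 queue_if_no_path
 *   dmsetup message <mpath-dev> 0 fail_if_no_path
 */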
728
729 /*
730  * If the queue_if_no_path timeout fires, turn off queue_if_no_path and
731  * process any queued I/O.
732  */
733 static void queue_if_no_path_timeout_work(struct timer_list *t)
734 {
735         struct multipath *m = from_timer(m, t, nopath_timer);
736         struct mapped_device *md = dm_table_get_md(m->ti->table);
737
738         DMWARN("queue_if_no_path timeout on %s, failing queued IO", dm_device_name(md));
739         queue_if_no_path(m, false, false);
740 }
741
742 /*
743  * Enable the queue_if_no_path timeout if necessary.
744  * Called with m->lock held.
745  */
746 static void enable_nopath_timeout(struct multipath *m)
747 {
748         unsigned long queue_if_no_path_timeout =
749                 READ_ONCE(queue_if_no_path_timeout_secs) * HZ;
750
751         lockdep_assert_held(&m->lock);
752
753         if (queue_if_no_path_timeout > 0 &&
754             atomic_read(&m->nr_valid_paths) == 0 &&
755             test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
756                 mod_timer(&m->nopath_timer,
757                           jiffies + queue_if_no_path_timeout);
758         }
759 }
760
761 static void disable_nopath_timeout(struct multipath *m)
762 {
763         del_timer_sync(&m->nopath_timer);
764 }
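/*
 * Assuming the usual module_param() declaration for
 * queue_if_no_path_timeout_secs (not shown in this excerpt), the timeout
 * can be adjusted at runtime via
 * /sys/module/dm_multipath/parameters/queue_if_no_path_timeout_secs.
 */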
765
766 /*
767  * An event is triggered whenever a path is taken out of use.
768  * Includes path failure and PG bypass.
769  */
770 static void trigger_event(struct work_struct *work)
771 {
772         struct multipath *m =
773                 container_of(work, struct multipath, trigger_event);
774
775         dm_table_event(m->ti->table);
776 }
777
778 /*-----------------------------------------------------------------
779  * Constructor/argument parsing:
780  * <#multipath feature args> [<arg>]*
781  * <#hw_handler args> [hw_handler [<arg>]*]
782  * <#priority groups>
783  * <initial priority group>
784  *     [<selector> <#selector args> [<arg>]*
785  *      <#paths> <#per-path selector args>
786  *         [<path> [<arg>]* ]+ ]+
787  *---------------------------------------------------------------*/
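/*
 * Example table line (illustrative only; device numbers are made up):
 *
 *   0 1024000 multipath 1 queue_if_no_path 0 2 1 \
 *       round-robin 0 2 1 8:16 1 8:32 1 \
 *       round-robin 0 1 1 8:48 1
 *
 * One feature arg (queue_if_no_path), no hardware handler, two priority
 * groups starting with PG 1; each PG uses the round-robin selector with
 * no selector args and one per-path selector arg (the repeat count).
 */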
788 static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
789                                struct dm_target *ti)
790 {
791         int r;
792         struct path_selector_type *pst;
793         unsigned ps_argc;
794
795         static const struct dm_arg _args[] = {
796                 {0, 1024, "invalid number of path selector args"},
797         };
798
799         pst = dm_get_path_selector(dm_shift_arg(as));
800         if (!pst) {
801                 ti->error = "unknown path selector type";
802                 return -EINVAL;
803         }
804
805         r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
806         if (r) {
807                 dm_put_path_selector(pst);
808                 return -EINVAL;
809         }
810
811         r = pst->create(&pg->ps, ps_argc, as->argv);
812         if (r) {
813                 dm_put_path_selector(pst);
814                 ti->error = "path selector constructor failed";
815                 return r;
816         }
817
818         pg->ps.type = pst;
819         dm_consume_args(as, ps_argc);
820
821         return 0;
822 }
823
824 static int setup_scsi_dh(struct block_device *bdev, struct multipath *m,
825                          const char **attached_handler_name, char **error)
826 {
827         struct request_queue *q = bdev_get_queue(bdev);
828         int r;
829
830         if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) {
831 retain:
832                 if (*attached_handler_name) {
833                         /*
834                          * Clear any hw_handler_params associated with a
835                          * handler that isn't already attached.
836                          */
837                         if (m->hw_handler_name && strcmp(*attached_handler_name, m->hw_handler_name)) {
838                                 kfree(m->hw_handler_params);
839                                 m->hw_handler_params = NULL;
840                         }
841
842                         /*
843                          * Reset hw_handler_name to match the attached handler
844                          *
845                          * NB. This modifies the table line to show the actual
846                          * handler instead of the original table passed in.
847                          */
848                         kfree(m->hw_handler_name);
849                         m->hw_handler_name = *attached_handler_name;
850                         *attached_handler_name = NULL;
851                 }
852         }
853
854         if (m->hw_handler_name) {
855                 r = scsi_dh_attach(q, m->hw_handler_name);
856                 if (r == -EBUSY) {
857                         char b[BDEVNAME_SIZE];
858
859                         printk(KERN_INFO "dm-mpath: retaining handler on device %s\n",
860                                bdevname(bdev, b));
861                         goto retain;
862                 }
863                 if (r < 0) {
864                         *error = "error attaching hardware handler";
865                         return r;
866                 }
867
868                 if (m->hw_handler_params) {
869                         r = scsi_dh_set_params(q, m->hw_handler_params);
870                         if (r < 0) {
871                                 *error = "unable to set hardware handler parameters";
872                                 return r;
873                         }
874                 }
875         }
876
877         return 0;
878 }
879
880 static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
881                                  struct dm_target *ti)
882 {
883         int r;
884         struct pgpath *p;
885         struct multipath *m = ti->private;
886         struct request_queue *q;
887         const char *attached_handler_name = NULL;
888
889         /* we need at least a path arg */
890         if (as->argc < 1) {
891                 ti->error = "no device given";
892                 return ERR_PTR(-EINVAL);
893         }
894
895         p = alloc_pgpath();
896         if (!p)
897                 return ERR_PTR(-ENOMEM);
898
899         r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
900                           &p->path.dev);
901         if (r) {
902                 ti->error = "error getting device";
903                 goto bad;
904         }
905
906         q = bdev_get_queue(p->path.dev->bdev);
907         attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
908         if (attached_handler_name || m->hw_handler_name) {
909                 INIT_DELAYED_WORK(&p->activate_path, activate_path_work);
910                 r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error);
911                 kfree(attached_handler_name);
912                 if (r) {
913                         dm_put_device(ti, p->path.dev);
914                         goto bad;
915                 }
916         }
917
918         r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
919         if (r) {
920                 dm_put_device(ti, p->path.dev);
921                 goto bad;
922         }
923
924         return p;
925  bad:
926         free_pgpath(p);
927         return ERR_PTR(r);
928 }
929
930 static struct priority_group *parse_priority_group(struct dm_arg_set *as,
931                                                    struct multipath *m)
932 {
933         static const struct dm_arg _args[] = {
934                 {1, 1024, "invalid number of paths"},
935                 {0, 1024, "invalid number of selector args"}
936         };
937
938         int r;
939         unsigned i, nr_selector_args, nr_args;
940         struct priority_group *pg;
941         struct dm_target *ti = m->ti;
942
943         if (as->argc < 2) {
944                 as->argc = 0;
945                 ti->error = "not enough priority group arguments";
946                 return ERR_PTR(-EINVAL);
947         }
948
949         pg = alloc_priority_group();
950         if (!pg) {
951                 ti->error = "couldn't allocate priority group";
952                 return ERR_PTR(-ENOMEM);
953         }
954         pg->m = m;
955
956         r = parse_path_selector(as, pg, ti);
957         if (r)
958                 goto bad;
959
960         /*
961          * read the paths
962          */
963         r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
964         if (r)
965                 goto bad;
966
967         r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
968         if (r)
969                 goto bad;
970
971         nr_args = 1 + nr_selector_args;
972         for (i = 0; i < pg->nr_pgpaths; i++) {
973                 struct pgpath *pgpath;
974                 struct dm_arg_set path_args;
975
976                 if (as->argc < nr_args) {
977                         ti->error = "not enough path parameters";
978                         r = -EINVAL;
979                         goto bad;
980                 }
981
982                 path_args.argc = nr_args;
983                 path_args.argv = as->argv;
984
985                 pgpath = parse_path(&path_args, &pg->ps, ti);
986                 if (IS_ERR(pgpath)) {
987                         r = PTR_ERR(pgpath);
988                         goto bad;
989                 }
990
991                 pgpath->pg = pg;
992                 list_add_tail(&pgpath->list, &pg->pgpaths);
993                 dm_consume_args(as, nr_args);
994         }
995
996         return pg;
997
998  bad:
999         free_priority_group(pg, ti);
1000         return ERR_PTR(r);
1001 }
1002
1003 static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
1004 {
1005         unsigned hw_argc;
1006         int ret;
1007         struct dm_target *ti = m->ti;
1008
1009         static const struct dm_arg _args[] = {
1010                 {0, 1024, "invalid number of hardware handler args"},
1011         };
1012
1013         if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
1014                 return -EINVAL;
1015
1016         if (!hw_argc)
1017                 return 0;
1018
1019         if (m->queue_mode == DM_TYPE_BIO_BASED) {
1020                 dm_consume_args(as, hw_argc);
1021                 DMERR("bio-based multipath doesn't allow hardware handler args");
1022                 return 0;
1023         }
1024
1025         m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
1026         if (!m->hw_handler_name)
1027                 return -EINVAL;
1028
1029         if (hw_argc > 1) {
1030                 char *p;
1031                 int i, j, len = 4;
1032
1033                 for (i = 0; i <= hw_argc - 2; i++)
1034                         len += strlen(as->argv[i]) + 1;
1035                 p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
1036                 if (!p) {
1037                         ti->error = "memory allocation failed";
1038                         ret = -ENOMEM;
1039                         goto fail;
1040                 }
1041                 j = sprintf(p, "%d", hw_argc - 1);
1042                 for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
1043                         j = sprintf(p, "%s", as->argv[i]);
1044         }
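        /*
         * Illustration (hypothetical handler and args): for a hw handler
         * section "3 foo argA argB", hw_handler_name is "foo" and the loop
         * above builds hw_handler_params as the consecutive NUL-terminated
         * strings "2", "argA", "argB", i.e. the argument count followed by
         * each argument.
         */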
1045         dm_consume_args(as, hw_argc - 1);
1046
1047         return 0;
1048 fail:
1049         kfree(m->hw_handler_name);
1050         m->hw_handler_name = NULL;
1051         return ret;
1052 }
1053
1054 static int parse_features(struct dm_arg_set *as, struct multipath *m)
1055 {
1056         int r;
1057         unsigned argc;
1058         struct dm_target *ti = m->ti;
1059         const char *arg_name;
1060
1061         static const struct dm_arg _args[] = {
1062                 {0, 8, "invalid number of feature args"},
1063                 {1, 50, "pg_init_retries must be between 1 and 50"},
1064                 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
1065         };
1066
1067         r = dm_read_arg_group(_args, as, &argc, &ti->error);
1068         if (r)
1069                 return -EINVAL;
1070
1071         if (!argc)
1072                 return 0;
1073
1074         do {
1075                 arg_name = dm_shift_arg(as);
1076                 argc--;
1077
1078                 if (!strcasecmp(arg_name, "queue_if_no_path")) {
1079                         r = queue_if_no_path(m, true, false);
1080                         continue;
1081                 }
1082
1083                 if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
1084                         set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
1085                         continue;
1086                 }
1087
1088                 if (!strcasecmp(arg_name, "pg_init_retries") &&
1089                     (argc >= 1)) {
1090                         r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
1091                         argc--;
1092                         continue;
1093                 }
1094
1095                 if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
1096                     (argc >= 1)) {
1097                         r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
1098                         argc--;
1099                         continue;
1100                 }
1101
1102                 if (!strcasecmp(arg_name, "queue_mode") &&
1103                     (argc >= 1)) {
1104                         const char *queue_mode_name = dm_shift_arg(as);
1105
1106                         if (!strcasecmp(queue_mode_name, "bio"))
1107                                 m->queue_mode = DM_TYPE_BIO_BASED;
1108                         else if (!strcasecmp(queue_mode_name, "rq") ||
1109                                  !strcasecmp(queue_mode_name, "mq"))
1110                                 m->queue_mode = DM_TYPE_REQUEST_BASED;
1111                         else {
1112                                 ti->error = "Unknown 'queue_mode' requested";
1113                                 r = -EINVAL;
1114                         }
1115                         argc--;
1116                         continue;
1117                 }
1118
1119                 ti->error = "Unrecognised multipath feature request";
1120                 r = -EINVAL;
1121         } while (argc && !r);
1122
1123         return r;
1124 }
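/*
 * Example feature sections accepted above (illustrative):
 *
 *   0                                    no features
 *   1 queue_if_no_path                   queue I/O while no path is usable
 *   4 pg_init_retries 3 queue_mode bio   allow up to 3 pg_init retries and
 *                                        use bio-based queue_mode
 */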
1125
1126 static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
1127 {
1128         /* target arguments */
1129         static const struct dm_arg _args[] = {
1130                 {0, 1024, "invalid number of priority groups"},
1131                 {0, 1024, "invalid initial priority group number"},
1132         };
1133
1134         int r;
1135         struct multipath *m;
1136         struct dm_arg_set as;
1137         unsigned pg_count = 0;
1138         unsigned next_pg_num;
1139         unsigned long flags;
1140
1141         as.argc = argc;
1142         as.argv = argv;
1143
1144         m = alloc_multipath(ti);
1145         if (!m) {
1146                 ti->error = "can't allocate multipath";
1147                 return -EINVAL;
1148         }
1149
1150         r = parse_features(&as, m);
1151         if (r)
1152                 goto bad;
1153
1154         r = alloc_multipath_stage2(ti, m);
1155         if (r)
1156                 goto bad;
1157
1158         r = parse_hw_handler(&as, m);
1159         if (r)
1160                 goto bad;
1161
1162         r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
1163         if (r)
1164                 goto bad;
1165
1166         r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
1167         if (r)
1168                 goto bad;
1169
1170         if ((!m->nr_priority_groups && next_pg_num) ||
1171             (m->nr_priority_groups && !next_pg_num)) {
1172                 ti->error = "invalid initial priority group";
1173                 r = -EINVAL;
1174                 goto bad;
1175         }
1176
1177         /* parse the priority groups */
1178         while (as.argc) {
1179                 struct priority_group *pg;
1180                 unsigned nr_valid_paths = atomic_read(&m->nr_valid_paths);
1181
1182                 pg = parse_priority_group(&as, m);
1183                 if (IS_ERR(pg)) {
1184                         r = PTR_ERR(pg);
1185                         goto bad;
1186                 }
1187
1188                 nr_valid_paths += pg->nr_pgpaths;
1189                 atomic_set(&m->nr_valid_paths, nr_valid_paths);
1190
1191                 list_add_tail(&pg->list, &m->priority_groups);
1192                 pg_count++;
1193                 pg->pg_num = pg_count;
1194                 if (!--next_pg_num)
1195                         m->next_pg = pg;
1196         }
1197
1198         if (pg_count != m->nr_priority_groups) {
1199                 ti->error = "priority group count mismatch";
1200                 r = -EINVAL;
1201                 goto bad;
1202         }
1203
1204         spin_lock_irqsave(&m->lock, flags);
1205         enable_nopath_timeout(m);
1206         spin_unlock_irqrestore(&m->lock, flags);
1207
1208         ti->num_flush_bios = 1;
1209         ti->num_discard_bios = 1;
1210         ti->num_write_same_bios = 1;
1211         ti->num_write_zeroes_bios = 1;
1212         if (m->queue_mode == DM_TYPE_BIO_BASED)
1213                 ti->per_io_data_size = multipath_per_bio_data_size();
1214         else
1215                 ti->per_io_data_size = sizeof(struct dm_mpath_io);
1216
1217         return 0;
1218
1219  bad:
1220         free_multipath(m);
1221         return r;
1222 }
1223
1224 static void multipath_wait_for_pg_init_completion(struct multipath *m)
1225 {
1226         DEFINE_WAIT(wait);
1227
1228         while (1) {
1229                 prepare_to_wait(&m->pg_init_wait, &wait, TASK_UNINTERRUPTIBLE);
1230
1231                 if (!atomic_read(&m->pg_init_in_progress))
1232                         break;
1233
1234                 io_schedule();
1235         }
1236         finish_wait(&m->pg_init_wait, &wait);
1237 }
1238
1239 static void flush_multipath_work(struct multipath *m)
1240 {
1241         if (m->hw_handler_name) {
1242                 set_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
1243                 smp_mb__after_atomic();
1244
1245                 if (atomic_read(&m->pg_init_in_progress))
1246                         flush_workqueue(kmpath_handlerd);
1247                 multipath_wait_for_pg_init_completion(m);
1248
1249                 clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
1250                 smp_mb__after_atomic();
1251         }
1252
1253         if (m->queue_mode == DM_TYPE_BIO_BASED)
1254                 flush_work(&m->process_queued_bios);
1255         flush_work(&m->trigger_event);
1256 }
1257
1258 static void multipath_dtr(struct dm_target *ti)
1259 {
1260         struct multipath *m = ti->private;
1261
1262         disable_nopath_timeout(m);
1263         flush_multipath_work(m);
1264         free_multipath(m);
1265 }
1266
1267 /*
1268  * Take a path out of use.
1269  */
1270 static int fail_path(struct pgpath *pgpath)
1271 {
1272         unsigned long flags;
1273         struct multipath *m = pgpath->pg->m;
1274
1275         spin_lock_irqsave(&m->lock, flags);
1276
1277         if (!pgpath->is_active)
1278                 goto out;
1279
1280         DMWARN("Failing path %s.", pgpath->path.dev->name);
1281
1282         pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
1283         pgpath->is_active = false;
1284         pgpath->fail_count++;
1285
1286         atomic_dec(&m->nr_valid_paths);
1287
1288         if (pgpath == m->current_pgpath)
1289                 m->current_pgpath = NULL;
1290
1291         dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
1292                        pgpath->path.dev->name, atomic_read(&m->nr_valid_paths));
1293
1294         schedule_work(&m->trigger_event);
1295
1296         enable_nopath_timeout(m);
1297
1298 out:
1299         spin_unlock_irqrestore(&m->lock, flags);
1300
1301         return 0;
1302 }
1303
1304 /*
1305  * Reinstate a previously-failed path
1306  */
1307 static int reinstate_path(struct pgpath *pgpath)
1308 {
1309         int r = 0, run_queue = 0;
1310         unsigned long flags;
1311         struct multipath *m = pgpath->pg->m;
1312         unsigned nr_valid_paths;
1313
1314         spin_lock_irqsave(&m->lock, flags);
1315
1316         if (pgpath->is_active)
1317                 goto out;
1318
1319         DMWARN("Reinstating path %s.", pgpath->path.dev->name);
1320
1321         r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
1322         if (r)
1323                 goto out;
1324
1325         pgpath->is_active = true;
1326
1327         nr_valid_paths = atomic_inc_return(&m->nr_valid_paths);
1328         if (nr_valid_paths == 1) {
1329                 m->current_pgpath = NULL;
1330                 run_queue = 1;
1331         } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
1332                 if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
1333                         atomic_inc(&m->pg_init_in_progress);
1334         }
1335
1336         dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
1337                        pgpath->path.dev->name, nr_valid_paths);
1338
1339         schedule_work(&m->trigger_event);
1340
1341 out:
1342         spin_unlock_irqrestore(&m->lock, flags);
1343         if (run_queue) {
1344                 dm_table_run_md_queue_async(m->ti->table);
1345                 process_queued_io_list(m);
1346         }
1347
1348         if (pgpath->is_active)
1349                 disable_nopath_timeout(m);
1350
1351         return r;
1352 }
1353
1354 /*
1355  * Fail or reinstate all paths that match the provided struct dm_dev.
1356  */
1357 static int action_dev(struct multipath *m, struct dm_dev *dev,
1358                       action_fn action)
1359 {
1360         int r = -EINVAL;
1361         struct pgpath *pgpath;
1362         struct priority_group *pg;
1363
1364         list_for_each_entry(pg, &m->priority_groups, list) {
1365                 list_for_each_entry(pgpath, &pg->pgpaths, list) {
1366                         if (pgpath->path.dev == dev)
1367                                 r = action(pgpath);
1368                 }
1369         }
1370
1371         return r;
1372 }
1373
1374 /*
1375  * Temporarily try to avoid having to use the specified PG
1376  */
1377 static void bypass_pg(struct multipath *m, struct priority_group *pg,
1378                       bool bypassed)
1379 {
1380         unsigned long flags;
1381
1382         spin_lock_irqsave(&m->lock, flags);
1383
1384         pg->bypassed = bypassed;
1385         m->current_pgpath = NULL;
1386         m->current_pg = NULL;
1387
1388         spin_unlock_irqrestore(&m->lock, flags);
1389
1390         schedule_work(&m->trigger_event);
1391 }
1392
1393 /*
1394  * Switch to using the specified PG from the next I/O that gets mapped
1395  */
1396 static int switch_pg_num(struct multipath *m, const char *pgstr)
1397 {
1398         struct priority_group *pg;
1399         unsigned pgnum;
1400         unsigned long flags;
1401         char dummy;
1402
1403         if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1404             !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) {
1405                 DMWARN("invalid PG number supplied to switch_pg_num");
1406                 return -EINVAL;
1407         }
1408
1409         spin_lock_irqsave(&m->lock, flags);
1410         list_for_each_entry(pg, &m->priority_groups, list) {
1411                 pg->bypassed = false;
1412                 if (--pgnum)
1413                         continue;
1414
1415                 m->current_pgpath = NULL;
1416                 m->current_pg = NULL;
1417                 m->next_pg = pg;
1418         }
1419         spin_unlock_irqrestore(&m->lock, flags);
1420
1421         schedule_work(&m->trigger_event);
1422         return 0;
1423 }
1424
1425 /*
1426  * Set/clear bypassed status of a PG.
1427  * PGs are numbered upwards from 1 in the order they were declared.
1428  */
1429 static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed)
1430 {
1431         struct priority_group *pg;
1432         unsigned pgnum;
1433         char dummy;
1434
1435         if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1436             !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) {
1437                 DMWARN("invalid PG number supplied to bypass_pg");
1438                 return -EINVAL;
1439         }
1440
1441         list_for_each_entry(pg, &m->priority_groups, list) {
1442                 if (!--pgnum)
1443                         break;
1444         }
1445
1446         bypass_pg(m, pg, bypassed);
1447         return 0;
1448 }
1449
1450 /*
1451  * Should we retry pg_init immediately?
1452  */
1453 static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
1454 {
1455         unsigned long flags;
1456         bool limit_reached = false;
1457
1458         spin_lock_irqsave(&m->lock, flags);
1459
1460         if (atomic_read(&m->pg_init_count) <= m->pg_init_retries &&
1461             !test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
1462                 set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
1463         else
1464                 limit_reached = true;
1465
1466         spin_unlock_irqrestore(&m->lock, flags);
1467
1468         return limit_reached;
1469 }
1470
1471 static void pg_init_done(void *data, int errors)
1472 {
1473         struct pgpath *pgpath = data;
1474         struct priority_group *pg = pgpath->pg;
1475         struct multipath *m = pg->m;
1476         unsigned long flags;
1477         bool delay_retry = false;
1478
1479         /* device or driver problems */
1480         switch (errors) {
1481         case SCSI_DH_OK:
1482                 break;
1483         case SCSI_DH_NOSYS:
1484                 if (!m->hw_handler_name) {
1485                         errors = 0;
1486                         break;
1487                 }
1488                 DMERR("Could not failover the device: Handler scsi_dh_%s "
1489                       "Error %d.", m->hw_handler_name, errors);
1490                 /*
1491                  * Fail path for now, so we do not ping pong
1492                  */
1493                 fail_path(pgpath);
1494                 break;
1495         case SCSI_DH_DEV_TEMP_BUSY:
1496                 /*
1497                  * Probably doing something like FW upgrade on the
1498                  * controller so try the other pg.
1499                  */
1500                 bypass_pg(m, pg, true);
1501                 break;
1502         case SCSI_DH_RETRY:
1503                 /* Wait before retrying. */
1504                 delay_retry = true;
1505                 /* fall through */
1506         case SCSI_DH_IMM_RETRY:
1507         case SCSI_DH_RES_TEMP_UNAVAIL:
1508                 if (pg_init_limit_reached(m, pgpath))
1509                         fail_path(pgpath);
1510                 errors = 0;
1511                 break;
1512         case SCSI_DH_DEV_OFFLINED:
1513         default:
1514                 /*
1515                  * We probably do not want to fail the path for a device
1516                  * error, but this is what the old dm did. In future
1517                  * patches we can do more advanced handling.
1518                  */
1519                 fail_path(pgpath);
1520         }
1521
1522         spin_lock_irqsave(&m->lock, flags);
1523         if (errors) {
1524                 if (pgpath == m->current_pgpath) {
1525                         DMERR("Could not failover device. Error %d.", errors);
1526                         m->current_pgpath = NULL;
1527                         m->current_pg = NULL;
1528                 }
1529         } else if (!test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
1530                 pg->bypassed = false;
1531
1532         if (atomic_dec_return(&m->pg_init_in_progress) > 0)
1533                 /* Activations of other paths are still ongoing */
1534                 goto out;
1535
1536         if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
1537                 if (delay_retry)
1538                         set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
1539                 else
1540                         clear_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
1541
1542                 if (__pg_init_all_paths(m))
1543                         goto out;
1544         }
1545         clear_bit(MPATHF_QUEUE_IO, &m->flags);
1546
1547         process_queued_io_list(m);
1548
1549         /*
1550          * Wake up any thread waiting to suspend.
1551          */
1552         wake_up(&m->pg_init_wait);
1553
1554 out:
1555         spin_unlock_irqrestore(&m->lock, flags);
1556 }
1557
1558 static void activate_or_offline_path(struct pgpath *pgpath)
1559 {
1560         struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
1561
1562         if (pgpath->is_active && !blk_queue_dying(q))
1563                 scsi_dh_activate(q, pg_init_done, pgpath);
1564         else
1565                 pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
1566 }
1567
1568 static void activate_path_work(struct work_struct *work)
1569 {
1570         struct pgpath *pgpath =
1571                 container_of(work, struct pgpath, activate_path.work);
1572
1573         activate_or_offline_path(pgpath);
1574 }
1575
1576 static int multipath_end_io(struct dm_target *ti, struct request *clone,
1577                             blk_status_t error, union map_info *map_context)
1578 {
1579         struct dm_mpath_io *mpio = get_mpio(map_context);
1580         struct pgpath *pgpath = mpio->pgpath;
1581         int r = DM_ENDIO_DONE;
1582
1583         /*
1584          * We don't queue any clone request inside the multipath target
1585          * during end I/O handling, since those clone requests don't have
1586          * bio clones.  If we queued them inside the multipath target,
1587          * we would need to make bio clones, which requires memory allocation.
1588          * (See drivers/md/dm-rq.c:end_clone_bio() about why the clone requests
1589          *  don't have bio clones.)
1590          * Instead of queueing the clone request here, we queue the original
1591          * request into dm core, which will remake a clone request, clone
1592          * bios for it and resubmit it later.
1593          */
1594         if (error && blk_path_error(error)) {
1595                 struct multipath *m = ti->private;
1596
1597                 if (error == BLK_STS_RESOURCE)
1598                         r = DM_ENDIO_DELAY_REQUEUE;
1599                 else
1600                         r = DM_ENDIO_REQUEUE;
1601
1602                 if (pgpath)
1603                         fail_path(pgpath);
1604
1605                 if (atomic_read(&m->nr_valid_paths) == 0 &&
1606                     !must_push_back_rq(m)) {
1607                         if (error == BLK_STS_IOERR)
1608                                 dm_report_EIO(m);
1609                         /* complete with the original error */
1610                         r = DM_ENDIO_DONE;
1611                 }
1612         }
1613
1614         if (pgpath) {
1615                 struct path_selector *ps = &pgpath->pg->ps;
1616
1617                 if (ps->type->end_io)
1618                         ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
1619         }
1620
1621         return r;
1622 }
1623
1624 static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
1625                                 blk_status_t *error)
1626 {
1627         struct multipath *m = ti->private;
1628         struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
1629         struct pgpath *pgpath = mpio->pgpath;
1630         unsigned long flags;
1631         int r = DM_ENDIO_DONE;
1632
1633         if (!*error || !blk_path_error(*error))
1634                 goto done;
1635
1636         if (pgpath)
1637                 fail_path(pgpath);
1638
1639         if (atomic_read(&m->nr_valid_paths) == 0 &&
1640             !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
1641                 if (must_push_back_bio(m)) {
1642                         r = DM_ENDIO_REQUEUE;
1643                 } else {
1644                         dm_report_EIO(m);
1645                         *error = BLK_STS_IOERR;
1646                 }
1647                 goto done;
1648         }
1649
1650         spin_lock_irqsave(&m->lock, flags);
1651         bio_list_add(&m->queued_bios, clone);
1652         spin_unlock_irqrestore(&m->lock, flags);
1653         if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
1654                 queue_work(kmultipathd, &m->process_queued_bios);
1655
1656         r = DM_ENDIO_INCOMPLETE;
1657 done:
1658         if (pgpath) {
1659                 struct path_selector *ps = &pgpath->pg->ps;
1660
1661                 if (ps->type->end_io)
1662                         ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
1663         }
1664
1665         return r;
1666 }
1667
1668 /*
1669  * Suspend can't complete until all the I/O is processed, so if
1670  * the last path fails we must error any remaining I/O.
1671  * Note that if freeze_bdev fails while suspending, the
1672  * queue_if_no_path state is lost; userspace should reset it.
1673  */
1674 static void multipath_presuspend(struct dm_target *ti)
1675 {
1676         struct multipath *m = ti->private;
1677
1678         queue_if_no_path(m, false, true);
1679 }
1680
1681 static void multipath_postsuspend(struct dm_target *ti)
1682 {
1683         struct multipath *m = ti->private;
1684
1685         mutex_lock(&m->work_mutex);
1686         flush_multipath_work(m);
1687         mutex_unlock(&m->work_mutex);
1688 }
1689
1690 /*
1691  * Restore the queue_if_no_path setting.
1692  */
1693 static void multipath_resume(struct dm_target *ti)
1694 {
1695         struct multipath *m = ti->private;
1696         unsigned long flags;
1697
1698         spin_lock_irqsave(&m->lock, flags);
1699         assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags,
1700                    test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags));
1701         spin_unlock_irqrestore(&m->lock, flags);
1702 }
1703
1704 /*
1705  * Info output has the following format:
1706  * num_multipath_feature_args [multipath_feature_args]*
1707  * num_handler_status_args [handler_status_args]*
1708  * num_groups init_group_number
1709  *            [A|D|E num_ps_status_args [ps_status_args]*
1710  *             num_paths num_selector_args
1711  *             [path_dev A|F fail_count [selector_args]* ]+ ]+
1712  *
1713  * Table output has the following format (identical to the constructor string):
1714  * num_feature_args [features_args]*
1715  * num_handler_args hw_handler [hw_handler_args]*
1716  * num_groups init_group_number
1717  *     [priority selector-name num_ps_args [ps_args]*
1718  *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1719  */
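/*
 * Illustrative example only (the map name, sector count, path devices and
 * round-robin repeat counts below are assumptions, not taken from this
 * file): a two-group round-robin map and its matching status line might
 * look roughly like this (output wrapped here for readability):
 *
 *   # dmsetup table mpath0
 *   0 2097152 multipath 1 queue_if_no_path 0 2 1
 *       round-robin 0 2 1 8:16 1 8:32 1
 *       round-robin 0 1 1 8:48 1
 *
 *   # dmsetup status mpath0
 *   0 2097152 multipath 2 0 0 0 2 1
 *       A 0 2 0 8:16 A 0 8:32 A 0
 *       E 0 1 0 8:48 A 0
 */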
1720 static void multipath_status(struct dm_target *ti, status_type_t type,
1721                              unsigned status_flags, char *result, unsigned maxlen)
1722 {
1723         int sz = 0;
1724         unsigned long flags;
1725         struct multipath *m = ti->private;
1726         struct priority_group *pg;
1727         struct pgpath *p;
1728         unsigned pg_num;
1729         char state;
1730
1731         spin_lock_irqsave(&m->lock, flags);
1732
1733         /* Features */
1734         if (type == STATUSTYPE_INFO)
1735                 DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags),
1736                        atomic_read(&m->pg_init_count));
1737         else {
1738                 DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) +
1739                               (m->pg_init_retries > 0) * 2 +
1740                               (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
1741                               test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) +
1742                               (m->queue_mode != DM_TYPE_REQUEST_BASED) * 2);
1743
1744                 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
1745                         DMEMIT("queue_if_no_path ");
1746                 if (m->pg_init_retries)
1747                         DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1748                 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1749                         DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1750                 if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags))
1751                         DMEMIT("retain_attached_hw_handler ");
1752                 if (m->queue_mode != DM_TYPE_REQUEST_BASED) {
1753                         switch(m->queue_mode) {
1754                         case DM_TYPE_BIO_BASED:
1755                                 DMEMIT("queue_mode bio ");
1756                                 break;
1757                         default:
1758                                 WARN_ON_ONCE(true);
1759                                 break;
1760                         }
1761                 }
1762         }
1763
1764         if (!m->hw_handler_name || type == STATUSTYPE_INFO)
1765                 DMEMIT("0 ");
1766         else
1767                 DMEMIT("1 %s ", m->hw_handler_name);
1768
1769         DMEMIT("%u ", m->nr_priority_groups);
1770
1771         if (m->next_pg)
1772                 pg_num = m->next_pg->pg_num;
1773         else if (m->current_pg)
1774                 pg_num = m->current_pg->pg_num;
1775         else
1776                 pg_num = (m->nr_priority_groups ? 1 : 0);
1777
1778         DMEMIT("%u ", pg_num);
1779
1780         switch (type) {
1781         case STATUSTYPE_INFO:
1782                 list_for_each_entry(pg, &m->priority_groups, list) {
1783                         if (pg->bypassed)
1784                                 state = 'D';    /* Disabled */
1785                         else if (pg == m->current_pg)
1786                                 state = 'A';    /* Currently Active */
1787                         else
1788                                 state = 'E';    /* Enabled */
1789
1790                         DMEMIT("%c ", state);
1791
1792                         if (pg->ps.type->status)
1793                                 sz += pg->ps.type->status(&pg->ps, NULL, type,
1794                                                           result + sz,
1795                                                           maxlen - sz);
1796                         else
1797                                 DMEMIT("0 ");
1798
1799                         DMEMIT("%u %u ", pg->nr_pgpaths,
1800                                pg->ps.type->info_args);
1801
1802                         list_for_each_entry(p, &pg->pgpaths, list) {
1803                                 DMEMIT("%s %s %u ", p->path.dev->name,
1804                                        p->is_active ? "A" : "F",
1805                                        p->fail_count);
1806                                 if (pg->ps.type->status)
1807                                         sz += pg->ps.type->status(&pg->ps,
1808                                               &p->path, type, result + sz,
1809                                               maxlen - sz);
1810                         }
1811                 }
1812                 break;
1813
1814         case STATUSTYPE_TABLE:
1815                 list_for_each_entry(pg, &m->priority_groups, list) {
1816                         DMEMIT("%s ", pg->ps.type->name);
1817
1818                         if (pg->ps.type->status)
1819                                 sz += pg->ps.type->status(&pg->ps, NULL, type,
1820                                                           result + sz,
1821                                                           maxlen - sz);
1822                         else
1823                                 DMEMIT("0 ");
1824
1825                         DMEMIT("%u %u ", pg->nr_pgpaths,
1826                                pg->ps.type->table_args);
1827
1828                         list_for_each_entry(p, &pg->pgpaths, list) {
1829                                 DMEMIT("%s ", p->path.dev->name);
1830                                 if (pg->ps.type->status)
1831                                         sz += pg->ps.type->status(&pg->ps,
1832                                               &p->path, type, result + sz,
1833                                               maxlen - sz);
1834                         }
1835                 }
1836                 break;
1837         }
1838
1839         spin_unlock_irqrestore(&m->lock, flags);
1840 }
1841
1842 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
1843                              char *result, unsigned maxlen)
1844 {
1845         int r = -EINVAL;
1846         struct dm_dev *dev;
1847         struct multipath *m = ti->private;
1848         action_fn action;
1849         unsigned long flags;
1850
1851         mutex_lock(&m->work_mutex);
1852
1853         if (dm_suspended(ti)) {
1854                 r = -EBUSY;
1855                 goto out;
1856         }
1857
1858         if (argc == 1) {
1859                 if (!strcasecmp(argv[0], "queue_if_no_path")) {
1860                         r = queue_if_no_path(m, true, false);
1861                         spin_lock_irqsave(&m->lock, flags);
1862                         enable_nopath_timeout(m);
1863                         spin_unlock_irqrestore(&m->lock, flags);
1864                         goto out;
1865                 } else if (!strcasecmp(argv[0], "fail_if_no_path")) {
1866                         r = queue_if_no_path(m, false, false);
1867                         disable_nopath_timeout(m);
1868                         goto out;
1869                 }
1870         }
1871
1872         if (argc != 2) {
1873                 DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc);
1874                 goto out;
1875         }
1876
1877         if (!strcasecmp(argv[0], "disable_group")) {
1878                 r = bypass_pg_num(m, argv[1], true);
1879                 goto out;
1880         } else if (!strcasecmp(argv[0], "enable_group")) {
1881                 r = bypass_pg_num(m, argv[1], false);
1882                 goto out;
1883         } else if (!strcasecmp(argv[0], "switch_group")) {
1884                 r = switch_pg_num(m, argv[1]);
1885                 goto out;
1886         } else if (!strcasecmp(argv[0], "reinstate_path"))
1887                 action = reinstate_path;
1888         else if (!strcasecmp(argv[0], "fail_path"))
1889                 action = fail_path;
1890         else {
1891                 DMWARN("Unrecognised multipath message received: %s", argv[0]);
1892                 goto out;
1893         }
1894
1895         r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
1896         if (r) {
1897                 DMWARN("message: error getting device %s",
1898                        argv[1]);
1899                 goto out;
1900         }
1901
1902         r = action_dev(m, dev, action);
1903
1904         dm_put_device(ti, dev);
1905
1906 out:
1907         mutex_unlock(&m->work_mutex);
1908         return r;
1909 }
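/*
 * Illustrative usage from userspace (the map name "mpath0" and the path
 * device "8:32" are assumptions): the messages handled above are sent
 * with "dmsetup message <map> 0 <message>", for example
 *
 *   dmsetup message mpath0 0 queue_if_no_path
 *   dmsetup message mpath0 0 fail_if_no_path
 *   dmsetup message mpath0 0 disable_group 2
 *   dmsetup message mpath0 0 fail_path 8:32
 *   dmsetup message mpath0 0 reinstate_path 8:32
 */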
1910
1911 static int multipath_prepare_ioctl(struct dm_target *ti,
1912                                    struct block_device **bdev)
1913 {
1914         struct multipath *m = ti->private;
1915         struct pgpath *current_pgpath;
1916         int r;
1917
1918         current_pgpath = READ_ONCE(m->current_pgpath);
1919         if (!current_pgpath)
1920                 current_pgpath = choose_pgpath(m, 0);
1921
1922         if (current_pgpath) {
1923                 if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) {
1924                         *bdev = current_pgpath->path.dev->bdev;
1925                         r = 0;
1926                 } else {
1927                         /* pg_init has not started or completed */
1928                         r = -ENOTCONN;
1929                 }
1930         } else {
1931                 /* No path is available */
1932                 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
1933                         r = -ENOTCONN;
1934                 else
1935                         r = -EIO;
1936         }
1937
1938         if (r == -ENOTCONN) {
1939                 if (!READ_ONCE(m->current_pg)) {
1940                         /* Path status changed, redo selection */
1941                         (void) choose_pgpath(m, 0);
1942                 }
1943                 if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
1944                         pg_init_all_paths(m);
1945                 dm_table_run_md_queue_async(m->ti->table);
1946                 process_queued_io_list(m);
1947         }
1948
1949         /*
1950          * Only pass ioctls through if the device sizes match exactly.
1951          */
1952         if (!r && ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
1953                 return 1;
1954         return r;
1955 }
1956
1957 static int multipath_iterate_devices(struct dm_target *ti,
1958                                      iterate_devices_callout_fn fn, void *data)
1959 {
1960         struct multipath *m = ti->private;
1961         struct priority_group *pg;
1962         struct pgpath *p;
1963         int ret = 0;
1964
1965         list_for_each_entry(pg, &m->priority_groups, list) {
1966                 list_for_each_entry(p, &pg->pgpaths, list) {
1967                         ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
1968                         if (ret)
1969                                 goto out;
1970                 }
1971         }
1972
1973 out:
1974         return ret;
1975 }
1976
1977 static int pgpath_busy(struct pgpath *pgpath)
1978 {
1979         struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
1980
1981         return blk_lld_busy(q);
1982 }
1983
1984 /*
1985  * We return "busy" only when we can map I/Os but the underlying
1986  * devices are busy (so even if we mapped I/Os now, the I/Os would
1987  * wait on the underlying queue).
1988  * In other words, if we want to kill I/Os or queue them inside the
1989  * target because no mapping is available, we don't return "busy".
1990  * Otherwise, dm core won't give us the I/Os and we can't do what we want.
1991  */
1992 static int multipath_busy(struct dm_target *ti)
1993 {
1994         bool busy = false, has_active = false;
1995         struct multipath *m = ti->private;
1996         struct priority_group *pg, *next_pg;
1997         struct pgpath *pgpath;
1998
1999         /* pg_init in progress */
2000         if (atomic_read(&m->pg_init_in_progress))
2001                 return true;
2002
2003         /* no paths available, for blk-mq: rely on IO mapping to delay requeue */
2004         if (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
2005                 return (m->queue_mode != DM_TYPE_REQUEST_BASED);
2006
2007         /* Guess which priority_group will be used at next mapping time */
2008         pg = READ_ONCE(m->current_pg);
2009         next_pg = READ_ONCE(m->next_pg);
2010         if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg))
2011                 pg = next_pg;
2012
2013         if (!pg) {
2014                 /*
2015                  * We don't know which pg will be used at the next mapping time.
2016                  * We don't call choose_pgpath() here to avoid triggering
2017                  * pg_init just because of this busy check.
2018                  * So we don't know whether the underlying devices we will use
2019                  * at the next mapping time are busy or not. Just try mapping.
2020                  */
2021                 return busy;
2022         }
2023
2024         /*
2025          * If there is at least one non-busy active path, the path selector
2026          * will be able to select it, so we consider such a pg as not busy.
2027          */
2028         busy = true;
2029         list_for_each_entry(pgpath, &pg->pgpaths, list) {
2030                 if (pgpath->is_active) {
2031                         has_active = true;
2032                         if (!pgpath_busy(pgpath)) {
2033                                 busy = false;
2034                                 break;
2035                         }
2036                 }
2037         }
2038
2039         if (!has_active) {
2040                 /*
2041                  * No active path in this pg, so this pg won't be used and
2042                  * the current_pg will be changed at the next mapping time.
2043                  * We need to try mapping to determine the new pg.
2044                  */
2045                 busy = false;
2046         }
2047
2048         return busy;
2049 }
2050
2051 /*-----------------------------------------------------------------
2052  * Module setup
2053  *---------------------------------------------------------------*/
2054 static struct target_type multipath_target = {
2055         .name = "multipath",
2056         .version = {1, 14, 0},
2057         .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE |
2058                     DM_TARGET_PASSES_INTEGRITY,
2059         .module = THIS_MODULE,
2060         .ctr = multipath_ctr,
2061         .dtr = multipath_dtr,
2062         .clone_and_map_rq = multipath_clone_and_map,
2063         .release_clone_rq = multipath_release_clone,
2064         .rq_end_io = multipath_end_io,
2065         .map = multipath_map_bio,
2066         .end_io = multipath_end_io_bio,
2067         .presuspend = multipath_presuspend,
2068         .postsuspend = multipath_postsuspend,
2069         .resume = multipath_resume,
2070         .status = multipath_status,
2071         .message = multipath_message,
2072         .prepare_ioctl = multipath_prepare_ioctl,
2073         .iterate_devices = multipath_iterate_devices,
2074         .busy = multipath_busy,
2075 };
2076
2077 static int __init dm_multipath_init(void)
2078 {
2079         int r;
2080
2081         kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
2082         if (!kmultipathd) {
2083                 DMERR("failed to create workqueue kmpathd");
2084                 r = -ENOMEM;
2085                 goto bad_alloc_kmultipathd;
2086         }
2087
2088         /*
2089          * A separate workqueue is used to handle the device handlers
2090          * to avoid overloading the existing workqueue. Overloading the
2091          * old workqueue would also create a bottleneck in the path of
2092          * storage hardware device activation.
2093          */
2094         kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
2095                                                   WQ_MEM_RECLAIM);
2096         if (!kmpath_handlerd) {
2097                 DMERR("failed to create workqueue kmpath_handlerd");
2098                 r = -ENOMEM;
2099                 goto bad_alloc_kmpath_handlerd;
2100         }
2101
2102         r = dm_register_target(&multipath_target);
2103         if (r < 0) {
2104                 DMERR("request-based register failed %d", r);
2105                 r = -EINVAL;
2106                 goto bad_register_target;
2107         }
2108
2109         return 0;
2110
2111 bad_register_target:
2112         destroy_workqueue(kmpath_handlerd);
2113 bad_alloc_kmpath_handlerd:
2114         destroy_workqueue(kmultipathd);
2115 bad_alloc_kmultipathd:
2116         return r;
2117 }
2118
2119 static void __exit dm_multipath_exit(void)
2120 {
2121         destroy_workqueue(kmpath_handlerd);
2122         destroy_workqueue(kmultipathd);
2123
2124         dm_unregister_target(&multipath_target);
2125 }
2126
2127 module_init(dm_multipath_init);
2128 module_exit(dm_multipath_exit);
2129
2130 module_param_named(queue_if_no_path_timeout_secs,
2131                    queue_if_no_path_timeout_secs, ulong, S_IRUGO | S_IWUSR);
2132 MODULE_PARM_DESC(queue_if_no_path_timeout_secs, "Timeout in seconds for queueing IO when no paths are available");
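/*
 * Illustrative only: since the parameter above is declared with
 * S_IRUGO | S_IWUSR, it can be read and changed at runtime through
 * sysfs (the module sysfs path below is the usual location, assumed
 * here rather than taken from this file):
 *
 *   echo 120 > /sys/module/dm_multipath/parameters/queue_if_no_path_timeout_secs
 */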
2133
2134 MODULE_DESCRIPTION(DM_NAME " multipath target");
2135 MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
2136 MODULE_LICENSE("GPL");