drivers/md/dm.c

   1 /*
   2  * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   3  * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4  *
   5  * This file is released under the GPL.
   6  */
   7
   8 #include "dm-core.h"
   9 #include "dm-rq.h"
  10 #include "dm-uevent.h"
  11
  12 #include <linux/init.h>
  13 #include <linux/module.h>
  14 #include <linux/mutex.h>
  15 #include <linux/sched/signal.h>
  16 #include <linux/blkpg.h>
  17 #include <linux/bio.h>
  18 #include <linux/mempool.h>
  19 #include <linux/dax.h>
  20 #include <linux/slab.h>
  21 #include <linux/idr.h>
  22 #include <linux/uio.h>
  23 #include <linux/hdreg.h>
  24 #include <linux/delay.h>
  25 #include <linux/wait.h>
  26 #include <linux/pr.h>
  27 #include <linux/refcount.h>
  28 #include <linux/part_stat.h>
  29 #include <linux/blk-crypto.h>
  30
  31 #define DM_MSG_PREFIX "core"
  32
  33 /*
  34  * Cookies are numeric values sent with CHANGE and REMOVE
  35  * uevents while resuming, removing or renaming the device.
  36  */
  37 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
  38 #define DM_COOKIE_LENGTH 24
  39
/* Name passed to register_blkdev()/unregister_blkdev(). */
static const char *_name = DM_NAME;

/*
 * local_init() copies 'major' into '_major'; when that value is zero,
 * '_major' is replaced by the number register_blkdev() returns.
 */
static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_IDR(_minor_idr);

/* Taken by dm_blk_open()/dm_blk_close() and the deletion helpers. */
static DEFINE_SPINLOCK(_minor_lock);

static void do_deferred_remove(struct work_struct *w);

/* Queued by dm_blk_close() on deferred_remove_workqueue. */
static DECLARE_WORK(deferred_remove_work, do_deferred_remove);

/* Allocated in local_init(), destroyed in local_exit(). */
static struct workqueue_struct *deferred_remove_workqueue;

/* Counter and waitqueue driven by dm_issue_global_event(). */
atomic_t dm_global_event_nr = ATOMIC_INIT(0);
DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
  57
/*
 * Bump the global event counter, then wake anyone sleeping on
 * dm_global_eventq.
 */
void dm_issue_global_event(void)
{
	atomic_inc(&dm_global_event_nr);
	wake_up(&dm_global_eventq);
}
  63
/*
 * One of these is allocated (on-stack) per original bio.
 */
struct clone_info {
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;	/* alloc_tio() uses io->tio first, then io->md->bs */
	sector_t sector;
	unsigned sector_count;
};
  74
/*
 * One of these is allocated per clone bio.
 */
#define DM_TIO_MAGIC 7282014
struct dm_target_io {
	unsigned magic;		/* set to DM_TIO_MAGIC by alloc_tio() */
	struct dm_io *io;	/* owning dm_io; NULL while an embedded tio is unclaimed */
	struct dm_target *ti;
	unsigned target_bio_nr;	/* value returned by dm_bio_get_target_bio_nr() */
	unsigned *len_ptr;
	bool inside_dm_io;	/* true for the tio embedded in struct dm_io */
	struct bio clone;	/* last member; code recovers the tio via container_of() */
};
  88
/*
 * One of these is allocated per original bio.
 * It contains the first clone used for that original.
 */
#define DM_IO_MAGIC 5191977
struct dm_io {
	unsigned magic;			/* set to DM_IO_MAGIC by alloc_io() */
	struct mapped_device *md;
	blk_status_t status;		/* updated under endio_lock in dec_pending() */
	atomic_t io_count;		/* starts at 1; dec_pending() completes the io at 0 */
	struct bio *orig_bio;
	unsigned long start_time;	/* value returned by bio_start_io_acct() */
	spinlock_t endio_lock;
	struct dm_stats_aux stats_aux;
	/* last member of dm_target_io is 'struct bio' */
	struct dm_target_io tio;
};
 106
 107 void *dm_per_bio_data(struct bio *bio, size_t data_size)
 108 {
 109         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 110         if (!tio->inside_dm_io)
 111                 return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
 112         return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size;
 113 }
 114 EXPORT_SYMBOL_GPL(dm_per_bio_data);
 115
 116 struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
 117 {
 118         struct dm_io *io = (struct dm_io *)((char *)data + data_size);
 119         if (io->magic == DM_IO_MAGIC)
 120                 return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone));
 121         BUG_ON(io->magic != DM_TIO_MAGIC);
 122         return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone));
 123 }
 124 EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
 125
 126 unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
 127 {
 128         return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
 129 }
 130 EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
 131
/* Sentinel pointer; presumably marks a reserved slot in _minor_idr -- TODO confirm. */
#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3		/* dm_blk_open() refuses opens while set */
#define DMF_DELETING 4		/* set by dm_lock_for_deletion() */
#define DMF_NOFLUSH_SUSPENDING 5	/* tested by __noflush_suspending() */
#define DMF_DEFERRED_REMOVE 6	/* dm_blk_close() queues removal on last close */
#define DMF_SUSPENDED_INTERNALLY 7

/* Initial value of dm_numa_node; also the lower clamp in dm_get_numa_node(). */
#define DM_NUMA_NODE NUMA_NO_NODE
static int dm_numa_node = DM_NUMA_NODE;
 148
/*
 * For mempools pre-allocation at the table loading time.
 * NOTE(review): 'bs' and 'io_bs' presumably back md->bs and md->io_bs,
 * which alloc_tio() and alloc_io() allocate from -- confirm.
 */
struct dm_md_mempools {
	struct bio_set bs;
	struct bio_set io_bs;
};
 156
/*
 * An underlying device opened on behalf of a mapped device's table.
 */
struct table_device {
	struct list_head list;	/* entry in md->table_devices */
	refcount_t count;	/* users obtained via dm_get_table_device() */
	struct dm_dev dm_dev;	/* handle returned to callers */
};
 162
/*
 * Bio-based DM's mempools' reserved IOs set by the user.
 * Read through dm_get_reserved_bio_based_ios(), which substitutes the
 * default for zero and caps the value at DM_RESERVED_MAX_IOS.
 */
#define RESERVED_BIO_BASED_IOS          16
static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 168
 169 static int __dm_get_module_param_int(int *module_param, int min, int max)
 170 {
 171         int param = READ_ONCE(*module_param);
 172         int modified_param = 0;
 173         bool modified = true;
 174
 175         if (param < min)
 176                 modified_param = min;
 177         else if (param > max)
 178                 modified_param = max;
 179         else
 180                 modified = false;
 181
 182         if (modified) {
 183                 (void)cmpxchg(module_param, param, modified_param);
 184                 param = modified_param;
 185         }
 186
 187         return param;
 188 }
 189
 190 unsigned __dm_get_module_param(unsigned *module_param,
 191                                unsigned def, unsigned max)
 192 {
 193         unsigned param = READ_ONCE(*module_param);
 194         unsigned modified_param = 0;
 195
 196         if (!param)
 197                 modified_param = def;
 198         else if (param > max)
 199                 modified_param = max;
 200
 201         if (modified_param) {
 202                 (void)cmpxchg(module_param, param, modified_param);
 203                 param = modified_param;
 204         }
 205
 206         return param;
 207 }
 208
 209 unsigned dm_get_reserved_bio_based_ios(void)
 210 {
 211         return __dm_get_module_param(&reserved_bio_based_ios,
 212                                      RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
 213 }
 214 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 215
 216 static unsigned dm_get_numa_node(void)
 217 {
 218         return __dm_get_module_param_int(&dm_numa_node,
 219                                          DM_NUMA_NODE, num_online_nodes() - 1);
 220 }
 221
 222 static int __init local_init(void)
 223 {
 224         int r;
 225
 226         r = dm_uevent_init();
 227         if (r)
 228                 return r;
 229
 230         deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 231         if (!deferred_remove_workqueue) {
 232                 r = -ENOMEM;
 233                 goto out_uevent_exit;
 234         }
 235
 236         _major = major;
 237         r = register_blkdev(_major, _name);
 238         if (r < 0)
 239                 goto out_free_workqueue;
 240
 241         if (!_major)
 242                 _major = r;
 243
 244         return 0;
 245
 246 out_free_workqueue:
 247         destroy_workqueue(deferred_remove_workqueue);
 248 out_uevent_exit:
 249         dm_uevent_exit();
 250
 251         return r;
 252 }
 253
/*
 * Undo local_init(): flush and destroy the deferred-remove workqueue,
 * unregister the block major and shut down uevents.
 */
static void local_exit(void)
{
	flush_scheduled_work();
	destroy_workqueue(deferred_remove_workqueue);

	unregister_blkdev(_major, _name);
	dm_uevent_exit();

	_major = 0;

	DMINFO("cleaned up");
}
 266
/*
 * Initialisers run in order by dm_init().  _exits[i] is the teardown
 * counterpart of _inits[i], so the two tables must stay index-aligned.
 */
static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_io_init,
	dm_kcopyd_init,
	dm_interface_init,
	dm_statistics_init,
};

static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_io_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
	dm_statistics_exit,
};
 288
 289 static int __init dm_init(void)
 290 {
 291         const int count = ARRAY_SIZE(_inits);
 292
 293         int r, i;
 294
 295         for (i = 0; i < count; i++) {
 296                 r = _inits[i]();
 297                 if (r)
 298                         goto bad;
 299         }
 300
 301         return 0;
 302
 303       bad:
 304         while (i--)
 305                 _exits[i]();
 306
 307         return r;
 308 }
 309
 310 static void __exit dm_exit(void)
 311 {
 312         int i = ARRAY_SIZE(_exits);
 313
 314         while (i--)
 315                 _exits[i]();
 316
 317         /*
 318          * Should be empty by this point.
 319          */
 320         idr_destroy(&_minor_idr);
 321 }
 322
 323 /*
 324  * Block device functions
 325  */
 326 int dm_deleting_md(struct mapped_device *md)
 327 {
 328         return test_bit(DMF_DELETING, &md->flags);
 329 }
 330
 331 static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 332 {
 333         struct mapped_device *md;
 334
 335         spin_lock(&_minor_lock);
 336
 337         md = bdev->bd_disk->private_data;
 338         if (!md)
 339                 goto out;
 340
 341         if (test_bit(DMF_FREEING, &md->flags) ||
 342             dm_deleting_md(md)) {
 343                 md = NULL;
 344                 goto out;
 345         }
 346
 347         dm_get(md);
 348         atomic_inc(&md->open_count);
 349 out:
 350         spin_unlock(&_minor_lock);
 351
 352         return md ? 0 : -ENXIO;
 353 }
 354
 355 static void dm_blk_close(struct gendisk *disk, fmode_t mode)
 356 {
 357         struct mapped_device *md;
 358
 359         spin_lock(&_minor_lock);
 360
 361         md = disk->private_data;
 362         if (WARN_ON(!md))
 363                 goto out;
 364
 365         if (atomic_dec_and_test(&md->open_count) &&
 366             (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
 367                 queue_work(deferred_remove_workqueue, &deferred_remove_work);
 368
 369         dm_put(md);
 370 out:
 371         spin_unlock(&_minor_lock);
 372 }
 373
 374 int dm_open_count(struct mapped_device *md)
 375 {
 376         return atomic_read(&md->open_count);
 377 }
 378
 379 /*
 380  * Guarantees nothing is using the device before it's deleted.
 381  */
 382 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
 383 {
 384         int r = 0;
 385
 386         spin_lock(&_minor_lock);
 387
 388         if (dm_open_count(md)) {
 389                 r = -EBUSY;
 390                 if (mark_deferred)
 391                         set_bit(DMF_DEFERRED_REMOVE, &md->flags);
 392         } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
 393                 r = -EEXIST;
 394         else
 395                 set_bit(DMF_DELETING, &md->flags);
 396
 397         spin_unlock(&_minor_lock);
 398
 399         return r;
 400 }
 401
 402 int dm_cancel_deferred_remove(struct mapped_device *md)
 403 {
 404         int r = 0;
 405
 406         spin_lock(&_minor_lock);
 407
 408         if (test_bit(DMF_DELETING, &md->flags))
 409                 r = -EBUSY;
 410         else
 411                 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
 412
 413         spin_unlock(&_minor_lock);
 414
 415         return r;
 416 }
 417
/* Work handler for deferred_remove_work; @w is unused. */
static void do_deferred_remove(struct work_struct *w)
{
	dm_deferred_remove();
}
 422
 423 sector_t dm_get_size(struct mapped_device *md)
 424 {
 425         return get_capacity(md->disk);
 426 }
 427
 428 struct request_queue *dm_get_md_queue(struct mapped_device *md)
 429 {
 430         return md->queue;
 431 }
 432
 433 struct dm_stats *dm_get_stats(struct mapped_device *md)
 434 {
 435         return &md->stats;
 436 }
 437
 438 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 439 {
 440         struct mapped_device *md = bdev->bd_disk->private_data;
 441
 442         return dm_get_geometry(md, geo);
 443 }
 444
 445 #ifdef CONFIG_BLK_DEV_ZONED
 446 int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data)
 447 {
 448         struct dm_report_zones_args *args = data;
 449         sector_t sector_diff = args->tgt->begin - args->start;
 450
 451         /*
 452          * Ignore zones beyond the target range.
 453          */
 454         if (zone->start >= args->start + args->tgt->len)
 455                 return 0;
 456
 457         /*
 458          * Remap the start sector and write pointer position of the zone
 459          * to match its position in the target range.
 460          */
 461         zone->start += sector_diff;
 462         if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
 463                 if (zone->cond == BLK_ZONE_COND_FULL)
 464                         zone->wp = zone->start + zone->len;
 465                 else if (zone->cond == BLK_ZONE_COND_EMPTY)
 466                         zone->wp = zone->start;
 467                 else
 468                         zone->wp += sector_diff;
 469         }
 470
 471         args->next_sector = zone->start + zone->len;
 472         return args->orig_cb(zone, args->zone_idx++, args->orig_data);
 473 }
 474 EXPORT_SYMBOL_GPL(dm_report_zones_cb);
 475
 476 static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
 477                 unsigned int nr_zones, report_zones_cb cb, void *data)
 478 {
 479         struct mapped_device *md = disk->private_data;
 480         struct dm_table *map;
 481         int srcu_idx, ret;
 482         struct dm_report_zones_args args = {
 483                 .next_sector = sector,
 484                 .orig_data = data,
 485                 .orig_cb = cb,
 486         };
 487
 488         if (dm_suspended_md(md))
 489                 return -EAGAIN;
 490
 491         map = dm_get_live_table(md, &srcu_idx);
 492         if (!map)
 493                 return -EIO;
 494
 495         do {
 496                 struct dm_target *tgt;
 497
 498                 tgt = dm_table_find_target(map, args.next_sector);
 499                 if (WARN_ON_ONCE(!tgt->type->report_zones)) {
 500                         ret = -EIO;
 501                         goto out;
 502                 }
 503
 504                 args.tgt = tgt;
 505                 ret = tgt->type->report_zones(tgt, &args, nr_zones);
 506                 if (ret < 0)
 507                         goto out;
 508         } while (args.zone_idx < nr_zones &&
 509                  args.next_sector < get_capacity(disk));
 510
 511         ret = args.zone_idx;
 512 out:
 513         dm_put_live_table(md, srcu_idx);
 514         return ret;
 515 }
 516 #else
 517 #define dm_blk_report_zones             NULL
 518 #endif /* CONFIG_BLK_DEV_ZONED */
 519
 520 static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
 521                             struct block_device **bdev)
 522         __acquires(md->io_barrier)
 523 {
 524         struct dm_target *tgt;
 525         struct dm_table *map;
 526         int r;
 527
 528 retry:
 529         r = -ENOTTY;
 530         map = dm_get_live_table(md, srcu_idx);
 531         if (!map || !dm_table_get_size(map))
 532                 return r;
 533
 534         /* We only support devices that have a single target */
 535         if (dm_table_get_num_targets(map) != 1)
 536                 return r;
 537
 538         tgt = dm_table_get_target(map, 0);
 539         if (!tgt->type->prepare_ioctl)
 540                 return r;
 541
 542         if (dm_suspended_md(md))
 543                 return -EAGAIN;
 544
 545         r = tgt->type->prepare_ioctl(tgt, bdev);
 546         if (r == -ENOTCONN && !fatal_signal_pending(current)) {
 547                 dm_put_live_table(md, *srcu_idx);
 548                 msleep(10);
 549                 goto retry;
 550         }
 551
 552         return r;
 553 }
 554
/* Release the live table reference taken by dm_prepare_ioctl(). */
static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
	__releases(md->io_barrier)
{
	dm_put_live_table(md, srcu_idx);
}
 560
 561 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 562                         unsigned int cmd, unsigned long arg)
 563 {
 564         struct mapped_device *md = bdev->bd_disk->private_data;
 565         int r, srcu_idx;
 566
 567         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
 568         if (r < 0)
 569                 goto out;
 570
 571         if (r > 0) {
 572                 /*
 573                  * Target determined this ioctl is being issued against a
 574                  * subset of the parent bdev; require extra privileges.
 575                  */
 576                 if (!capable(CAP_SYS_RAWIO)) {
 577                         DMWARN_LIMIT(
 578         "%s: sending ioctl %x to DM device without required privilege.",
 579                                 current->comm, cmd);
 580                         r = -ENOIOCTLCMD;
 581                         goto out;
 582                 }
 583         }
 584
 585         r =  __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 586 out:
 587         dm_unprepare_ioctl(md, srcu_idx);
 588         return r;
 589 }
 590
 591 static void start_io_acct(struct dm_io *io);
 592
 593 static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 594 {
 595         struct dm_io *io;
 596         struct dm_target_io *tio;
 597         struct bio *clone;
 598
 599         clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
 600         if (!clone)
 601                 return NULL;
 602
 603         tio = container_of(clone, struct dm_target_io, clone);
 604         tio->inside_dm_io = true;
 605         tio->io = NULL;
 606
 607         io = container_of(tio, struct dm_io, tio);
 608         io->magic = DM_IO_MAGIC;
 609         io->status = 0;
 610         atomic_set(&io->io_count, 1);
 611         io->orig_bio = bio;
 612         io->md = md;
 613         spin_lock_init(&io->endio_lock);
 614
 615         start_io_acct(io);
 616
 617         return io;
 618 }
 619
/*
 * Release a dm_io by dropping the reference on the bio it is embedded
 * around (see alloc_io()).  @md is unused.
 */
static void free_io(struct mapped_device *md, struct dm_io *io)
{
	bio_put(&io->tio.clone);
}
 624
 625 static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
 626                                       unsigned target_bio_nr, gfp_t gfp_mask)
 627 {
 628         struct dm_target_io *tio;
 629
 630         if (!ci->io->tio.io) {
 631                 /* the dm_target_io embedded in ci->io is available */
 632                 tio = &ci->io->tio;
 633         } else {
 634                 struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
 635                 if (!clone)
 636                         return NULL;
 637
 638                 tio = container_of(clone, struct dm_target_io, clone);
 639                 tio->inside_dm_io = false;
 640         }
 641
 642         tio->magic = DM_TIO_MAGIC;
 643         tio->io = ci->io;
 644         tio->ti = ti;
 645         tio->target_bio_nr = target_bio_nr;
 646
 647         return tio;
 648 }
 649
 650 static void free_tio(struct dm_target_io *tio)
 651 {
 652         if (tio->inside_dm_io)
 653                 return;
 654         bio_put(&tio->clone);
 655 }
 656
 657 static bool md_in_flight_bios(struct mapped_device *md)
 658 {
 659         int cpu;
 660         struct hd_struct *part = &dm_disk(md)->part0;
 661         long sum = 0;
 662
 663         for_each_possible_cpu(cpu) {
 664                 sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
 665                 sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
 666         }
 667
 668         return sum != 0;
 669 }
 670
 671 static bool md_in_flight(struct mapped_device *md)
 672 {
 673         if (queue_is_mq(md->queue))
 674                 return blk_mq_queue_inflight(md->queue);
 675         else
 676                 return md_in_flight_bios(md);
 677 }
 678
 679 u64 dm_start_time_ns_from_clone(struct bio *bio)
 680 {
 681         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 682         struct dm_io *io = tio->io;
 683
 684         return jiffies_to_nsecs(io->start_time);
 685 }
 686 EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
 687
 688 static void start_io_acct(struct dm_io *io)
 689 {
 690         struct mapped_device *md = io->md;
 691         struct bio *bio = io->orig_bio;
 692
 693         io->start_time = bio_start_io_acct(bio);
 694         if (unlikely(dm_stats_used(&md->stats)))
 695                 dm_stats_account_io(&md->stats, bio_data_dir(bio),
 696                                     bio->bi_iter.bi_sector, bio_sectors(bio),
 697                                     false, 0, &io->stats_aux);
 698 }
 699
 700 static void end_io_acct(struct dm_io *io)
 701 {
 702         struct mapped_device *md = io->md;
 703         struct bio *bio = io->orig_bio;
 704         unsigned long duration = jiffies - io->start_time;
 705
 706         bio_end_io_acct(bio, io->start_time);
 707
 708         if (unlikely(dm_stats_used(&md->stats)))
 709                 dm_stats_account_io(&md->stats, bio_data_dir(bio),
 710                                     bio->bi_iter.bi_sector, bio_sectors(bio),
 711                                     true, duration, &io->stats_aux);
 712
 713         /* nudge anyone waiting on suspend queue */
 714         if (unlikely(wq_has_sleeper(&md->wait)))
 715                 wake_up(&md->wait);
 716 }
 717
 718 /*
 719  * Add the bio to the list of deferred io.
 720  */
 721 static void queue_io(struct mapped_device *md, struct bio *bio)
 722 {
 723         unsigned long flags;
 724
 725         spin_lock_irqsave(&md->deferred_lock, flags);
 726         bio_list_add(&md->deferred, bio);
 727         spin_unlock_irqrestore(&md->deferred_lock, flags);
 728         queue_work(md->wq, &md->work);
 729 }
 730
 731 /*
 732  * Everyone (including functions in this file), should use this
 733  * function to access the md->map field, and make sure they call
 734  * dm_put_live_table() when finished.
 735  */
 736 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
 737 {
 738         *srcu_idx = srcu_read_lock(&md->io_barrier);
 739
 740         return srcu_dereference(md->map, &md->io_barrier);
 741 }
 742
/*
 * Leave the SRCU read-side section entered by dm_get_live_table();
 * @srcu_idx is the index that call stored.
 */
void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
{
	srcu_read_unlock(&md->io_barrier, srcu_idx);
}
 747
/*
 * Wait for both kinds of live-table readers: SRCU readers on
 * md->io_barrier (dm_get_live_table) and plain RCU readers
 * (dm_get_live_table_fast).
 */
void dm_sync_table(struct mapped_device *md)
{
	synchronize_srcu(&md->io_barrier);
	synchronize_rcu_expedited();
}
 753
 754 /*
 755  * A fast alternative to dm_get_live_table/dm_put_live_table.
 756  * The caller must not block between these two functions.
 757  */
 758 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
 759 {
 760         rcu_read_lock();
 761         return rcu_dereference(md->map);
 762 }
 763
/* Drop the RCU read lock taken by dm_get_live_table_fast(); @md is unused. */
static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
{
	rcu_read_unlock();
}
 768
/* Holder cookie passed to blkdev_get_by_dev() for FMODE_EXCL opens. */
static char *_dm_claim_ptr = "I belong to device-mapper";
 770
 771 /*
 772  * Open a table device so we can use it as a map destination.
 773  */
 774 static int open_table_device(struct table_device *td, dev_t dev,
 775                              struct mapped_device *md)
 776 {
 777         struct block_device *bdev;
 778
 779         int r;
 780
 781         BUG_ON(td->dm_dev.bdev);
 782
 783         bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
 784         if (IS_ERR(bdev))
 785                 return PTR_ERR(bdev);
 786
 787         r = bd_link_disk_holder(bdev, dm_disk(md));
 788         if (r) {
 789                 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
 790                 return r;
 791         }
 792
 793         td->dm_dev.bdev = bdev;
 794         td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 795         return 0;
 796 }
 797
 798 /*
 799  * Close a table device that we've been using.
 800  */
 801 static void close_table_device(struct table_device *td, struct mapped_device *md)
 802 {
 803         if (!td->dm_dev.bdev)
 804                 return;
 805
 806         bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
 807         blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
 808         put_dax(td->dm_dev.dax_dev);
 809         td->dm_dev.bdev = NULL;
 810         td->dm_dev.dax_dev = NULL;
 811 }
 812
 813 static struct table_device *find_table_device(struct list_head *l, dev_t dev,
 814                                               fmode_t mode)
 815 {
 816         struct table_device *td;
 817
 818         list_for_each_entry(td, l, list)
 819                 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
 820                         return td;
 821
 822         return NULL;
 823 }
 824
 825 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
 826                         struct dm_dev **result)
 827 {
 828         int r;
 829         struct table_device *td;
 830
 831         mutex_lock(&md->table_devices_lock);
 832         td = find_table_device(&md->table_devices, dev, mode);
 833         if (!td) {
 834                 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
 835                 if (!td) {
 836                         mutex_unlock(&md->table_devices_lock);
 837                         return -ENOMEM;
 838                 }
 839
 840                 td->dm_dev.mode = mode;
 841                 td->dm_dev.bdev = NULL;
 842
 843                 if ((r = open_table_device(td, dev, md))) {
 844                         mutex_unlock(&md->table_devices_lock);
 845                         kfree(td);
 846                         return r;
 847                 }
 848
 849                 format_dev_t(td->dm_dev.name, dev);
 850
 851                 refcount_set(&td->count, 1);
 852                 list_add(&td->list, &md->table_devices);
 853         } else {
 854                 refcount_inc(&td->count);
 855         }
 856         mutex_unlock(&md->table_devices_lock);
 857
 858         *result = &td->dm_dev;
 859         return 0;
 860 }
 861 EXPORT_SYMBOL_GPL(dm_get_table_device);
 862
 863 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
 864 {
 865         struct table_device *td = container_of(d, struct table_device, dm_dev);
 866
 867         mutex_lock(&md->table_devices_lock);
 868         if (refcount_dec_and_test(&td->count)) {
 869                 close_table_device(td, md);
 870                 list_del(&td->list);
 871                 kfree(td);
 872         }
 873         mutex_unlock(&md->table_devices_lock);
 874 }
 875 EXPORT_SYMBOL(dm_put_table_device);
 876
 877 static void free_table_devices(struct list_head *devices)
 878 {
 879         struct list_head *tmp, *next;
 880
 881         list_for_each_safe(tmp, next, devices) {
 882                 struct table_device *td = list_entry(tmp, struct table_device, list);
 883
 884                 DMWARN("dm_destroy: %s still exists with %d references",
 885                        td->dm_dev.name, refcount_read(&td->count));
 886                 kfree(td);
 887         }
 888 }
 889
/*
 * Get the geometry associated with a dm device.
 * Copies md->geometry into *@geo and always returns 0.
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}
 899
 900 /*
 901  * Set the geometry of a device.
 902  */
 903 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 904 {
 905         sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 906
 907         if (geo->start > sz) {
 908                 DMWARN("Start sector is beyond the geometry limits.");
 909                 return -EINVAL;
 910         }
 911
 912         md->geometry = *geo;
 913
 914         return 0;
 915 }
 916
 917 static int __noflush_suspending(struct mapped_device *md)
 918 {
 919         return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 920 }
 921
 922 /*
 923  * Decrements the number of outstanding ios that a bio has been
 924  * cloned into, completing the original io if necc.
 925  */
 926 static void dec_pending(struct dm_io *io, blk_status_t error)
 927 {
 928         unsigned long flags;
 929         blk_status_t io_error;
 930         struct bio *bio;
 931         struct mapped_device *md = io->md;
 932
 933         /* Push-back supersedes any I/O errors */
 934         if (unlikely(error)) {
 935                 spin_lock_irqsave(&io->endio_lock, flags);
 936                 if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
 937                         io->status = error;
 938                 spin_unlock_irqrestore(&io->endio_lock, flags);
 939         }
 940
 941         if (atomic_dec_and_test(&io->io_count)) {
 942                 if (io->status == BLK_STS_DM_REQUEUE) {
 943                         /*
 944                          * Target requested pushing back the I/O.
 945                          */
 946                         spin_lock_irqsave(&md->deferred_lock, flags);
 947                         if (__noflush_suspending(md))
 948                                 /* NOTE early return due to BLK_STS_DM_REQUEUE below */
 949                                 bio_list_add_head(&md->deferred, io->orig_bio);
 950                         else
 951                                 /* noflush suspend was interrupted. */
 952                                 io->status = BLK_STS_IOERR;
 953                         spin_unlock_irqrestore(&md->deferred_lock, flags);
 954                 }
 955
 956                 io_error = io->status;
 957                 bio = io->orig_bio;
 958                 end_io_acct(io);
 959                 free_io(md, io);
 960
 961                 if (io_error == BLK_STS_DM_REQUEUE)
 962                         return;
 963
 964                 if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
 965                         /*
 966                          * Preflush done for flush with data, reissue
 967                          * without REQ_PREFLUSH.
 968                          */
 969                         bio->bi_opf &= ~REQ_PREFLUSH;
 970                         queue_io(md, bio);
 971                 } else {
 972                         /* done with normal IO or empty flush */
 973                         if (io_error)
 974                                 bio->bi_status = io_error;
 975                         bio_endio(bio);
 976                 }
 977         }
 978 }
 979
 980 void disable_discard(struct mapped_device *md)
 981 {
 982         struct queue_limits *limits = dm_get_queue_limits(md);
 983
 984         /* device doesn't really support DISCARD, disable it */
 985         limits->max_discard_sectors = 0;
 986         blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
 987 }
 988
 989 void disable_write_same(struct mapped_device *md)
 990 {
 991         struct queue_limits *limits = dm_get_queue_limits(md);
 992
 993         /* device doesn't really support WRITE SAME, disable it */
 994         limits->max_write_same_sectors = 0;
 995 }
 996
 997 void disable_write_zeroes(struct mapped_device *md)
 998 {
 999         struct queue_limits *limits = dm_get_queue_limits(md);
1000
1001         /* device doesn't really support WRITE ZEROES, disable it */
1002         limits->max_write_zeroes_sectors = 0;
1003 }
1004
1005 static void clone_endio(struct bio *bio)
1006 {
1007         blk_status_t error = bio->bi_status;
1008         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1009         struct dm_io *io = tio->io;
1010         struct mapped_device *md = tio->io->md;
1011         dm_endio_fn endio = tio->ti->type->end_io;
1012
1013         if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
1014                 if (bio_op(bio) == REQ_OP_DISCARD &&
1015                     !bio->bi_disk->queue->limits.max_discard_sectors)
1016                         disable_discard(md);
1017                 else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
1018                          !bio->bi_disk->queue->limits.max_write_same_sectors)
1019                         disable_write_same(md);
1020                 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
1021                          !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
1022                         disable_write_zeroes(md);
1023         }
1024
1025         if (endio) {
1026                 int r = endio(tio->ti, bio, &error);
1027                 switch (r) {
1028                 case DM_ENDIO_REQUEUE:
1029                         error = BLK_STS_DM_REQUEUE;
1030                         /*FALLTHRU*/
1031                 case DM_ENDIO_DONE:
1032                         break;
1033                 case DM_ENDIO_INCOMPLETE:
1034                         /* The target will handle the io */
1035                         return;
1036                 default:
1037                         DMWARN("unimplemented target endio return value: %d", r);
1038                         BUG();
1039                 }
1040         }
1041
1042         free_tio(tio);
1043         dec_pending(io, error);
1044 }
1045
1046 /*
1047  * Return maximum size of I/O possible at the supplied sector up to the current
1048  * target boundary.
1049  */
1050 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
1051 {
1052         sector_t target_offset = dm_target_offset(ti, sector);
1053
1054         return ti->len - target_offset;
1055 }
1056
1057 static sector_t max_io_len(sector_t sector, struct dm_target *ti)
1058 {
1059         sector_t len = max_io_len_target_boundary(sector, ti);
1060         sector_t offset, max_len;
1061
1062         /*
1063          * Does the target need to split even further?
1064          */
1065         if (ti->max_io_len) {
1066                 offset = dm_target_offset(ti, sector);
1067                 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
1068                         max_len = sector_div(offset, ti->max_io_len);
1069                 else
1070                         max_len = offset & (ti->max_io_len - 1);
1071                 max_len = ti->max_io_len - max_len;
1072
1073                 if (len > max_len)
1074                         len = max_len;
1075         }
1076
1077         return len;
1078 }
1079
1080 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1081 {
1082         if (len > UINT_MAX) {
1083                 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1084                       (unsigned long long)len, UINT_MAX);
1085                 ti->error = "Maximum size of target IO is too large";
1086                 return -EINVAL;
1087         }
1088
1089         ti->max_io_len = (uint32_t) len;
1090
1091         return 0;
1092 }
1093 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
1094
1095 static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
1096                                                 sector_t sector, int *srcu_idx)
1097         __acquires(md->io_barrier)
1098 {
1099         struct dm_table *map;
1100         struct dm_target *ti;
1101
1102         map = dm_get_live_table(md, srcu_idx);
1103         if (!map)
1104                 return NULL;
1105
1106         ti = dm_table_find_target(map, sector);
1107         if (!ti)
1108                 return NULL;
1109
1110         return ti;
1111 }
1112
1113 static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
1114                                  long nr_pages, void **kaddr, pfn_t *pfn)
1115 {
1116         struct mapped_device *md = dax_get_private(dax_dev);
1117         sector_t sector = pgoff * PAGE_SECTORS;
1118         struct dm_target *ti;
1119         long len, ret = -EIO;
1120         int srcu_idx;
1121
1122         ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1123
1124         if (!ti)
1125                 goto out;
1126         if (!ti->type->direct_access)
1127                 goto out;
1128         len = max_io_len(sector, ti) / PAGE_SECTORS;
1129         if (len < 1)
1130                 goto out;
1131         nr_pages = min(len, nr_pages);
1132         ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
1133
1134  out:
1135         dm_put_live_table(md, srcu_idx);
1136
1137         return ret;
1138 }
1139
1140 static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
1141                 int blocksize, sector_t start, sector_t len)
1142 {
1143         struct mapped_device *md = dax_get_private(dax_dev);
1144         struct dm_table *map;
1145         int srcu_idx;
1146         bool ret;
1147
1148         map = dm_get_live_table(md, &srcu_idx);
1149         if (!map)
1150                 return false;
1151
1152         ret = dm_table_supports_dax(map, device_supports_dax, &blocksize);
1153
1154         dm_put_live_table(md, srcu_idx);
1155
1156         return ret;
1157 }
1158
1159 static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1160                                     void *addr, size_t bytes, struct iov_iter *i)
1161 {
1162         struct mapped_device *md = dax_get_private(dax_dev);
1163         sector_t sector = pgoff * PAGE_SECTORS;
1164         struct dm_target *ti;
1165         long ret = 0;
1166         int srcu_idx;
1167
1168         ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1169
1170         if (!ti)
1171                 goto out;
1172         if (!ti->type->dax_copy_from_iter) {
1173                 ret = copy_from_iter(addr, bytes, i);
1174                 goto out;
1175         }
1176         ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
1177  out:
1178         dm_put_live_table(md, srcu_idx);
1179
1180         return ret;
1181 }
1182
1183 static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1184                 void *addr, size_t bytes, struct iov_iter *i)
1185 {
1186         struct mapped_device *md = dax_get_private(dax_dev);
1187         sector_t sector = pgoff * PAGE_SECTORS;
1188         struct dm_target *ti;
1189         long ret = 0;
1190         int srcu_idx;
1191
1192         ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1193
1194         if (!ti)
1195                 goto out;
1196         if (!ti->type->dax_copy_to_iter) {
1197                 ret = copy_to_iter(addr, bytes, i);
1198                 goto out;
1199         }
1200         ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
1201  out:
1202         dm_put_live_table(md, srcu_idx);
1203
1204         return ret;
1205 }
1206
1207 static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
1208                                   size_t nr_pages)
1209 {
1210         struct mapped_device *md = dax_get_private(dax_dev);
1211         sector_t sector = pgoff * PAGE_SECTORS;
1212         struct dm_target *ti;
1213         int ret = -EIO;
1214         int srcu_idx;
1215
1216         ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1217
1218         if (!ti)
1219                 goto out;
1220         if (WARN_ON(!ti->type->dax_zero_page_range)) {
1221                 /*
1222                  * ->zero_page_range() is mandatory dax operation. If we are
1223                  *  here, something is wrong.
1224                  */
1225                 dm_put_live_table(md, srcu_idx);
1226                 goto out;
1227         }
1228         ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
1229
1230  out:
1231         dm_put_live_table(md, srcu_idx);
1232
1233         return ret;
1234 }
1235
1236 /*
1237  * A target may call dm_accept_partial_bio only from the map routine.  It is
1238  * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_RESET,
1239  * REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH.
1240  *
1241  * dm_accept_partial_bio informs the dm that the target only wants to process
1242  * additional n_sectors sectors of the bio and the rest of the data should be
1243  * sent in a next bio.
1244  *
1245  * A diagram that explains the arithmetics:
1246  * +--------------------+---------------+-------+
1247  * |         1          |       2       |   3   |
1248  * +--------------------+---------------+-------+
1249  *
1250  * <-------------- *tio->len_ptr --------------->
1251  *                      <------- bi_size ------->
1252  *                      <-- n_sectors -->
1253  *
1254  * Region 1 was already iterated over with bio_advance or similar function.
1255  *      (it may be empty if the target doesn't use bio_advance)
1256  * Region 2 is the remaining bio size that the target wants to process.
1257  *      (it may be empty if region 1 is non-empty, although there is no reason
1258  *       to make it empty)
1259  * The target requires that region 3 is to be sent in the next bio.
1260  *
1261  * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1262  * the partially processed part (the sum of regions 1+2) must be the same for all
1263  * copies of the bio.
1264  */
1265 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1266 {
1267         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1268         unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1269         BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1270         BUG_ON(bi_size > *tio->len_ptr);
1271         BUG_ON(n_sectors > bi_size);
1272         *tio->len_ptr -= bi_size - n_sectors;
1273         bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1274 }
1275 EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
1276
1277 static blk_qc_t __map_bio(struct dm_target_io *tio)
1278 {
1279         int r;
1280         sector_t sector;
1281         struct bio *clone = &tio->clone;
1282         struct dm_io *io = tio->io;
1283         struct mapped_device *md = io->md;
1284         struct dm_target *ti = tio->ti;
1285         blk_qc_t ret = BLK_QC_T_NONE;
1286
1287         clone->bi_end_io = clone_endio;
1288
1289         /*
1290          * Map the clone.  If r == 0 we don't need to do
1291          * anything, the target has assumed ownership of
1292          * this io.
1293          */
1294         atomic_inc(&io->io_count);
1295         sector = clone->bi_iter.bi_sector;
1296
1297         r = ti->type->map(ti, clone);
1298         switch (r) {
1299         case DM_MAPIO_SUBMITTED:
1300                 break;
1301         case DM_MAPIO_REMAPPED:
1302                 /* the bio has been remapped so dispatch it */
1303                 trace_block_bio_remap(clone->bi_disk->queue, clone,
1304                                       bio_dev(io->orig_bio), sector);
1305                 if (md->type == DM_TYPE_NVME_BIO_BASED)
1306                         ret = direct_make_request(clone);
1307                 else
1308                         ret = generic_make_request(clone);
1309                 break;
1310         case DM_MAPIO_KILL:
1311                 free_tio(tio);
1312                 dec_pending(io, BLK_STS_IOERR);
1313                 break;
1314         case DM_MAPIO_REQUEUE:
1315                 free_tio(tio);
1316                 dec_pending(io, BLK_STS_DM_REQUEUE);
1317                 break;
1318         default:
1319                 DMWARN("unimplemented target map return value: %d", r);
1320                 BUG();
1321         }
1322
1323         return ret;
1324 }
1325
1326 static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1327 {
1328         bio->bi_iter.bi_sector = sector;
1329         bio->bi_iter.bi_size = to_bytes(len);
1330 }
1331
1332 /*
1333  * Creates a bio that consists of range of complete bvecs.
1334  */
1335 static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1336                      sector_t sector, unsigned len)
1337 {
1338         struct bio *clone = &tio->clone;
1339
1340         __bio_clone_fast(clone, bio);
1341
1342         bio_crypt_clone(clone, bio, GFP_NOIO);
1343
1344         if (bio_integrity(bio)) {
1345                 int r;
1346
1347                 if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1348                              !dm_target_passes_integrity(tio->ti->type))) {
1349                         DMWARN("%s: the target %s doesn't support integrity data.",
1350                                 dm_device_name(tio->io->md),
1351                                 tio->ti->type->name);
1352                         return -EIO;
1353                 }
1354
1355                 r = bio_integrity_clone(clone, bio, GFP_NOIO);
1356                 if (r < 0)
1357                         return r;
1358         }
1359
1360         bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1361         clone->bi_iter.bi_size = to_bytes(len);
1362
1363         if (bio_integrity(bio))
1364                 bio_integrity_trim(clone);
1365
1366         return 0;
1367 }
1368
1369 static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
1370                                 struct dm_target *ti, unsigned num_bios)
1371 {
1372         struct dm_target_io *tio;
1373         int try;
1374
1375         if (!num_bios)
1376                 return;
1377
1378         if (num_bios == 1) {
1379                 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1380                 bio_list_add(blist, &tio->clone);
1381                 return;
1382         }
1383
1384         for (try = 0; try < 2; try++) {
1385                 int bio_nr;
1386                 struct bio *bio;
1387
1388                 if (try)
1389                         mutex_lock(&ci->io->md->table_devices_lock);
1390                 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1391                         tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
1392                         if (!tio)
1393                                 break;
1394
1395                         bio_list_add(blist, &tio->clone);
1396                 }
1397                 if (try)
1398                         mutex_unlock(&ci->io->md->table_devices_lock);
1399                 if (bio_nr == num_bios)
1400                         return;
1401
1402                 while ((bio = bio_list_pop(blist))) {
1403                         tio = container_of(bio, struct dm_target_io, clone);
1404                         free_tio(tio);
1405                 }
1406         }
1407 }
1408
1409 static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
1410                                            struct dm_target_io *tio, unsigned *len)
1411 {
1412         struct bio *clone = &tio->clone;
1413
1414         tio->len_ptr = len;
1415
1416         __bio_clone_fast(clone, ci->bio);
1417         if (len)
1418                 bio_setup_sector(clone, ci->sector, *len);
1419
1420         return __map_bio(tio);
1421 }
1422
1423 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1424                                   unsigned num_bios, unsigned *len)
1425 {
1426         struct bio_list blist = BIO_EMPTY_LIST;
1427         struct bio *bio;
1428         struct dm_target_io *tio;
1429
1430         alloc_multiple_bios(&blist, ci, ti, num_bios);
1431
1432         while ((bio = bio_list_pop(&blist))) {
1433                 tio = container_of(bio, struct dm_target_io, clone);
1434                 (void) __clone_and_map_simple_bio(ci, tio, len);
1435         }
1436 }
1437
1438 static int __send_empty_flush(struct clone_info *ci)
1439 {
1440         unsigned target_nr = 0;
1441         struct dm_target *ti;
1442
1443         /*
1444          * Empty flush uses a statically initialized bio, as the base for
1445          * cloning.  However, blkg association requires that a bdev is
1446          * associated with a gendisk, which doesn't happen until the bdev is
1447          * opened.  So, blkg association is done at issue time of the flush
1448          * rather than when the device is created in alloc_dev().
1449          */
1450         bio_set_dev(ci->bio, ci->io->md->bdev);
1451
1452         BUG_ON(bio_has_data(ci->bio));
1453         while ((ti = dm_table_get_target(ci->map, target_nr++)))
1454                 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1455
1456         bio_disassociate_blkg(ci->bio);
1457
1458         return 0;
1459 }
1460
1461 static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1462                                     sector_t sector, unsigned *len)
1463 {
1464         struct bio *bio = ci->bio;
1465         struct dm_target_io *tio;
1466         int r;
1467
1468         tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1469         tio->len_ptr = len;
1470         r = clone_bio(tio, bio, sector, *len);
1471         if (r < 0) {
1472                 free_tio(tio);
1473                 return r;
1474         }
1475         (void) __map_bio(tio);
1476
1477         return 0;
1478 }
1479
1480 typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1481
1482 static unsigned get_num_discard_bios(struct dm_target *ti)
1483 {
1484         return ti->num_discard_bios;
1485 }
1486
1487 static unsigned get_num_secure_erase_bios(struct dm_target *ti)
1488 {
1489         return ti->num_secure_erase_bios;
1490 }
1491
1492 static unsigned get_num_write_same_bios(struct dm_target *ti)
1493 {
1494         return ti->num_write_same_bios;
1495 }
1496
1497 static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
1498 {
1499         return ti->num_write_zeroes_bios;
1500 }
1501
1502 static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1503                                        unsigned num_bios)
1504 {
1505         unsigned len;
1506
1507         /*
1508          * Even though the device advertised support for this type of
1509          * request, that does not mean every target supports it, and
1510          * reconfiguration might also have changed that since the
1511          * check was performed.
1512          */
1513         if (!num_bios)
1514                 return -EOPNOTSUPP;
1515
1516         len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1517
1518         __send_duplicate_bios(ci, ti, num_bios, &len);
1519
1520         ci->sector += len;
1521         ci->sector_count -= len;
1522
1523         return 0;
1524 }
1525
/* Send the current extent to @ti as discard clones. */
static int __send_discard(struct clone_info *ci, struct dm_target *ti)
{
        unsigned num_bios = get_num_discard_bios(ti);

        return __send_changing_extent_only(ci, ti, num_bios);
}
1530
/* Send the current extent to @ti as secure-erase clones. */
static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
{
        unsigned num_bios = get_num_secure_erase_bios(ti);

        return __send_changing_extent_only(ci, ti, num_bios);
}
1535
/* Send the current extent to @ti as write-same clones. */
static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
{
        unsigned num_bios = get_num_write_same_bios(ti);

        return __send_changing_extent_only(ci, ti, num_bios);
}
1540
/* Send the current extent to @ti as write-zeroes clones. */
static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
{
        unsigned num_bios = get_num_write_zeroes_bios(ti);

        return __send_changing_extent_only(ci, ti, num_bios);
}
1545
1546 static bool is_abnormal_io(struct bio *bio)
1547 {
1548         bool r = false;
1549
1550         switch (bio_op(bio)) {
1551         case REQ_OP_DISCARD:
1552         case REQ_OP_SECURE_ERASE:
1553         case REQ_OP_WRITE_SAME:
1554         case REQ_OP_WRITE_ZEROES:
1555                 r = true;
1556                 break;
1557         }
1558
1559         return r;
1560 }
1561
1562 static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1563                                   int *result)
1564 {
1565         struct bio *bio = ci->bio;
1566
1567         if (bio_op(bio) == REQ_OP_DISCARD)
1568                 *result = __send_discard(ci, ti);
1569         else if (bio_op(bio) == REQ_OP_SECURE_ERASE)
1570                 *result = __send_secure_erase(ci, ti);
1571         else if (bio_op(bio) == REQ_OP_WRITE_SAME)
1572                 *result = __send_write_same(ci, ti);
1573         else if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
1574                 *result = __send_write_zeroes(ci, ti);
1575         else
1576                 return false;
1577
1578         return true;
1579 }
1580
1581 /*
1582  * Select the correct strategy for processing a non-flush bio.
1583  */
1584 static int __split_and_process_non_flush(struct clone_info *ci)
1585 {
1586         struct dm_target *ti;
1587         unsigned len;
1588         int r;
1589
1590         ti = dm_table_find_target(ci->map, ci->sector);
1591         if (!ti)
1592                 return -EIO;
1593
1594         if (__process_abnormal_io(ci, ti, &r))
1595                 return r;
1596
1597         len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
1598
1599         r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1600         if (r < 0)
1601                 return r;
1602
1603         ci->sector += len;
1604         ci->sector_count -= len;
1605
1606         return 0;
1607 }
1608
1609 static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1610                             struct dm_table *map, struct bio *bio)
1611 {
1612         ci->map = map;
1613         ci->io = alloc_io(md, bio);
1614         ci->sector = bio->bi_iter.bi_sector;
1615 }
1616
/*
 * Subtract @subnd from statistics counter @field of @part, accessed through
 * part_stat_get().  Its only use in this file is bracketed by
 * part_stat_lock()/part_stat_unlock().
 */
#define __dm_part_stat_sub(part, field, subnd)  \
        (part_stat_get(part, field) -= (subnd))
1619
/*
 * Entry point to split a bio into clones and submit them to the targets.
 *
 * Three cases are handled:
 *  - REQ_PREFLUSH: an on-stack empty flush bio is cloned to every target;
 *  - zone management operations: processed once with a zero sector count;
 *  - everything else: processed piece by piece.  When running under
 *    generic_make_request() (current->bio_list set) and part of the bio is
 *    still unprocessed after the first piece, the bio is split and the
 *    remainder is resubmitted instead of being looped over here.
 *
 * Returns the cookie from resubmitting the remainder, or BLK_QC_T_NONE.
 */
static blk_qc_t __split_and_process_bio(struct mapped_device *md,
                                        struct dm_table *map, struct bio *bio)
{
        struct clone_info ci;
        blk_qc_t ret = BLK_QC_T_NONE;
        int error = 0;

        init_clone_info(&ci, md, map, bio);

        if (bio->bi_opf & REQ_PREFLUSH) {
                struct bio flush_bio;

                /*
                 * Use an on-stack bio for this, it's safe since we don't
                 * need to reference it after submit. It's just used as
                 * the basis for the clone(s).
                 */
                bio_init(&flush_bio, NULL, 0);
                flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
                ci.bio = &flush_bio;
                ci.sector_count = 0;
                error = __send_empty_flush(&ci);
                /* dec_pending submits any data associated with flush */
        } else if (op_is_zone_mgmt(bio_op(bio))) {
                /* zone management ops carry no data: single pass, no length */
                ci.bio = bio;
                ci.sector_count = 0;
                error = __split_and_process_non_flush(&ci);
        } else {
                ci.bio = bio;
                ci.sector_count = bio_sectors(bio);
                while (ci.sector_count && !error) {
                        error = __split_and_process_non_flush(&ci);
                        if (current->bio_list && ci.sector_count && !error) {
                                /*
                                 * Remainder must be passed to generic_make_request()
                                 * so that it gets handled *after* bios already submitted
                                 * have been completely processed.
                                 * We take a clone of the original to store in
                                 * ci.io->orig_bio to be used by end_io_acct() and
                                 * for dec_pending to use for completion handling.
                                 */
                                struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
                                                          GFP_NOIO, &md->queue->bio_split);
                                ci.io->orig_bio = b;

                                /*
                                 * Adjust IO stats for each split, otherwise upon queue
                                 * reentry there will be redundant IO accounting.
                                 * NOTE: this is a stop-gap fix, a proper fix involves
                                 * significant refactoring of DM core's bio splitting
                                 * (by eliminating DM's splitting and just using bio_split)
                                 */
                                part_stat_lock();
                                __dm_part_stat_sub(&dm_disk(md)->part0,
                                                   sectors[op_stat_group(bio_op(bio))], ci.sector_count);
                                part_stat_unlock();

                                /* b (the processed front) completes into bio (the remainder) */
                                bio_chain(b, bio);
                                trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
                                ret = generic_make_request(bio);
                                break;
                        }
                }
        }

        /* drop the extra reference count */
        dec_pending(ci.io, errno_to_blk_status(error));
        return ret;
}
1692
1693 /*
1694  * Optimized variant of __split_and_process_bio that leverages the
1695  * fact that targets that use it do _not_ have a need to split bios.
1696  */
1697 static blk_qc_t __process_bio(struct mapped_device *md, struct dm_table *map,
1698                               struct bio *bio, struct dm_target *ti)
1699 {
1700         struct clone_info ci;
1701         blk_qc_t ret = BLK_QC_T_NONE;
1702         int error = 0;
1703
1704         init_clone_info(&ci, md, map, bio);
1705
1706         if (bio->bi_opf & REQ_PREFLUSH) {
1707                 struct bio flush_bio;
1708
1709                 /*
1710                  * Use an on-stack bio for this, it's safe since we don't
1711                  * need to reference it after submit. It's just used as
1712                  * the basis for the clone(s).
1713                  */
1714                 bio_init(&flush_bio, NULL, 0);
1715                 flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1716                 ci.bio = &flush_bio;
1717                 ci.sector_count = 0;
1718                 error = __send_empty_flush(&ci);
1719                 /* dec_pending submits any data associated with flush */
1720         } else {
1721                 struct dm_target_io *tio;
1722
1723                 ci.bio = bio;
1724                 ci.sector_count = bio_sectors(bio);
1725                 if (__process_abnormal_io(&ci, ti, &error))
1726                         goto out;
1727
1728                 tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
1729                 ret = __clone_and_map_simple_bio(&ci, tio, NULL);
1730         }
1731 out:
1732         /* drop the extra reference count */
1733         dec_pending(ci.io, errno_to_blk_status(error));
1734         return ret;
1735 }
1736
1737 static void dm_queue_split(struct mapped_device *md, struct dm_target *ti, struct bio **bio)
1738 {
1739         unsigned len, sector_count;
1740
1741         sector_count = bio_sectors(*bio);
1742         len = min_t(sector_t, max_io_len((*bio)->bi_iter.bi_sector, ti), sector_count);
1743
1744         if (sector_count > len) {
1745                 struct bio *split = bio_split(*bio, len, GFP_NOIO, &md->queue->bio_split);
1746
1747                 bio_chain(split, *bio);
1748                 trace_block_split(md->queue, split, (*bio)->bi_iter.bi_sector);
1749                 generic_make_request(*bio);
1750                 *bio = split;
1751         }
1752 }
1753
1754 static blk_qc_t dm_process_bio(struct mapped_device *md,
1755                                struct dm_table *map, struct bio *bio)
1756 {
1757         blk_qc_t ret = BLK_QC_T_NONE;
1758         struct dm_target *ti = md->immutable_target;
1759
1760         if (unlikely(!map)) {
1761                 bio_io_error(bio);
1762                 return ret;
1763         }
1764
1765         if (!ti) {
1766                 ti = dm_table_find_target(map, bio->bi_iter.bi_sector);
1767                 if (unlikely(!ti)) {
1768                         bio_io_error(bio);
1769                         return ret;
1770                 }
1771         }
1772
1773         /*
1774          * If in ->make_request_fn we need to use blk_queue_split(), otherwise
1775          * queue_limits for abnormal requests (e.g. discard, writesame, etc)
1776          * won't be imposed.
1777          */
1778         if (current->bio_list) {
1779                 if (is_abnormal_io(bio))
1780                         blk_queue_split(md->queue, &bio);
1781                 else
1782                         dm_queue_split(md, ti, &bio);
1783         }
1784
1785         if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED)
1786                 return __process_bio(md, map, bio, ti);
1787         else
1788                 return __split_and_process_bio(md, map, bio);
1789 }
1790
1791 static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
1792 {
1793         struct mapped_device *md = q->queuedata;
1794         blk_qc_t ret = BLK_QC_T_NONE;
1795         int srcu_idx;
1796         struct dm_table *map;
1797
1798         if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) {
1799                 /*
1800                  * We are called with a live reference on q_usage_counter, but
1801                  * that one will be released as soon as we return.  Grab an
1802                  * extra one as blk_mq_make_request expects to be able to
1803                  * consume a reference (which lives until the request is freed
1804                  * in case a request is allocated).
1805                  */
1806                 percpu_ref_get(&q->q_usage_counter);
1807                 return blk_mq_make_request(q, bio);
1808         }
1809
1810         map = dm_get_live_table(md, &srcu_idx);
1811
1812         /* if we're suspended, we have to queue this io for later */
1813         if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1814                 dm_put_live_table(md, srcu_idx);
1815
1816                 if (!(bio->bi_opf & REQ_RAHEAD))
1817                         queue_io(md, bio);
1818                 else
1819                         bio_io_error(bio);
1820                 return ret;
1821         }
1822
1823         ret = dm_process_bio(md, map, bio);
1824
1825         dm_put_live_table(md, srcu_idx);
1826         return ret;
1827 }
1828
1829 static int dm_any_congested(void *congested_data, int bdi_bits)
1830 {
1831         int r = bdi_bits;
1832         struct mapped_device *md = congested_data;
1833         struct dm_table *map;
1834
1835         if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1836                 if (dm_request_based(md)) {
1837                         /*
1838                          * With request-based DM we only need to check the
1839                          * top-level queue for congestion.
1840                          */
1841                         struct backing_dev_info *bdi = md->queue->backing_dev_info;
1842                         r = bdi->wb.congested->state & bdi_bits;
1843                 } else {
1844                         map = dm_get_live_table_fast(md);
1845                         if (map)
1846                                 r = dm_table_any_congested(map, bdi_bits);
1847                         dm_put_live_table_fast(md);
1848                 }
1849         }
1850
1851         return r;
1852 }
1853
1854 /*-----------------------------------------------------------------
1855  * An IDR is used to keep track of allocated minor numbers.
1856  *---------------------------------------------------------------*/
1857 static void free_minor(int minor)
1858 {
1859         spin_lock(&_minor_lock);
1860         idr_remove(&_minor_idr, minor);
1861         spin_unlock(&_minor_lock);
1862 }
1863
1864 /*
1865  * See if the device with a specific minor # is free.
1866  */
1867 static int specific_minor(int minor)
1868 {
1869         int r;
1870
1871         if (minor >= (1 << MINORBITS))
1872                 return -EINVAL;
1873
1874         idr_preload(GFP_KERNEL);
1875         spin_lock(&_minor_lock);
1876
1877         r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1878
1879         spin_unlock(&_minor_lock);
1880         idr_preload_end();
1881         if (r < 0)
1882                 return r == -ENOSPC ? -EBUSY : r;
1883         return 0;
1884 }
1885
1886 static int next_free_minor(int *minor)
1887 {
1888         int r;
1889
1890         idr_preload(GFP_KERNEL);
1891         spin_lock(&_minor_lock);
1892
1893         r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1894
1895         spin_unlock(&_minor_lock);
1896         idr_preload_end();
1897         if (r < 0)
1898                 return r;
1899         *minor = r;
1900         return 0;
1901 }
1902
1903 static const struct block_device_operations dm_blk_dops;
1904 static const struct dax_operations dm_dax_ops;
1905
1906 static void dm_wq_work(struct work_struct *work);
1907
1908 static void cleanup_mapped_device(struct mapped_device *md)
1909 {
1910         if (md->wq)
1911                 destroy_workqueue(md->wq);
1912         bioset_exit(&md->bs);
1913         bioset_exit(&md->io_bs);
1914
1915         if (md->dax_dev) {
1916                 kill_dax(md->dax_dev);
1917                 put_dax(md->dax_dev);
1918                 md->dax_dev = NULL;
1919         }
1920
1921         if (md->disk) {
1922                 spin_lock(&_minor_lock);
1923                 md->disk->private_data = NULL;
1924                 spin_unlock(&_minor_lock);
1925                 del_gendisk(md->disk);
1926                 put_disk(md->disk);
1927         }
1928
1929         if (md->queue)
1930                 blk_cleanup_queue(md->queue);
1931
1932         cleanup_srcu_struct(&md->io_barrier);
1933
1934         if (md->bdev) {
1935                 bdput(md->bdev);
1936                 md->bdev = NULL;
1937         }
1938
1939         mutex_destroy(&md->suspend_lock);
1940         mutex_destroy(&md->type_lock);
1941         mutex_destroy(&md->table_devices_lock);
1942
1943         dm_mq_cleanup_mapped_device(md);
1944 }
1945
1946 /*
1947  * Allocate and initialise a blank device with a given minor.
1948  */
1949 static struct mapped_device *alloc_dev(int minor)
1950 {
1951         int r, numa_node_id = dm_get_numa_node();
1952         struct mapped_device *md;
1953         void *old_md;
1954
1955         md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1956         if (!md) {
1957                 DMWARN("unable to allocate device, out of memory.");
1958                 return NULL;
1959         }
1960
1961         if (!try_module_get(THIS_MODULE))
1962                 goto bad_module_get;
1963
1964         /* get a minor number for the dev */
1965         if (minor == DM_ANY_MINOR)
1966                 r = next_free_minor(&minor);
1967         else
1968                 r = specific_minor(minor);
1969         if (r < 0)
1970                 goto bad_minor;
1971
1972         r = init_srcu_struct(&md->io_barrier);
1973         if (r < 0)
1974                 goto bad_io_barrier;
1975
1976         md->numa_node_id = numa_node_id;
1977         md->init_tio_pdu = false;
1978         md->type = DM_TYPE_NONE;
1979         mutex_init(&md->suspend_lock);
1980         mutex_init(&md->type_lock);
1981         mutex_init(&md->table_devices_lock);
1982         spin_lock_init(&md->deferred_lock);
1983         atomic_set(&md->holders, 1);
1984         atomic_set(&md->open_count, 0);
1985         atomic_set(&md->event_nr, 0);
1986         atomic_set(&md->uevent_seq, 0);
1987         INIT_LIST_HEAD(&md->uevent_list);
1988         INIT_LIST_HEAD(&md->table_devices);
1989         spin_lock_init(&md->uevent_lock);
1990
1991         /*
1992          * default to bio-based required ->make_request_fn until DM
1993          * table is loaded and md->type established. If request-based
1994          * table is loaded: blk-mq will override accordingly.
1995          */
1996         md->queue = blk_alloc_queue(dm_make_request, numa_node_id);
1997         if (!md->queue)
1998                 goto bad;
1999         md->queue->queuedata = md;
2000
2001         md->disk = alloc_disk_node(1, md->numa_node_id);
2002         if (!md->disk)
2003                 goto bad;
2004
2005         init_waitqueue_head(&md->wait);
2006         INIT_WORK(&md->work, dm_wq_work);
2007         init_waitqueue_head(&md->eventq);
2008         init_completion(&md->kobj_holder.completion);
2009
2010         md->disk->major = _major;
2011         md->disk->first_minor = minor;
2012         md->disk->fops = &dm_blk_dops;
2013         md->disk->queue = md->queue;
2014         md->disk->private_data = md;
2015         sprintf(md->disk->disk_name, "dm-%d", minor);
2016
2017         if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
2018                 md->dax_dev = alloc_dax(md, md->disk->disk_name,
2019                                         &dm_dax_ops, 0);
2020                 if (IS_ERR(md->dax_dev))
2021                         goto bad;
2022         }
2023
2024         add_disk_no_queue_reg(md->disk);
2025         format_dev_t(md->name, MKDEV(_major, minor));
2026
2027         md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
2028         if (!md->wq)
2029                 goto bad;
2030
2031         md->bdev = bdget_disk(md->disk, 0);
2032         if (!md->bdev)
2033                 goto bad;
2034
2035         dm_stats_init(&md->stats);
2036
2037         /* Populate the mapping, nobody knows we exist yet */
2038         spin_lock(&_minor_lock);
2039         old_md = idr_replace(&_minor_idr, md, minor);
2040         spin_unlock(&_minor_lock);
2041
2042         BUG_ON(old_md != MINOR_ALLOCED);
2043
2044         return md;
2045
2046 bad:
2047         cleanup_mapped_device(md);
2048 bad_io_barrier:
2049         free_minor(minor);
2050 bad_minor:
2051         module_put(THIS_MODULE);
2052 bad_module_get:
2053         kvfree(md);
2054         return NULL;
2055 }
2056
2057 static void unlock_fs(struct mapped_device *md);
2058
2059 static void free_dev(struct mapped_device *md)
2060 {
2061         int minor = MINOR(disk_devt(md->disk));
2062
2063         unlock_fs(md);
2064
2065         cleanup_mapped_device(md);
2066
2067         free_table_devices(&md->table_devices);
2068         dm_stats_cleanup(&md->stats);
2069         free_minor(minor);
2070
2071         module_put(THIS_MODULE);
2072         kvfree(md);
2073 }
2074
2075 static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
2076 {
2077         struct dm_md_mempools *p = dm_table_get_md_mempools(t);
2078         int ret = 0;
2079
2080         if (dm_table_bio_based(t)) {
2081                 /*
2082                  * The md may already have mempools that need changing.
2083                  * If so, reload bioset because front_pad may have changed
2084                  * because a different table was loaded.
2085                  */
2086                 bioset_exit(&md->bs);
2087                 bioset_exit(&md->io_bs);
2088
2089         } else if (bioset_initialized(&md->bs)) {
2090                 /*
2091                  * There's no need to reload with request-based dm
2092                  * because the size of front_pad doesn't change.
2093                  * Note for future: If you are to reload bioset,
2094                  * prep-ed requests in the queue may refer
2095                  * to bio from the old bioset, so you must walk
2096                  * through the queue to unprep.
2097                  */
2098                 goto out;
2099         }
2100
2101         BUG_ON(!p ||
2102                bioset_initialized(&md->bs) ||
2103                bioset_initialized(&md->io_bs));
2104
2105         ret = bioset_init_from_src(&md->bs, &p->bs);
2106         if (ret)
2107                 goto out;
2108         ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
2109         if (ret)
2110                 bioset_exit(&md->bs);
2111 out:
2112         /* mempool bind completed, no longer need any mempools in the table */
2113         dm_table_free_md_mempools(t);
2114         return ret;
2115 }
2116
2117 /*
2118  * Bind a table to the device.
2119  */
2120 static void event_callback(void *context)
2121 {
2122         unsigned long flags;
2123         LIST_HEAD(uevents);
2124         struct mapped_device *md = (struct mapped_device *) context;
2125
2126         spin_lock_irqsave(&md->uevent_lock, flags);
2127         list_splice_init(&md->uevent_list, &uevents);
2128         spin_unlock_irqrestore(&md->uevent_lock, flags);
2129
2130         dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2131
2132         atomic_inc(&md->event_nr);
2133         wake_up(&md->eventq);
2134         dm_issue_global_event();
2135 }
2136
2137 /*
2138  * Protected by md->suspend_lock obtained by dm_swap_table().
2139  */
2140 static void __set_size(struct mapped_device *md, sector_t size)
2141 {
2142         lockdep_assert_held(&md->suspend_lock);
2143
2144         set_capacity(md->disk, size);
2145
2146         i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2147 }
2148
2149 /*
2150  * Returns old map, which caller must destroy.
2151  */
2152 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2153                                struct queue_limits *limits)
2154 {
2155         struct dm_table *old_map;
2156         struct request_queue *q = md->queue;
2157         bool request_based = dm_table_request_based(t);
2158         sector_t size;
2159         int ret;
2160
2161         lockdep_assert_held(&md->suspend_lock);
2162
2163         size = dm_table_get_size(t);
2164
2165         /*
2166          * Wipe any geometry if the size of the table changed.
2167          */
2168         if (size != dm_get_size(md))
2169                 memset(&md->geometry, 0, sizeof(md->geometry));
2170
2171         __set_size(md, size);
2172
2173         dm_table_event_callback(t, event_callback, md);
2174
2175         /*
2176          * The queue hasn't been stopped yet, if the old table type wasn't
2177          * for request-based during suspension.  So stop it to prevent
2178          * I/O mapping before resume.
2179          * This must be done before setting the queue restrictions,
2180          * because request-based dm may be run just after the setting.
2181          */
2182         if (request_based)
2183                 dm_stop_queue(q);
2184
2185         if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) {
2186                 /*
2187                  * Leverage the fact that request-based DM targets and
2188                  * NVMe bio based targets are immutable singletons
2189                  * - used to optimize both dm_request_fn and dm_mq_queue_rq;
2190                  *   and __process_bio.
2191                  */
2192                 md->immutable_target = dm_table_get_immutable_target(t);
2193         }
2194
2195         ret = __bind_mempools(md, t);
2196         if (ret) {
2197                 old_map = ERR_PTR(ret);
2198                 goto out;
2199         }
2200
2201         old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2202         rcu_assign_pointer(md->map, (void *)t);
2203         md->immutable_target_type = dm_table_get_immutable_target_type(t);
2204
2205         dm_table_set_restrictions(t, q, limits);
2206         if (old_map)
2207                 dm_sync_table(md);
2208
2209 out:
2210         return old_map;
2211 }
2212
2213 /*
2214  * Returns unbound table for the caller to free.
2215  */
2216 static struct dm_table *__unbind(struct mapped_device *md)
2217 {
2218         struct dm_table *map = rcu_dereference_protected(md->map, 1);
2219
2220         if (!map)
2221                 return NULL;
2222
2223         dm_table_event_callback(map, NULL, NULL);
2224         RCU_INIT_POINTER(md->map, NULL);
2225         dm_sync_table(md);
2226
2227         return map;
2228 }
2229
2230 /*
2231  * Constructor for a new device.
2232  */
2233 int dm_create(int minor, struct mapped_device **result)
2234 {
2235         int r;
2236         struct mapped_device *md;
2237
2238         md = alloc_dev(minor);
2239         if (!md)
2240                 return -ENXIO;
2241
2242         r = dm_sysfs_init(md);
2243         if (r) {
2244                 free_dev(md);
2245                 return r;
2246         }
2247
2248         *result = md;
2249         return 0;
2250 }
2251
2252 /*
2253  * Functions to manage md->type.
2254  * All are required to hold md->type_lock.
2255  */
2256 void dm_lock_md_type(struct mapped_device *md)
2257 {
2258         mutex_lock(&md->type_lock);
2259 }
2260
2261 void dm_unlock_md_type(struct mapped_device *md)
2262 {
2263         mutex_unlock(&md->type_lock);
2264 }
2265
2266 void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
2267 {
2268         BUG_ON(!mutex_is_locked(&md->type_lock));
2269         md->type = type;
2270 }
2271
2272 enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
2273 {
2274         return md->type;
2275 }
2276
2277 struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2278 {
2279         return md->immutable_target_type;
2280 }
2281
2282 /*
2283  * The queue_limits are only valid as long as you have a reference
2284  * count on 'md'.
2285  */
2286 struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2287 {
2288         BUG_ON(!atomic_read(&md->holders));
2289         return &md->queue->limits;
2290 }
2291 EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2292
2293 static void dm_init_congested_fn(struct mapped_device *md)
2294 {
2295         md->queue->backing_dev_info->congested_data = md;
2296         md->queue->backing_dev_info->congested_fn = dm_any_congested;
2297 }
2298
2299 /*
2300  * Setup the DM device's queue based on md's type
2301  */
2302 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2303 {
2304         int r;
2305         struct queue_limits limits;
2306         enum dm_queue_mode type = dm_get_md_type(md);
2307
2308         switch (type) {
2309         case DM_TYPE_REQUEST_BASED:
2310                 r = dm_mq_init_request_queue(md, t);
2311                 if (r) {
2312                         DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2313                         return r;
2314                 }
2315                 dm_init_congested_fn(md);
2316                 break;
2317         case DM_TYPE_BIO_BASED:
2318         case DM_TYPE_DAX_BIO_BASED:
2319         case DM_TYPE_NVME_BIO_BASED:
2320                 dm_init_congested_fn(md);
2321                 break;
2322         case DM_TYPE_NONE:
2323                 WARN_ON_ONCE(true);
2324                 break;
2325         }
2326
2327         r = dm_calculate_queue_limits(t, &limits);
2328         if (r) {
2329                 DMERR("Cannot calculate initial queue limits");
2330                 return r;
2331         }
2332         dm_table_set_restrictions(t, md->queue, &limits);
2333         blk_register_queue(md->disk);
2334
2335         return 0;
2336 }
2337
2338 struct mapped_device *dm_get_md(dev_t dev)
2339 {
2340         struct mapped_device *md;
2341         unsigned minor = MINOR(dev);
2342
2343         if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2344                 return NULL;
2345
2346         spin_lock(&_minor_lock);
2347
2348         md = idr_find(&_minor_idr, minor);
2349         if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2350             test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2351                 md = NULL;
2352                 goto out;
2353         }
2354         dm_get(md);
2355 out:
2356         spin_unlock(&_minor_lock);
2357
2358         return md;
2359 }
2360 EXPORT_SYMBOL_GPL(dm_get_md);
2361
2362 void *dm_get_mdptr(struct mapped_device *md)
2363 {
2364         return md->interface_ptr;
2365 }
2366
2367 void dm_set_mdptr(struct mapped_device *md, void *ptr)
2368 {
2369         md->interface_ptr = ptr;
2370 }
2371
2372 void dm_get(struct mapped_device *md)
2373 {
2374         atomic_inc(&md->holders);
2375         BUG_ON(test_bit(DMF_FREEING, &md->flags));
2376 }
2377
2378 int dm_hold(struct mapped_device *md)
2379 {
2380         spin_lock(&_minor_lock);
2381         if (test_bit(DMF_FREEING, &md->flags)) {
2382                 spin_unlock(&_minor_lock);
2383                 return -EBUSY;
2384         }
2385         dm_get(md);
2386         spin_unlock(&_minor_lock);
2387         return 0;
2388 }
2389 EXPORT_SYMBOL_GPL(dm_hold);
2390
2391 const char *dm_device_name(struct mapped_device *md)
2392 {
2393         return md->name;
2394 }
2395 EXPORT_SYMBOL_GPL(dm_device_name);
2396
2397 static void __dm_destroy(struct mapped_device *md, bool wait)
2398 {
2399         struct dm_table *map;
2400         int srcu_idx;
2401
2402         might_sleep();
2403
2404         spin_lock(&_minor_lock);
2405         idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2406         set_bit(DMF_FREEING, &md->flags);
2407         spin_unlock(&_minor_lock);
2408
2409         blk_set_queue_dying(md->queue);
2410
2411         /*
2412          * Take suspend_lock so that presuspend and postsuspend methods
2413          * do not race with internal suspend.
2414          */
2415         mutex_lock(&md->suspend_lock);
2416         map = dm_get_live_table(md, &srcu_idx);
2417         if (!dm_suspended_md(md)) {
2418                 dm_table_presuspend_targets(map);
2419                 set_bit(DMF_SUSPENDED, &md->flags);
2420                 dm_table_postsuspend_targets(map);
2421         }
2422         /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
2423         dm_put_live_table(md, srcu_idx);
2424         mutex_unlock(&md->suspend_lock);
2425
2426         /*
2427          * Rare, but there may be I/O requests still going to complete,
2428          * for example.  Wait for all references to disappear.
2429          * No one should increment the reference count of the mapped_device,
2430          * after the mapped_device state becomes DMF_FREEING.
2431          */
2432         if (wait)
2433                 while (atomic_read(&md->holders))
2434                         msleep(1);
2435         else if (atomic_read(&md->holders))
2436                 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2437                        dm_device_name(md), atomic_read(&md->holders));
2438
2439         dm_sysfs_exit(md);
2440         dm_table_destroy(__unbind(md));
2441         free_dev(md);
2442 }
2443
2444 void dm_destroy(struct mapped_device *md)
2445 {
2446         __dm_destroy(md, true);
2447 }
2448
2449 void dm_destroy_immediate(struct mapped_device *md)
2450 {
2451         __dm_destroy(md, false);
2452 }
2453
2454 void dm_put(struct mapped_device *md)
2455 {
2456         atomic_dec(&md->holders);
2457 }
2458 EXPORT_SYMBOL_GPL(dm_put);
2459
2460 static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2461 {
2462         int r = 0;
2463         DEFINE_WAIT(wait);
2464
2465         while (1) {
2466                 prepare_to_wait(&md->wait, &wait, task_state);
2467
2468                 if (!md_in_flight(md))
2469                         break;
2470
2471                 if (signal_pending_state(task_state, current)) {
2472                         r = -EINTR;
2473                         break;
2474                 }
2475
2476                 io_schedule();
2477         }
2478         finish_wait(&md->wait, &wait);
2479
2480         return r;
2481 }
2482
2483 /*
2484  * Process the deferred bios
2485  */
2486 static void dm_wq_work(struct work_struct *work)
2487 {
2488         struct mapped_device *md = container_of(work, struct mapped_device,
2489                                                 work);
2490         struct bio *c;
2491         int srcu_idx;
2492         struct dm_table *map;
2493
2494         map = dm_get_live_table(md, &srcu_idx);
2495
2496         while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2497                 spin_lock_irq(&md->deferred_lock);
2498                 c = bio_list_pop(&md->deferred);
2499                 spin_unlock_irq(&md->deferred_lock);
2500
2501                 if (!c)
2502                         break;
2503
2504                 if (dm_request_based(md))
2505                         (void) generic_make_request(c);
2506                 else
2507                         (void) dm_process_bio(md, map, c);
2508         }
2509
2510         dm_put_live_table(md, srcu_idx);
2511 }
2512
2513 static void dm_queue_flush(struct mapped_device *md)
2514 {
2515         clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2516         smp_mb__after_atomic();
2517         queue_work(md->wq, &md->work);
2518 }
2519
2520 /*
2521  * Swap in a new table, returning the old one for the caller to destroy.
2522  */
2523 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2524 {
2525         struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2526         struct queue_limits limits;
2527         int r;
2528
2529         mutex_lock(&md->suspend_lock);
2530
2531         /* device must be suspended */
2532         if (!dm_suspended_md(md))
2533                 goto out;
2534
2535         /*
2536          * If the new table has no data devices, retain the existing limits.
2537          * This helps multipath with queue_if_no_path if all paths disappear,
2538          * then new I/O is queued based on these limits, and then some paths
2539          * reappear.
2540          */
2541         if (dm_table_has_no_data_devices(table)) {
2542                 live_map = dm_get_live_table_fast(md);
2543                 if (live_map)
2544                         limits = md->queue->limits;
2545                 dm_put_live_table_fast(md);
2546         }
2547
2548         if (!live_map) {
2549                 r = dm_calculate_queue_limits(table, &limits);
2550                 if (r) {
2551                         map = ERR_PTR(r);
2552                         goto out;
2553                 }
2554         }
2555
2556         map = __bind(md, table, &limits);
2557         dm_issue_global_event();
2558
2559 out:
2560         mutex_unlock(&md->suspend_lock);
2561         return map;
2562 }
2563
2564 /*
2565  * Functions to lock and unlock any filesystem running on the
2566  * device.
2567  */
2568 static int lock_fs(struct mapped_device *md)
2569 {
2570         int r;
2571
2572         WARN_ON(md->frozen_sb);
2573
2574         md->frozen_sb = freeze_bdev(md->bdev);
2575         if (IS_ERR(md->frozen_sb)) {
2576                 r = PTR_ERR(md->frozen_sb);
2577                 md->frozen_sb = NULL;
2578                 return r;
2579         }
2580
2581         set_bit(DMF_FROZEN, &md->flags);
2582
2583         return 0;
2584 }
2585
2586 static void unlock_fs(struct mapped_device *md)
2587 {
2588         if (!test_bit(DMF_FROZEN, &md->flags))
2589                 return;
2590
2591         thaw_bdev(md->bdev, md->frozen_sb);
2592         md->frozen_sb = NULL;
2593         clear_bit(DMF_FROZEN, &md->flags);
2594 }
2595
2596 /*
2597  * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
2598  * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
2599  * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
2600  *
2601  * If __dm_suspend returns 0, the device is completely quiescent
2602  * now. There is no request-processing activity. All new requests
2603  * are being added to md->deferred list.
2604  */
2605 static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2606                         unsigned suspend_flags, long task_state,
2607                         int dmf_suspended_flag)
2608 {
2609         bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2610         bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2611         int r;
2612
2613         lockdep_assert_held(&md->suspend_lock);
2614
2615         /*
2616          * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2617          * This flag is cleared before dm_suspend returns.
2618          */
2619         if (noflush)
2620                 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2621         else
2622                 DMDEBUG("%s: suspending with flush", dm_device_name(md));
2623
2624         /*
2625          * This gets reverted if there's an error later and the targets
2626          * provide the .presuspend_undo hook.
2627          */
2628         dm_table_presuspend_targets(map);
2629
2630         /*
2631          * Flush I/O to the device.
2632          * Any I/O submitted after lock_fs() may not be flushed.
2633          * noflush takes precedence over do_lockfs.
2634          * (lock_fs() flushes I/Os and waits for them to complete.)
2635          */
2636         if (!noflush && do_lockfs) {
2637                 r = lock_fs(md);
2638                 if (r) {
2639                         dm_table_presuspend_undo_targets(map);
2640                         return r;
2641                 }
2642         }
2643
2644         /*
2645          * Here we must make sure that no processes are submitting requests
2646          * to target drivers i.e. no one may be executing
2647          * __split_and_process_bio. This is called from dm_request and
2648          * dm_wq_work.
2649          *
2650          * To get all processes out of __split_and_process_bio in dm_request,
2651          * we take the write lock. To prevent any process from reentering
2652          * __split_and_process_bio from dm_request and quiesce the thread
2653          * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
2654          * flush_workqueue(md->wq).
2655          */
2656         set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2657         if (map)
2658                 synchronize_srcu(&md->io_barrier);
2659
2660         /*
2661          * Stop md->queue before flushing md->wq in case request-based
2662          * dm defers requests to md->wq from md->queue.
2663          */
2664         if (dm_request_based(md))
2665                 dm_stop_queue(md->queue);
2666
2667         flush_workqueue(md->wq);
2668
2669         /*
2670          * At this point no more requests are entering target request routines.
2671          * We call dm_wait_for_completion to wait for all existing requests
2672          * to finish.
2673          */
2674         r = dm_wait_for_completion(md, task_state);
2675         if (!r)
2676                 set_bit(dmf_suspended_flag, &md->flags);
2677
2678         if (noflush)
2679                 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2680         if (map)
2681                 synchronize_srcu(&md->io_barrier);
2682
2683         /* were we interrupted ? */
2684         if (r < 0) {
2685                 dm_queue_flush(md);
2686
2687                 if (dm_request_based(md))
2688                         dm_start_queue(md->queue);
2689
2690                 unlock_fs(md);
2691                 dm_table_presuspend_undo_targets(map);
2692                 /* pushback list is already flushed, so skip flush */
2693         }
2694
2695         return r;
2696 }
2697
2698 /*
2699  * We need to be able to change a mapping table under a mounted
2700  * filesystem.  For example we might want to move some data in
2701  * the background.  Before the table can be swapped with
2702  * dm_bind_table, dm_suspend must be called to flush any in
2703  * flight bios and ensure that any further io gets deferred.
2704  */
2705 /*
2706  * Suspend mechanism in request-based dm.
2707  *
2708  * 1. Flush all I/Os by lock_fs() if needed.
2709  * 2. Stop dispatching any I/O by stopping the request_queue.
2710  * 3. Wait for all in-flight I/Os to be completed or requeued.
2711  *
2712  * To abort suspend, start the request_queue.
2713  */
2714 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2715 {
2716         struct dm_table *map = NULL;
2717         int r = 0;
2718
2719 retry:
2720         mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2721
2722         if (dm_suspended_md(md)) {
2723                 r = -EINVAL;
2724                 goto out_unlock;
2725         }
2726
2727         if (dm_suspended_internally_md(md)) {
2728                 /* already internally suspended, wait for internal resume */
2729                 mutex_unlock(&md->suspend_lock);
2730                 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2731                 if (r)
2732                         return r;
2733                 goto retry;
2734         }
2735
2736         map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2737
2738         r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2739         if (r)
2740                 goto out_unlock;
2741
2742         dm_table_postsuspend_targets(map);
2743
2744 out_unlock:
2745         mutex_unlock(&md->suspend_lock);
2746         return r;
2747 }
2748
2749 static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2750 {
2751         if (map) {
2752                 int r = dm_table_resume_targets(map);
2753                 if (r)
2754                         return r;
2755         }
2756
2757         dm_queue_flush(md);
2758
2759         /*
2760          * Flushing deferred I/Os must be done after targets are resumed
2761          * so that mapping of targets can work correctly.
2762          * Request-based dm is queueing the deferred I/Os in its request_queue.
2763          */
2764         if (dm_request_based(md))
2765                 dm_start_queue(md->queue);
2766
2767         unlock_fs(md);
2768
2769         return 0;
2770 }
2771
2772 int dm_resume(struct mapped_device *md)
2773 {
2774         int r;
2775         struct dm_table *map = NULL;
2776
2777 retry:
2778         r = -EINVAL;
2779         mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2780
2781         if (!dm_suspended_md(md))
2782                 goto out;
2783
2784         if (dm_suspended_internally_md(md)) {
2785                 /* already internally suspended, wait for internal resume */
2786                 mutex_unlock(&md->suspend_lock);
2787                 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2788                 if (r)
2789                         return r;
2790                 goto retry;
2791         }
2792
2793         map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2794         if (!map || !dm_table_get_size(map))
2795                 goto out;
2796
2797         r = __dm_resume(md, map);
2798         if (r)
2799                 goto out;
2800
2801         clear_bit(DMF_SUSPENDED, &md->flags);
2802 out:
2803         mutex_unlock(&md->suspend_lock);
2804
2805         return r;
2806 }
2807
2808 /*
2809  * Internal suspend/resume works like userspace-driven suspend. It waits
2810  * until all bios finish and prevents issuing new bios to the target drivers.
2811  * It may be used only from the kernel.
2812  */
2813
2814 static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2815 {
2816         struct dm_table *map = NULL;
2817
2818         lockdep_assert_held(&md->suspend_lock);
2819
2820         if (md->internal_suspend_count++)
2821                 return; /* nested internal suspend */
2822
2823         if (dm_suspended_md(md)) {
2824                 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2825                 return; /* nest suspend */
2826         }
2827
2828         map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2829
2830         /*
2831          * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
2832          * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
2833          * would require changing .presuspend to return an error -- avoid this
2834          * until there is a need for more elaborate variants of internal suspend.
2835          */
2836         (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2837                             DMF_SUSPENDED_INTERNALLY);
2838
2839         dm_table_postsuspend_targets(map);
2840 }
2841
2842 static void __dm_internal_resume(struct mapped_device *md)
2843 {
2844         BUG_ON(!md->internal_suspend_count);
2845
2846         if (--md->internal_suspend_count)
2847                 return; /* resume from nested internal suspend */
2848
2849         if (dm_suspended_md(md))
2850                 goto done; /* resume from nested suspend */
2851
2852         /*
2853          * NOTE: existing callers don't need to call dm_table_resume_targets
2854          * (which may fail -- so best to avoid it for now by passing NULL map)
2855          */
2856         (void) __dm_resume(md, NULL);
2857
2858 done:
2859         clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2860         smp_mb__after_atomic();
2861         wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2862 }
2863
2864 void dm_internal_suspend_noflush(struct mapped_device *md)
2865 {
2866         mutex_lock(&md->suspend_lock);
2867         __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2868         mutex_unlock(&md->suspend_lock);
2869 }
2870 EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2871
2872 void dm_internal_resume(struct mapped_device *md)
2873 {
2874         mutex_lock(&md->suspend_lock);
2875         __dm_internal_resume(md);
2876         mutex_unlock(&md->suspend_lock);
2877 }
2878 EXPORT_SYMBOL_GPL(dm_internal_resume);
2879
2880 /*
2881  * Fast variants of internal suspend/resume hold md->suspend_lock,
2882  * which prevents interaction with userspace-driven suspend.
2883  */
2884
2885 void dm_internal_suspend_fast(struct mapped_device *md)
2886 {
2887         mutex_lock(&md->suspend_lock);
2888         if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2889                 return;
2890
2891         set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2892         synchronize_srcu(&md->io_barrier);
2893         flush_workqueue(md->wq);
2894         dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2895 }
2896 EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
2897
2898 void dm_internal_resume_fast(struct mapped_device *md)
2899 {
2900         if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2901                 goto done;
2902
2903         dm_queue_flush(md);
2904
2905 done:
2906         mutex_unlock(&md->suspend_lock);
2907 }
2908 EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
2909
2910 /*-----------------------------------------------------------------
2911  * Event notification.
2912  *---------------------------------------------------------------*/
2913 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2914                        unsigned cookie)
2915 {
2916         char udev_cookie[DM_COOKIE_LENGTH];
2917         char *envp[] = { udev_cookie, NULL };
2918
2919         if (!cookie)
2920                 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2921         else {
2922                 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2923                          DM_COOKIE_ENV_VAR_NAME, cookie);
2924                 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2925                                           action, envp);
2926         }
2927 }
2928
2929 uint32_t dm_next_uevent_seq(struct mapped_device *md)
2930 {
2931         return atomic_add_return(1, &md->uevent_seq);
2932 }
2933
2934 uint32_t dm_get_event_nr(struct mapped_device *md)
2935 {
2936         return atomic_read(&md->event_nr);
2937 }
2938
2939 int dm_wait_event(struct mapped_device *md, int event_nr)
2940 {
2941         return wait_event_interruptible(md->eventq,
2942                         (event_nr != atomic_read(&md->event_nr)));
2943 }
2944
2945 void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2946 {
2947         unsigned long flags;
2948
2949         spin_lock_irqsave(&md->uevent_lock, flags);
2950         list_add(elist, &md->uevent_list);
2951         spin_unlock_irqrestore(&md->uevent_lock, flags);
2952 }
2953
2954 /*
2955  * The gendisk is only valid as long as you have a reference
2956  * count on 'md'.
2957  */
2958 struct gendisk *dm_disk(struct mapped_device *md)
2959 {
2960         return md->disk;
2961 }
2962 EXPORT_SYMBOL_GPL(dm_disk);
2963
2964 struct kobject *dm_kobject(struct mapped_device *md)
2965 {
2966         return &md->kobj_holder.kobj;
2967 }
2968
2969 struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2970 {
2971         struct mapped_device *md;
2972
2973         md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2974
2975         spin_lock(&_minor_lock);
2976         if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2977                 md = NULL;
2978                 goto out;
2979         }
2980         dm_get(md);
2981 out:
2982         spin_unlock(&_minor_lock);
2983
2984         return md;
2985 }
2986
2987 int dm_suspended_md(struct mapped_device *md)
2988 {
2989         return test_bit(DMF_SUSPENDED, &md->flags);
2990 }
2991
2992 int dm_suspended_internally_md(struct mapped_device *md)
2993 {
2994         return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2995 }
2996
2997 int dm_test_deferred_remove_flag(struct mapped_device *md)
2998 {
2999         return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
3000 }
3001
3002 int dm_suspended(struct dm_target *ti)
3003 {
3004         return dm_suspended_md(dm_table_get_md(ti->table));
3005 }
3006 EXPORT_SYMBOL_GPL(dm_suspended);
3007
3008 int dm_noflush_suspending(struct dm_target *ti)
3009 {
3010         return __noflush_suspending(dm_table_get_md(ti->table));
3011 }
3012 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
3013
3014 struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
3015                                             unsigned integrity, unsigned per_io_data_size,
3016                                             unsigned min_pool_size)
3017 {
3018         struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
3019         unsigned int pool_size = 0;
3020         unsigned int front_pad, io_front_pad;
3021         int ret;
3022
3023         if (!pools)
3024                 return NULL;
3025
3026         switch (type) {
3027         case DM_TYPE_BIO_BASED:
3028         case DM_TYPE_DAX_BIO_BASED:
3029         case DM_TYPE_NVME_BIO_BASED:
3030                 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
3031                 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
3032                 io_front_pad = roundup(front_pad,  __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
3033                 ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
3034                 if (ret)
3035                         goto out;
3036                 if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
3037                         goto out;
3038                 break;
3039         case DM_TYPE_REQUEST_BASED:
3040                 pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
3041                 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
3042                 /* per_io_data_size is used for blk-mq pdu at queue allocation */
3043                 break;
3044         default:
3045                 BUG();
3046         }
3047
3048         ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
3049         if (ret)
3050                 goto out;
3051
3052         if (integrity && bioset_integrity_create(&pools->bs, pool_size))
3053                 goto out;
3054
3055         return pools;
3056
3057 out:
3058         dm_free_md_mempools(pools);
3059
3060         return NULL;
3061 }
3062
3063 void dm_free_md_mempools(struct dm_md_mempools *pools)
3064 {
3065         if (!pools)
3066                 return;
3067
3068         bioset_exit(&pools->bs);
3069         bioset_exit(&pools->io_bs);
3070
3071         kfree(pools);
3072 }
3073
3074 struct dm_pr {
3075         u64     old_key;
3076         u64     new_key;
3077         u32     flags;
3078         bool    fail_early;
3079 };
3080
3081 static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
3082                       void *data)
3083 {
3084         struct mapped_device *md = bdev->bd_disk->private_data;
3085         struct dm_table *table;
3086         struct dm_target *ti;
3087         int ret = -ENOTTY, srcu_idx;
3088
3089         table = dm_get_live_table(md, &srcu_idx);
3090         if (!table || !dm_table_get_size(table))
3091                 goto out;
3092
3093         /* We only support devices that have a single target */
3094         if (dm_table_get_num_targets(table) != 1)
3095                 goto out;
3096         ti = dm_table_get_target(table, 0);
3097
3098         ret = -EINVAL;
3099         if (!ti->type->iterate_devices)
3100                 goto out;
3101
3102         ret = ti->type->iterate_devices(ti, fn, data);
3103 out:
3104         dm_put_live_table(md, srcu_idx);
3105         return ret;
3106 }
3107
3108 /*
3109  * For register / unregister we need to manually call out to every path.
3110  */
3111 static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
3112                             sector_t start, sector_t len, void *data)
3113 {
3114         struct dm_pr *pr = data;
3115         const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3116
3117         if (!ops || !ops->pr_register)
3118                 return -EOPNOTSUPP;
3119         return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
3120 }
3121
3122 static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
3123                           u32 flags)
3124 {
3125         struct dm_pr pr = {
3126                 .old_key        = old_key,
3127                 .new_key        = new_key,
3128                 .flags          = flags,
3129                 .fail_early     = true,
3130         };
3131         int ret;
3132
3133         ret = dm_call_pr(bdev, __dm_pr_register, &pr);
3134         if (ret && new_key) {
3135                 /* unregister all paths if we failed to register any path */
3136                 pr.old_key = new_key;
3137                 pr.new_key = 0;
3138                 pr.flags = 0;
3139                 pr.fail_early = false;
3140                 dm_call_pr(bdev, __dm_pr_register, &pr);
3141         }
3142
3143         return ret;
3144 }
3145
3146 static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
3147                          u32 flags)
3148 {
3149         struct mapped_device *md = bdev->bd_disk->private_data;
3150         const struct pr_ops *ops;
3151         int r, srcu_idx;
3152
3153         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3154         if (r < 0)
3155                 goto out;
3156
3157         ops = bdev->bd_disk->fops->pr_ops;
3158         if (ops && ops->pr_reserve)
3159                 r = ops->pr_reserve(bdev, key, type, flags);
3160         else
3161                 r = -EOPNOTSUPP;
3162 out:
3163         dm_unprepare_ioctl(md, srcu_idx);
3164         return r;
3165 }
3166
3167 static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
3168 {
3169         struct mapped_device *md = bdev->bd_disk->private_data;
3170         const struct pr_ops *ops;
3171         int r, srcu_idx;
3172
3173         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3174         if (r < 0)
3175                 goto out;
3176
3177         ops = bdev->bd_disk->fops->pr_ops;
3178         if (ops && ops->pr_release)
3179                 r = ops->pr_release(bdev, key, type);
3180         else
3181                 r = -EOPNOTSUPP;
3182 out:
3183         dm_unprepare_ioctl(md, srcu_idx);
3184         return r;
3185 }
3186
3187 static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
3188                          enum pr_type type, bool abort)
3189 {
3190         struct mapped_device *md = bdev->bd_disk->private_data;
3191         const struct pr_ops *ops;
3192         int r, srcu_idx;
3193
3194         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3195         if (r < 0)
3196                 goto out;
3197
3198         ops = bdev->bd_disk->fops->pr_ops;
3199         if (ops && ops->pr_preempt)
3200                 r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
3201         else
3202                 r = -EOPNOTSUPP;
3203 out:
3204         dm_unprepare_ioctl(md, srcu_idx);
3205         return r;
3206 }
3207
3208 static int dm_pr_clear(struct block_device *bdev, u64 key)
3209 {
3210         struct mapped_device *md = bdev->bd_disk->private_data;
3211         const struct pr_ops *ops;
3212         int r, srcu_idx;
3213
3214         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3215         if (r < 0)
3216                 goto out;
3217
3218         ops = bdev->bd_disk->fops->pr_ops;
3219         if (ops && ops->pr_clear)
3220                 r = ops->pr_clear(bdev, key);
3221         else
3222                 r = -EOPNOTSUPP;
3223 out:
3224         dm_unprepare_ioctl(md, srcu_idx);
3225         return r;
3226 }
3227
3228 static const struct pr_ops dm_pr_ops = {
3229         .pr_register    = dm_pr_register,
3230         .pr_reserve     = dm_pr_reserve,
3231         .pr_release     = dm_pr_release,
3232         .pr_preempt     = dm_pr_preempt,
3233         .pr_clear       = dm_pr_clear,
3234 };
3235
3236 static const struct block_device_operations dm_blk_dops = {
3237         .open = dm_blk_open,
3238         .release = dm_blk_close,
3239         .ioctl = dm_blk_ioctl,
3240         .getgeo = dm_blk_getgeo,
3241         .report_zones = dm_blk_report_zones,
3242         .pr_ops = &dm_pr_ops,
3243         .owner = THIS_MODULE
3244 };
3245
3246 static const struct dax_operations dm_dax_ops = {
3247         .direct_access = dm_dax_direct_access,
3248         .dax_supported = dm_dax_supported,
3249         .copy_from_iter = dm_dax_copy_from_iter,
3250         .copy_to_iter = dm_dax_copy_to_iter,
3251         .zero_page_range = dm_dax_zero_page_range,
3252 };
3253
3254 /*
3255  * module hooks
3256  */
3257 module_init(dm_init);
3258 module_exit(dm_exit);
3259
3260 module_param(major, uint, 0);
3261 MODULE_PARM_DESC(major, "The major number of the device mapper");
3262
3263 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3264 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3265
3266 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
3267 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
3268
3269 MODULE_DESCRIPTION(DM_NAME " driver");
3270 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3271 MODULE_LICENSE("GPL");