nvme-pci: Add quirk for Teamgroup MP33 SSD
[linux-2.6-microblaze.git] / drivers / nvme / host / pci.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVM Express device driver
4  * Copyright (c) 2011-2014, Intel Corporation.
5  */
6
7 #include <linux/acpi.h>
8 #include <linux/async.h>
9 #include <linux/blkdev.h>
10 #include <linux/blk-mq.h>
11 #include <linux/blk-mq-pci.h>
12 #include <linux/blk-integrity.h>
13 #include <linux/dmi.h>
14 #include <linux/init.h>
15 #include <linux/interrupt.h>
16 #include <linux/io.h>
17 #include <linux/kstrtox.h>
18 #include <linux/memremap.h>
19 #include <linux/mm.h>
20 #include <linux/module.h>
21 #include <linux/mutex.h>
22 #include <linux/once.h>
23 #include <linux/pci.h>
24 #include <linux/suspend.h>
25 #include <linux/t10-pi.h>
26 #include <linux/types.h>
27 #include <linux/io-64-nonatomic-lo-hi.h>
28 #include <linux/io-64-nonatomic-hi-lo.h>
29 #include <linux/sed-opal.h>
30 #include <linux/pci-p2pdma.h>
31
32 #include "trace.h"
33 #include "nvme.h"
34
/* Bytes in a submission queue: q_depth slots of (1 << sqes) bytes each. */
#define SQ_SIZE(q)	((q)->q_depth << (q)->sqes)
/* Bytes in a completion queue: one struct nvme_completion per slot. */
#define CQ_SIZE(q)	(sizeof(struct nvme_completion) * (q)->q_depth)

/* Number of SGL descriptors that fit in one controller page. */
#define SGES_PER_PAGE	(NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc))

/*
 * These can be higher, but we need to ensure that any command doesn't
 * require an sg allocation that needs more than a page of data.
 *
 * NVME_MAX_NR_ALLOCATIONS sizes the per-request array that remembers
 * descriptor-list allocations (struct nvme_iod::list).
 */
#define NVME_MAX_KB_SZ	8192
#define NVME_MAX_SEGS	128
#define NVME_MAX_NR_ALLOCATIONS	5
47
48 static int use_threaded_interrupts;
49 module_param(use_threaded_interrupts, int, 0444);
50
51 static bool use_cmb_sqes = true;
52 module_param(use_cmb_sqes, bool, 0444);
53 MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
54
55 static unsigned int max_host_mem_size_mb = 128;
56 module_param(max_host_mem_size_mb, uint, 0444);
57 MODULE_PARM_DESC(max_host_mem_size_mb,
58         "Maximum Host Memory Buffer (HMB) size per controller (in MiB)");
59
60 static unsigned int sgl_threshold = SZ_32K;
61 module_param(sgl_threshold, uint, 0644);
62 MODULE_PARM_DESC(sgl_threshold,
63                 "Use SGLs when average request segment size is larger or equal to "
64                 "this size. Use 0 to disable SGLs.");
65
66 #define NVME_PCI_MIN_QUEUE_SIZE 2
67 #define NVME_PCI_MAX_QUEUE_SIZE 4095
68 static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
69 static const struct kernel_param_ops io_queue_depth_ops = {
70         .set = io_queue_depth_set,
71         .get = param_get_uint,
72 };
73
74 static unsigned int io_queue_depth = 1024;
75 module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
76 MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2 and < 4096");
77
78 static int io_queue_count_set(const char *val, const struct kernel_param *kp)
79 {
80         unsigned int n;
81         int ret;
82
83         ret = kstrtouint(val, 10, &n);
84         if (ret != 0 || n > num_possible_cpus())
85                 return -EINVAL;
86         return param_set_uint(val, kp);
87 }
88
89 static const struct kernel_param_ops io_queue_count_ops = {
90         .set = io_queue_count_set,
91         .get = param_get_uint,
92 };
93
94 static unsigned int write_queues;
95 module_param_cb(write_queues, &io_queue_count_ops, &write_queues, 0644);
96 MODULE_PARM_DESC(write_queues,
97         "Number of queues to use for writes. If not set, reads and writes "
98         "will share a queue set.");
99
100 static unsigned int poll_queues;
101 module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644);
102 MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
103
104 static bool noacpi;
105 module_param(noacpi, bool, 0444);
106 MODULE_PARM_DESC(noacpi, "disable acpi bios quirks");
107
108 struct nvme_dev;
109 struct nvme_queue;
110
111 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
112 static void nvme_delete_io_queues(struct nvme_dev *dev);
113 static void nvme_update_attrs(struct nvme_dev *dev);
114
115 /*
116  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
117  */
118 struct nvme_dev {
119         struct nvme_queue *queues;
120         struct blk_mq_tag_set tagset;
121         struct blk_mq_tag_set admin_tagset;
122         u32 __iomem *dbs;
123         struct device *dev;
124         struct dma_pool *prp_page_pool;
125         struct dma_pool *prp_small_pool;
126         unsigned online_queues;
127         unsigned max_qid;
128         unsigned io_queues[HCTX_MAX_TYPES];
129         unsigned int num_vecs;
130         u32 q_depth;
131         int io_sqes;
132         u32 db_stride;
133         void __iomem *bar;
134         unsigned long bar_mapped_size;
135         struct mutex shutdown_lock;
136         bool subsystem;
137         u64 cmb_size;
138         bool cmb_use_sqes;
139         u32 cmbsz;
140         u32 cmbloc;
141         struct nvme_ctrl ctrl;
142         u32 last_ps;
143         bool hmb;
144
145         mempool_t *iod_mempool;
146
147         /* shadow doorbell buffer support: */
148         __le32 *dbbuf_dbs;
149         dma_addr_t dbbuf_dbs_dma_addr;
150         __le32 *dbbuf_eis;
151         dma_addr_t dbbuf_eis_dma_addr;
152
153         /* host memory buffer support: */
154         u64 host_mem_size;
155         u32 nr_host_mem_descs;
156         dma_addr_t host_mem_descs_dma;
157         struct nvme_host_mem_buf_desc *host_mem_descs;
158         void **host_mem_desc_bufs;
159         unsigned int nr_allocated_queues;
160         unsigned int nr_write_queues;
161         unsigned int nr_poll_queues;
162 };
163
164 static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
165 {
166         return param_set_uint_minmax(val, kp, NVME_PCI_MIN_QUEUE_SIZE,
167                         NVME_PCI_MAX_QUEUE_SIZE);
168 }
169
170 static inline unsigned int sq_idx(unsigned int qid, u32 stride)
171 {
172         return qid * 2 * stride;
173 }
174
175 static inline unsigned int cq_idx(unsigned int qid, u32 stride)
176 {
177         return (qid * 2 + 1) * stride;
178 }
179
180 static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
181 {
182         return container_of(ctrl, struct nvme_dev, ctrl);
183 }
184
185 /*
186  * An NVM Express queue.  Each device has at least two (one for admin
187  * commands and one for I/O commands).
188  */
189 struct nvme_queue {
190         struct nvme_dev *dev;
191         spinlock_t sq_lock;
192         void *sq_cmds;
193          /* only used for poll queues: */
194         spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
195         struct nvme_completion *cqes;
196         dma_addr_t sq_dma_addr;
197         dma_addr_t cq_dma_addr;
198         u32 __iomem *q_db;
199         u32 q_depth;
200         u16 cq_vector;
201         u16 sq_tail;
202         u16 last_sq_tail;
203         u16 cq_head;
204         u16 qid;
205         u8 cq_phase;
206         u8 sqes;
207         unsigned long flags;
208 #define NVMEQ_ENABLED           0
209 #define NVMEQ_SQ_CMB            1
210 #define NVMEQ_DELETE_ERROR      2
211 #define NVMEQ_POLLED            3
212         __le32 *dbbuf_sq_db;
213         __le32 *dbbuf_cq_db;
214         __le32 *dbbuf_sq_ei;
215         __le32 *dbbuf_cq_ei;
216         struct completion delete_done;
217 };
218
219 union nvme_descriptor {
220         struct nvme_sgl_desc    *sg_list;
221         __le64                  *prp_list;
222 };
223
224 /*
225  * The nvme_iod describes the data in an I/O.
226  *
227  * The sg pointer contains the list of PRP/SGL chunk allocations in addition
228  * to the actual struct scatterlist.
229  */
230 struct nvme_iod {
231         struct nvme_request req;
232         struct nvme_command cmd;
233         bool aborted;
234         s8 nr_allocations;      /* PRP list pool allocations. 0 means small
235                                    pool in use */
236         unsigned int dma_len;   /* length of single DMA segment mapping */
237         dma_addr_t first_dma;
238         dma_addr_t meta_dma;
239         struct sg_table sgt;
240         union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
241 };
242
243 static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
244 {
245         return dev->nr_allocated_queues * 8 * dev->db_stride;
246 }
247
248 static void nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
249 {
250         unsigned int mem_size = nvme_dbbuf_size(dev);
251
252         if (!(dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP))
253                 return;
254
255         if (dev->dbbuf_dbs) {
256                 /*
257                  * Clear the dbbuf memory so the driver doesn't observe stale
258                  * values from the previous instantiation.
259                  */
260                 memset(dev->dbbuf_dbs, 0, mem_size);
261                 memset(dev->dbbuf_eis, 0, mem_size);
262                 return;
263         }
264
265         dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
266                                             &dev->dbbuf_dbs_dma_addr,
267                                             GFP_KERNEL);
268         if (!dev->dbbuf_dbs)
269                 goto fail;
270         dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
271                                             &dev->dbbuf_eis_dma_addr,
272                                             GFP_KERNEL);
273         if (!dev->dbbuf_eis)
274                 goto fail_free_dbbuf_dbs;
275         return;
276
277 fail_free_dbbuf_dbs:
278         dma_free_coherent(dev->dev, mem_size, dev->dbbuf_dbs,
279                           dev->dbbuf_dbs_dma_addr);
280         dev->dbbuf_dbs = NULL;
281 fail:
282         dev_warn(dev->dev, "unable to allocate dma for dbbuf\n");
283 }
284
285 static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
286 {
287         unsigned int mem_size = nvme_dbbuf_size(dev);
288
289         if (dev->dbbuf_dbs) {
290                 dma_free_coherent(dev->dev, mem_size,
291                                   dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
292                 dev->dbbuf_dbs = NULL;
293         }
294         if (dev->dbbuf_eis) {
295                 dma_free_coherent(dev->dev, mem_size,
296                                   dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
297                 dev->dbbuf_eis = NULL;
298         }
299 }
300
301 static void nvme_dbbuf_init(struct nvme_dev *dev,
302                             struct nvme_queue *nvmeq, int qid)
303 {
304         if (!dev->dbbuf_dbs || !qid)
305                 return;
306
307         nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
308         nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
309         nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
310         nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
311 }
312
313 static void nvme_dbbuf_free(struct nvme_queue *nvmeq)
314 {
315         if (!nvmeq->qid)
316                 return;
317
318         nvmeq->dbbuf_sq_db = NULL;
319         nvmeq->dbbuf_cq_db = NULL;
320         nvmeq->dbbuf_sq_ei = NULL;
321         nvmeq->dbbuf_cq_ei = NULL;
322 }
323
324 static void nvme_dbbuf_set(struct nvme_dev *dev)
325 {
326         struct nvme_command c = { };
327         unsigned int i;
328
329         if (!dev->dbbuf_dbs)
330                 return;
331
332         c.dbbuf.opcode = nvme_admin_dbbuf;
333         c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
334         c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);
335
336         if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
337                 dev_warn(dev->ctrl.device, "unable to set dbbuf\n");
338                 /* Free memory and continue on */
339                 nvme_dbbuf_dma_free(dev);
340
341                 for (i = 1; i <= dev->online_queues; i++)
342                         nvme_dbbuf_free(&dev->queues[i]);
343         }
344 }
345
346 static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
347 {
348         return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
349 }
350
351 /* Update dbbuf and return true if an MMIO is required */
352 static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db,
353                                               volatile __le32 *dbbuf_ei)
354 {
355         if (dbbuf_db) {
356                 u16 old_value, event_idx;
357
358                 /*
359                  * Ensure that the queue is written before updating
360                  * the doorbell in memory
361                  */
362                 wmb();
363
364                 old_value = le32_to_cpu(*dbbuf_db);
365                 *dbbuf_db = cpu_to_le32(value);
366
367                 /*
368                  * Ensure that the doorbell is updated before reading the event
369                  * index from memory.  The controller needs to provide similar
370                  * ordering to ensure the envent index is updated before reading
371                  * the doorbell.
372                  */
373                 mb();
374
375                 event_idx = le32_to_cpu(*dbbuf_ei);
376                 if (!nvme_dbbuf_need_event(event_idx, value, old_value))
377                         return false;
378         }
379
380         return true;
381 }
382
383 /*
384  * Will slightly overestimate the number of pages needed.  This is OK
385  * as it only leads to a small amount of wasted memory for the lifetime of
386  * the I/O.
387  */
388 static int nvme_pci_npages_prp(void)
389 {
390         unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE;
391         unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE);
392         return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8);
393 }
394
395 static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
396                                 unsigned int hctx_idx)
397 {
398         struct nvme_dev *dev = to_nvme_dev(data);
399         struct nvme_queue *nvmeq = &dev->queues[0];
400
401         WARN_ON(hctx_idx != 0);
402         WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
403
404         hctx->driver_data = nvmeq;
405         return 0;
406 }
407
408 static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
409                           unsigned int hctx_idx)
410 {
411         struct nvme_dev *dev = to_nvme_dev(data);
412         struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
413
414         WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
415         hctx->driver_data = nvmeq;
416         return 0;
417 }
418
419 static int nvme_pci_init_request(struct blk_mq_tag_set *set,
420                 struct request *req, unsigned int hctx_idx,
421                 unsigned int numa_node)
422 {
423         struct nvme_dev *dev = to_nvme_dev(set->driver_data);
424         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
425
426         nvme_req(req)->ctrl = &dev->ctrl;
427         nvme_req(req)->cmd = &iod->cmd;
428         return 0;
429 }
430
431 static int queue_irq_offset(struct nvme_dev *dev)
432 {
433         /* if we have more than 1 vec, admin queue offsets us by 1 */
434         if (dev->num_vecs > 1)
435                 return 1;
436
437         return 0;
438 }
439
440 static void nvme_pci_map_queues(struct blk_mq_tag_set *set)
441 {
442         struct nvme_dev *dev = to_nvme_dev(set->driver_data);
443         int i, qoff, offset;
444
445         offset = queue_irq_offset(dev);
446         for (i = 0, qoff = 0; i < set->nr_maps; i++) {
447                 struct blk_mq_queue_map *map = &set->map[i];
448
449                 map->nr_queues = dev->io_queues[i];
450                 if (!map->nr_queues) {
451                         BUG_ON(i == HCTX_TYPE_DEFAULT);
452                         continue;
453                 }
454
455                 /*
456                  * The poll queue(s) doesn't have an IRQ (and hence IRQ
457                  * affinity), so use the regular blk-mq cpu mapping
458                  */
459                 map->queue_offset = qoff;
460                 if (i != HCTX_TYPE_POLL && offset)
461                         blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
462                 else
463                         blk_mq_map_queues(map);
464                 qoff += map->nr_queues;
465                 offset += map->nr_queues;
466         }
467 }
468
469 /*
470  * Write sq tail if we are asked to, or if the next command would wrap.
471  */
472 static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
473 {
474         if (!write_sq) {
475                 u16 next_tail = nvmeq->sq_tail + 1;
476
477                 if (next_tail == nvmeq->q_depth)
478                         next_tail = 0;
479                 if (next_tail != nvmeq->last_sq_tail)
480                         return;
481         }
482
483         if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
484                         nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
485                 writel(nvmeq->sq_tail, nvmeq->q_db);
486         nvmeq->last_sq_tail = nvmeq->sq_tail;
487 }
488
489 static inline void nvme_sq_copy_cmd(struct nvme_queue *nvmeq,
490                                     struct nvme_command *cmd)
491 {
492         memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
493                 absolute_pointer(cmd), sizeof(*cmd));
494         if (++nvmeq->sq_tail == nvmeq->q_depth)
495                 nvmeq->sq_tail = 0;
496 }
497
498 static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
499 {
500         struct nvme_queue *nvmeq = hctx->driver_data;
501
502         spin_lock(&nvmeq->sq_lock);
503         if (nvmeq->sq_tail != nvmeq->last_sq_tail)
504                 nvme_write_sq_db(nvmeq, true);
505         spin_unlock(&nvmeq->sq_lock);
506 }
507
508 static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
509                                      int nseg)
510 {
511         struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
512         unsigned int avg_seg_size;
513
514         avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
515
516         if (!nvme_ctrl_sgl_supported(&dev->ctrl))
517                 return false;
518         if (!nvmeq->qid)
519                 return false;
520         if (!sgl_threshold || avg_seg_size < sgl_threshold)
521                 return false;
522         return true;
523 }
524
525 static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
526 {
527         const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
528         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
529         dma_addr_t dma_addr = iod->first_dma;
530         int i;
531
532         for (i = 0; i < iod->nr_allocations; i++) {
533                 __le64 *prp_list = iod->list[i].prp_list;
534                 dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
535
536                 dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
537                 dma_addr = next_dma_addr;
538         }
539 }
540
541 static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
542 {
543         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
544
545         if (iod->dma_len) {
546                 dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
547                                rq_dma_dir(req));
548                 return;
549         }
550
551         WARN_ON_ONCE(!iod->sgt.nents);
552
553         dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
554
555         if (iod->nr_allocations == 0)
556                 dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
557                               iod->first_dma);
558         else if (iod->nr_allocations == 1)
559                 dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
560                               iod->first_dma);
561         else
562                 nvme_free_prps(dev, req);
563         mempool_free(iod->sgt.sgl, dev->iod_mempool);
564 }
565
566 static void nvme_print_sgl(struct scatterlist *sgl, int nents)
567 {
568         int i;
569         struct scatterlist *sg;
570
571         for_each_sg(sgl, sg, nents, i) {
572                 dma_addr_t phys = sg_phys(sg);
573                 pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
574                         "dma_address:%pad dma_length:%d\n",
575                         i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
576                         sg_dma_len(sg));
577         }
578 }
579
580 static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
581                 struct request *req, struct nvme_rw_command *cmnd)
582 {
583         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
584         struct dma_pool *pool;
585         int length = blk_rq_payload_bytes(req);
586         struct scatterlist *sg = iod->sgt.sgl;
587         int dma_len = sg_dma_len(sg);
588         u64 dma_addr = sg_dma_address(sg);
589         int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
590         __le64 *prp_list;
591         dma_addr_t prp_dma;
592         int nprps, i;
593
594         length -= (NVME_CTRL_PAGE_SIZE - offset);
595         if (length <= 0) {
596                 iod->first_dma = 0;
597                 goto done;
598         }
599
600         dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
601         if (dma_len) {
602                 dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
603         } else {
604                 sg = sg_next(sg);
605                 dma_addr = sg_dma_address(sg);
606                 dma_len = sg_dma_len(sg);
607         }
608
609         if (length <= NVME_CTRL_PAGE_SIZE) {
610                 iod->first_dma = dma_addr;
611                 goto done;
612         }
613
614         nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
615         if (nprps <= (256 / 8)) {
616                 pool = dev->prp_small_pool;
617                 iod->nr_allocations = 0;
618         } else {
619                 pool = dev->prp_page_pool;
620                 iod->nr_allocations = 1;
621         }
622
623         prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
624         if (!prp_list) {
625                 iod->nr_allocations = -1;
626                 return BLK_STS_RESOURCE;
627         }
628         iod->list[0].prp_list = prp_list;
629         iod->first_dma = prp_dma;
630         i = 0;
631         for (;;) {
632                 if (i == NVME_CTRL_PAGE_SIZE >> 3) {
633                         __le64 *old_prp_list = prp_list;
634                         prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
635                         if (!prp_list)
636                                 goto free_prps;
637                         iod->list[iod->nr_allocations++].prp_list = prp_list;
638                         prp_list[0] = old_prp_list[i - 1];
639                         old_prp_list[i - 1] = cpu_to_le64(prp_dma);
640                         i = 1;
641                 }
642                 prp_list[i++] = cpu_to_le64(dma_addr);
643                 dma_len -= NVME_CTRL_PAGE_SIZE;
644                 dma_addr += NVME_CTRL_PAGE_SIZE;
645                 length -= NVME_CTRL_PAGE_SIZE;
646                 if (length <= 0)
647                         break;
648                 if (dma_len > 0)
649                         continue;
650                 if (unlikely(dma_len < 0))
651                         goto bad_sgl;
652                 sg = sg_next(sg);
653                 dma_addr = sg_dma_address(sg);
654                 dma_len = sg_dma_len(sg);
655         }
656 done:
657         cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sgt.sgl));
658         cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
659         return BLK_STS_OK;
660 free_prps:
661         nvme_free_prps(dev, req);
662         return BLK_STS_RESOURCE;
663 bad_sgl:
664         WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents),
665                         "Invalid SGL for payload:%d nents:%d\n",
666                         blk_rq_payload_bytes(req), iod->sgt.nents);
667         return BLK_STS_IOERR;
668 }
669
670 static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
671                 struct scatterlist *sg)
672 {
673         sge->addr = cpu_to_le64(sg_dma_address(sg));
674         sge->length = cpu_to_le32(sg_dma_len(sg));
675         sge->type = NVME_SGL_FMT_DATA_DESC << 4;
676 }
677
678 static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
679                 dma_addr_t dma_addr, int entries)
680 {
681         sge->addr = cpu_to_le64(dma_addr);
682         sge->length = cpu_to_le32(entries * sizeof(*sge));
683         sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
684 }
685
686 static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
687                 struct request *req, struct nvme_rw_command *cmd)
688 {
689         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
690         struct dma_pool *pool;
691         struct nvme_sgl_desc *sg_list;
692         struct scatterlist *sg = iod->sgt.sgl;
693         unsigned int entries = iod->sgt.nents;
694         dma_addr_t sgl_dma;
695         int i = 0;
696
697         /* setting the transfer type as SGL */
698         cmd->flags = NVME_CMD_SGL_METABUF;
699
700         if (entries == 1) {
701                 nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
702                 return BLK_STS_OK;
703         }
704
705         if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
706                 pool = dev->prp_small_pool;
707                 iod->nr_allocations = 0;
708         } else {
709                 pool = dev->prp_page_pool;
710                 iod->nr_allocations = 1;
711         }
712
713         sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
714         if (!sg_list) {
715                 iod->nr_allocations = -1;
716                 return BLK_STS_RESOURCE;
717         }
718
719         iod->list[0].sg_list = sg_list;
720         iod->first_dma = sgl_dma;
721
722         nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
723         do {
724                 nvme_pci_sgl_set_data(&sg_list[i++], sg);
725                 sg = sg_next(sg);
726         } while (--entries > 0);
727
728         return BLK_STS_OK;
729 }
730
731 static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
732                 struct request *req, struct nvme_rw_command *cmnd,
733                 struct bio_vec *bv)
734 {
735         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
736         unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
737         unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
738
739         iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
740         if (dma_mapping_error(dev->dev, iod->first_dma))
741                 return BLK_STS_RESOURCE;
742         iod->dma_len = bv->bv_len;
743
744         cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
745         if (bv->bv_len > first_prp_len)
746                 cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
747         else
748                 cmnd->dptr.prp2 = 0;
749         return BLK_STS_OK;
750 }
751
752 static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
753                 struct request *req, struct nvme_rw_command *cmnd,
754                 struct bio_vec *bv)
755 {
756         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
757
758         iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
759         if (dma_mapping_error(dev->dev, iod->first_dma))
760                 return BLK_STS_RESOURCE;
761         iod->dma_len = bv->bv_len;
762
763         cmnd->flags = NVME_CMD_SGL_METABUF;
764         cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
765         cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
766         cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
767         return BLK_STS_OK;
768 }
769
770 static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
771                 struct nvme_command *cmnd)
772 {
773         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
774         blk_status_t ret = BLK_STS_RESOURCE;
775         int rc;
776
777         if (blk_rq_nr_phys_segments(req) == 1) {
778                 struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
779                 struct bio_vec bv = req_bvec(req);
780
781                 if (!is_pci_p2pdma_page(bv.bv_page)) {
782                         if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
783                                 return nvme_setup_prp_simple(dev, req,
784                                                              &cmnd->rw, &bv);
785
786                         if (nvmeq->qid && sgl_threshold &&
787                             nvme_ctrl_sgl_supported(&dev->ctrl))
788                                 return nvme_setup_sgl_simple(dev, req,
789                                                              &cmnd->rw, &bv);
790                 }
791         }
792
793         iod->dma_len = 0;
794         iod->sgt.sgl = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
795         if (!iod->sgt.sgl)
796                 return BLK_STS_RESOURCE;
797         sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req));
798         iod->sgt.orig_nents = blk_rq_map_sg(req->q, req, iod->sgt.sgl);
799         if (!iod->sgt.orig_nents)
800                 goto out_free_sg;
801
802         rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req),
803                              DMA_ATTR_NO_WARN);
804         if (rc) {
805                 if (rc == -EREMOTEIO)
806                         ret = BLK_STS_TARGET;
807                 goto out_free_sg;
808         }
809
810         if (nvme_pci_use_sgls(dev, req, iod->sgt.nents))
811                 ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
812         else
813                 ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
814         if (ret != BLK_STS_OK)
815                 goto out_unmap_sg;
816         return BLK_STS_OK;
817
818 out_unmap_sg:
819         dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
820 out_free_sg:
821         mempool_free(iod->sgt.sgl, dev->iod_mempool);
822         return ret;
823 }
824
825 static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
826                 struct nvme_command *cmnd)
827 {
828         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
829
830         iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
831                         rq_dma_dir(req), 0);
832         if (dma_mapping_error(dev->dev, iod->meta_dma))
833                 return BLK_STS_IOERR;
834         cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
835         return BLK_STS_OK;
836 }
837
838 static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
839 {
840         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
841         blk_status_t ret;
842
843         iod->aborted = false;
844         iod->nr_allocations = -1;
845         iod->sgt.nents = 0;
846
847         ret = nvme_setup_cmd(req->q->queuedata, req);
848         if (ret)
849                 return ret;
850
851         if (blk_rq_nr_phys_segments(req)) {
852                 ret = nvme_map_data(dev, req, &iod->cmd);
853                 if (ret)
854                         goto out_free_cmd;
855         }
856
857         if (blk_integrity_rq(req)) {
858                 ret = nvme_map_metadata(dev, req, &iod->cmd);
859                 if (ret)
860                         goto out_unmap_data;
861         }
862
863         nvme_start_request(req);
864         return BLK_STS_OK;
865 out_unmap_data:
866         nvme_unmap_data(dev, req);
867 out_free_cmd:
868         nvme_cleanup_cmd(req);
869         return ret;
870 }
871
872 /*
873  * NOTE: ns is NULL when called on the admin queue.
874  */
875 static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
876                          const struct blk_mq_queue_data *bd)
877 {
878         struct nvme_queue *nvmeq = hctx->driver_data;
879         struct nvme_dev *dev = nvmeq->dev;
880         struct request *req = bd->rq;
881         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
882         blk_status_t ret;
883
884         /*
885          * We should not need to do this, but we're still using this to
886          * ensure we can drain requests on a dying queue.
887          */
888         if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
889                 return BLK_STS_IOERR;
890
891         if (unlikely(!nvme_check_ready(&dev->ctrl, req, true)))
892                 return nvme_fail_nonready_command(&dev->ctrl, req);
893
894         ret = nvme_prep_rq(dev, req);
895         if (unlikely(ret))
896                 return ret;
897         spin_lock(&nvmeq->sq_lock);
898         nvme_sq_copy_cmd(nvmeq, &iod->cmd);
899         nvme_write_sq_db(nvmeq, bd->last);
900         spin_unlock(&nvmeq->sq_lock);
901         return BLK_STS_OK;
902 }
903
904 static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
905 {
906         spin_lock(&nvmeq->sq_lock);
907         while (!rq_list_empty(*rqlist)) {
908                 struct request *req = rq_list_pop(rqlist);
909                 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
910
911                 nvme_sq_copy_cmd(nvmeq, &iod->cmd);
912         }
913         nvme_write_sq_db(nvmeq, true);
914         spin_unlock(&nvmeq->sq_lock);
915 }
916
917 static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
918 {
919         /*
920          * We should not need to do this, but we're still using this to
921          * ensure we can drain requests on a dying queue.
922          */
923         if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
924                 return false;
925         if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
926                 return false;
927
928         req->mq_hctx->tags->rqs[req->tag] = req;
929         return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
930 }
931
/*
 * Batched submission entry point (wired up as ->queue_rqs in nvme_mq_ops).
 *
 * Walks *rqlist, preparing each request with nvme_prep_rq_batch(). Requests
 * that fail preparation are moved to a local requeue list. Whenever the end of
 * the list or a change of mq_hctx is reached, the requests accumulated so far
 * are cut off from the remainder and passed to nvme_submit_cmds().
 *
 * On return *rqlist holds the requests that could not be prepared.
 */
static void nvme_queue_rqs(struct request **rqlist)
{
        /* 'prev' is the last prepared request of the current run, if any */
        struct request *req, *next, *prev = NULL;
        struct request *requeue_list = NULL;

        rq_list_for_each_safe(rqlist, req, next) {
                struct nvme_queue *nvmeq = req->mq_hctx->driver_data;

                if (!nvme_prep_rq_batch(nvmeq, req)) {
                        /* detach 'req' and add to remainder list */
                        rq_list_move(rqlist, &requeue_list, req, prev);

                        /*
                         * Continue as if the previous request were the
                         * current one; with no previous request there is
                         * nothing pending to submit.
                         */
                        req = prev;
                        if (!req)
                                continue;
                }

                if (!next || req->mq_hctx != next->mq_hctx) {
                        /* detach rest of list, and submit */
                        req->rq_next = NULL;
                        nvme_submit_cmds(nvmeq, rqlist);
                        /* restart a fresh run at the remainder */
                        *rqlist = next;
                        prev = NULL;
                } else
                        prev = req;
        }

        /* hand the unprepared requests back to the caller */
        *rqlist = requeue_list;
}
961
962 static __always_inline void nvme_pci_unmap_rq(struct request *req)
963 {
964         struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
965         struct nvme_dev *dev = nvmeq->dev;
966
967         if (blk_integrity_rq(req)) {
968                 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
969
970                 dma_unmap_page(dev->dev, iod->meta_dma,
971                                rq_integrity_vec(req)->bv_len, rq_data_dir(req));
972         }
973
974         if (blk_rq_nr_phys_segments(req))
975                 nvme_unmap_data(dev, req);
976 }
977
/*
 * Complete a single request: release its DMA mappings first, then hand it to
 * nvme_complete_rq(). Used as the ->complete callback of both blk_mq_ops
 * tables in this file.
 */
static void nvme_pci_complete_rq(struct request *req)
{
        nvme_pci_unmap_rq(req);
        nvme_complete_rq(req);
}
983
/*
 * Complete every request collected in @iob, passing nvme_pci_unmap_rq() to
 * nvme_complete_batch() as the per-request unmap callback.
 */
static void nvme_pci_complete_batch(struct io_comp_batch *iob)
{
        nvme_complete_batch(iob, nvme_pci_unmap_rq);
}
988
989 /* We read the CQE phase first to check if the rest of the entry is valid */
990 static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
991 {
992         struct nvme_completion *hcqe = &nvmeq->cqes[nvmeq->cq_head];
993
994         return (le16_to_cpu(READ_ONCE(hcqe->status)) & 1) == nvmeq->cq_phase;
995 }
996
997 static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
998 {
999         u16 head = nvmeq->cq_head;
1000
1001         if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
1002                                               nvmeq->dbbuf_cq_ei))
1003                 writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
1004 }
1005
1006 static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
1007 {
1008         if (!nvmeq->qid)
1009                 return nvmeq->dev->admin_tagset.tags[0];
1010         return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
1011 }
1012
1013 static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
1014                                    struct io_comp_batch *iob, u16 idx)
1015 {
1016         struct nvme_completion *cqe = &nvmeq->cqes[idx];
1017         __u16 command_id = READ_ONCE(cqe->command_id);
1018         struct request *req;
1019
1020         /*
1021          * AEN requests are special as they don't time out and can
1022          * survive any kind of queue freeze and often don't respond to
1023          * aborts.  We don't even bother to allocate a struct request
1024          * for them but rather special case them here.
1025          */
1026         if (unlikely(nvme_is_aen_req(nvmeq->qid, command_id))) {
1027                 nvme_complete_async_event(&nvmeq->dev->ctrl,
1028                                 cqe->status, &cqe->result);
1029                 return;
1030         }
1031
1032         req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id);
1033         if (unlikely(!req)) {
1034                 dev_warn(nvmeq->dev->ctrl.device,
1035                         "invalid id %d completed on queue %d\n",
1036                         command_id, le16_to_cpu(cqe->sq_id));
1037                 return;
1038         }
1039
1040         trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
1041         if (!nvme_try_complete_req(req, cqe->status, cqe->result) &&
1042             !blk_mq_add_to_batch(req, iob, nvme_req(req)->status,
1043                                         nvme_pci_complete_batch))
1044                 nvme_pci_complete_rq(req);
1045 }
1046
1047 static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
1048 {
1049         u32 tmp = nvmeq->cq_head + 1;
1050
1051         if (tmp == nvmeq->q_depth) {
1052                 nvmeq->cq_head = 0;
1053                 nvmeq->cq_phase ^= 1;
1054         } else {
1055                 nvmeq->cq_head = tmp;
1056         }
1057 }
1058
1059 static inline int nvme_poll_cq(struct nvme_queue *nvmeq,
1060                                struct io_comp_batch *iob)
1061 {
1062         int found = 0;
1063
1064         while (nvme_cqe_pending(nvmeq)) {
1065                 found++;
1066                 /*
1067                  * load-load control dependency between phase and the rest of
1068                  * the cqe requires a full read memory barrier
1069                  */
1070                 dma_rmb();
1071                 nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head);
1072                 nvme_update_cq_head(nvmeq);
1073         }
1074
1075         if (found)
1076                 nvme_ring_cq_doorbell(nvmeq);
1077         return found;
1078 }
1079
1080 static irqreturn_t nvme_irq(int irq, void *data)
1081 {
1082         struct nvme_queue *nvmeq = data;
1083         DEFINE_IO_COMP_BATCH(iob);
1084
1085         if (nvme_poll_cq(nvmeq, &iob)) {
1086                 if (!rq_list_empty(iob.req_list))
1087                         nvme_pci_complete_batch(&iob);
1088                 return IRQ_HANDLED;
1089         }
1090         return IRQ_NONE;
1091 }
1092
1093 static irqreturn_t nvme_irq_check(int irq, void *data)
1094 {
1095         struct nvme_queue *nvmeq = data;
1096
1097         if (nvme_cqe_pending(nvmeq))
1098                 return IRQ_WAKE_THREAD;
1099         return IRQ_NONE;
1100 }
1101
1102 /*
1103  * Poll for completions for any interrupt driven queue
1104  * Can be called from any context.
1105  */
1106 static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
1107 {
1108         struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
1109
1110         WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags));
1111
1112         disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
1113         nvme_poll_cq(nvmeq, NULL);
1114         enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
1115 }
1116
1117 static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
1118 {
1119         struct nvme_queue *nvmeq = hctx->driver_data;
1120         bool found;
1121
1122         if (!nvme_cqe_pending(nvmeq))
1123                 return 0;
1124
1125         spin_lock(&nvmeq->cq_poll_lock);
1126         found = nvme_poll_cq(nvmeq, iob);
1127         spin_unlock(&nvmeq->cq_poll_lock);
1128
1129         return found;
1130 }
1131
1132 static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
1133 {
1134         struct nvme_dev *dev = to_nvme_dev(ctrl);
1135         struct nvme_queue *nvmeq = &dev->queues[0];
1136         struct nvme_command c = { };
1137
1138         c.common.opcode = nvme_admin_async_event;
1139         c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
1140
1141         spin_lock(&nvmeq->sq_lock);
1142         nvme_sq_copy_cmd(nvmeq, &c);
1143         nvme_write_sq_db(nvmeq, true);
1144         spin_unlock(&nvmeq->sq_lock);
1145 }
1146
1147 static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
1148 {
1149         struct nvme_command c = { };
1150
1151         c.delete_queue.opcode = opcode;
1152         c.delete_queue.qid = cpu_to_le16(id);
1153
1154         return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
1155 }
1156
1157 static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
1158                 struct nvme_queue *nvmeq, s16 vector)
1159 {
1160         struct nvme_command c = { };
1161         int flags = NVME_QUEUE_PHYS_CONTIG;
1162
1163         if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
1164                 flags |= NVME_CQ_IRQ_ENABLED;
1165
1166         /*
1167          * Note: we (ab)use the fact that the prp fields survive if no data
1168          * is attached to the request.
1169          */
1170         c.create_cq.opcode = nvme_admin_create_cq;
1171         c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
1172         c.create_cq.cqid = cpu_to_le16(qid);
1173         c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
1174         c.create_cq.cq_flags = cpu_to_le16(flags);
1175         c.create_cq.irq_vector = cpu_to_le16(vector);
1176
1177         return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
1178 }
1179
1180 static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
1181                                                 struct nvme_queue *nvmeq)
1182 {
1183         struct nvme_ctrl *ctrl = &dev->ctrl;
1184         struct nvme_command c = { };
1185         int flags = NVME_QUEUE_PHYS_CONTIG;
1186
1187         /*
1188          * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
1189          * set. Since URGENT priority is zeroes, it makes all queues
1190          * URGENT.
1191          */
1192         if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
1193                 flags |= NVME_SQ_PRIO_MEDIUM;
1194
1195         /*
1196          * Note: we (ab)use the fact that the prp fields survive if no data
1197          * is attached to the request.
1198          */
1199         c.create_sq.opcode = nvme_admin_create_sq;
1200         c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
1201         c.create_sq.sqid = cpu_to_le16(qid);
1202         c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
1203         c.create_sq.sq_flags = cpu_to_le16(flags);
1204         c.create_sq.cqid = cpu_to_le16(qid);
1205
1206         return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
1207 }
1208
/* Delete completion queue @cqid via the nvme_admin_delete_cq admin command. */
static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
        return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}
1213
/* Delete submission queue @sqid via the nvme_admin_delete_sq admin command. */
static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
        return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}
1218
1219 static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error)
1220 {
1221         struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
1222
1223         dev_warn(nvmeq->dev->ctrl.device,
1224                  "Abort status: 0x%x", nvme_req(req)->status);
1225         atomic_inc(&nvmeq->dev->ctrl.abort_limit);
1226         blk_mq_free_request(req);
1227         return RQ_END_IO_NONE;
1228 }
1229
1230 static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
1231 {
1232         /* If true, indicates loss of adapter communication, possibly by a
1233          * NVMe Subsystem reset.
1234          */
1235         bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
1236
1237         /* If there is a reset/reinit ongoing, we shouldn't reset again. */
1238         switch (dev->ctrl.state) {
1239         case NVME_CTRL_RESETTING:
1240         case NVME_CTRL_CONNECTING:
1241                 return false;
1242         default:
1243                 break;
1244         }
1245
1246         /* We shouldn't reset unless the controller is on fatal error state
1247          * _or_ if we lost the communication with it.
1248          */
1249         if (!(csts & NVME_CSTS_CFS) && !nssro)
1250                 return false;
1251
1252         return true;
1253 }
1254
1255 static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
1256 {
1257         /* Read a config register to help see what died. */
1258         u16 pci_status;
1259         int result;
1260
1261         result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
1262                                       &pci_status);
1263         if (result == PCIBIOS_SUCCESSFUL)
1264                 dev_warn(dev->ctrl.device,
1265                          "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
1266                          csts, pci_status);
1267         else
1268                 dev_warn(dev->ctrl.device,
1269                          "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
1270                          csts, result);
1271
1272         if (csts != ~0)
1273                 return;
1274
1275         dev_warn(dev->ctrl.device,
1276                  "Does your device have a faulty power saving mode enabled?\n");
1277         dev_warn(dev->ctrl.device,
1278                  "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off\" and report a bug\n");
1279 }
1280
/*
 * blk-mq ->timeout handler for both the admin and the I/O queues.
 *
 * Escalation order, as implemented below:
 *   1. PCI channel offline: just re-arm the timer.
 *   2. Controller reports failure (nvme_should_reset()): disable and reset.
 *   3. Poll the completion queue in case the completion was simply missed.
 *   4. Controller is connecting/deleting: disable it; while resetting,
 *      re-arm the timer.
 *   5. Admin queue command, or an I/O command already aborted once: disable
 *      and reset the controller.
 *   6. Otherwise send an Abort admin command for the request and re-arm the
 *      timer.
 *
 * Returns BLK_EH_DONE when the request has been (or will be) completed by
 * the paths taken here, BLK_EH_RESET_TIMER to keep waiting.
 */
static enum blk_eh_timer_return nvme_timeout(struct request *req)
{
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
        struct nvme_dev *dev = nvmeq->dev;
        struct request *abort_req;
        struct nvme_command cmd = { };
        u32 csts = readl(dev->bar + NVME_REG_CSTS);

        /* If PCI error recovery process is happening, we cannot reset or
         * the recovery mechanism will surely fail.
         */
        mb();
        if (pci_channel_offline(to_pci_dev(dev->dev)))
                return BLK_EH_RESET_TIMER;

        /*
         * Reset immediately if the controller is failed
         */
        if (nvme_should_reset(dev, csts)) {
                nvme_warn_reset(dev, csts);
                nvme_dev_disable(dev, false);
                nvme_reset_ctrl(&dev->ctrl);
                return BLK_EH_DONE;
        }

        /*
         * Did we miss an interrupt?
         */
        if (test_bit(NVMEQ_POLLED, &nvmeq->flags))
                nvme_poll(req->mq_hctx, NULL);
        else
                nvme_poll_irqdisable(nvmeq);

        /* The poll above completed the request: nothing left to time out. */
        if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT) {
                dev_warn(dev->ctrl.device,
                         "I/O %d QID %d timeout, completion polled\n",
                         req->tag, nvmeq->qid);
                return BLK_EH_DONE;
        }

        /*
         * Shutdown immediately if controller times out while starting. The
         * reset work will see the pci device disabled when it gets the forced
         * cancellation error. All outstanding requests are completed on
         * shutdown, so we return BLK_EH_DONE.
         */
        switch (dev->ctrl.state) {
        case NVME_CTRL_CONNECTING:
                nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
                fallthrough;
        case NVME_CTRL_DELETING:
                dev_warn_ratelimited(dev->ctrl.device,
                         "I/O %d QID %d timeout, disable controller\n",
                         req->tag, nvmeq->qid);
                nvme_req(req)->flags |= NVME_REQ_CANCELLED;
                nvme_dev_disable(dev, true);
                return BLK_EH_DONE;
        case NVME_CTRL_RESETTING:
                return BLK_EH_RESET_TIMER;
        default:
                break;
        }

        /*
         * Shutdown the controller immediately and schedule a reset if the
         * command was already aborted once before and still hasn't been
         * returned to the driver, or if this is the admin queue.
         */
        if (!nvmeq->qid || iod->aborted) {
                dev_warn(dev->ctrl.device,
                         "I/O %d QID %d timeout, reset controller\n",
                         req->tag, nvmeq->qid);
                nvme_req(req)->flags |= NVME_REQ_CANCELLED;
                nvme_dev_disable(dev, false);
                nvme_reset_ctrl(&dev->ctrl);

                return BLK_EH_DONE;
        }

        /*
         * Take one abort credit; if none is left, restore the counter and
         * simply wait for another timeout period.
         */
        if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
                atomic_inc(&dev->ctrl.abort_limit);
                return BLK_EH_RESET_TIMER;
        }
        /* Remember the abort so a second timeout escalates to a reset. */
        iod->aborted = true;

        cmd.abort.opcode = nvme_admin_abort_cmd;
        cmd.abort.cid = nvme_cid(req);
        cmd.abort.sqid = cpu_to_le16(nvmeq->qid);

        dev_warn(nvmeq->dev->ctrl.device,
                "I/O %d (%s) QID %d timeout, aborting\n",
                 req->tag,
                 nvme_get_opcode_str(nvme_req(req)->cmd->common.opcode),
                 nvmeq->qid);

        /* Non-blocking allocation; on failure return the abort credit. */
        abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
                                         BLK_MQ_REQ_NOWAIT);
        if (IS_ERR(abort_req)) {
                atomic_inc(&dev->ctrl.abort_limit);
                return BLK_EH_RESET_TIMER;
        }
        nvme_init_request(abort_req, &cmd);

        /* abort_endio() logs the status and frees the abort request. */
        abort_req->end_io = abort_endio;
        abort_req->end_io_data = NULL;
        blk_execute_rq_nowait(abort_req, false);

        /*
         * The aborted req will be completed on receiving the abort req.
         * We enable the timer again. If hit twice, it'll cause a device reset,
         * as the device then is in a faulty state.
         */
        return BLK_EH_RESET_TIMER;
}
1396
1397 static void nvme_free_queue(struct nvme_queue *nvmeq)
1398 {
1399         dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
1400                                 (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
1401         if (!nvmeq->sq_cmds)
1402                 return;
1403
1404         if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
1405                 pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
1406                                 nvmeq->sq_cmds, SQ_SIZE(nvmeq));
1407         } else {
1408                 dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq),
1409                                 nvmeq->sq_cmds, nvmeq->sq_dma_addr);
1410         }
1411 }
1412
1413 static void nvme_free_queues(struct nvme_dev *dev, int lowest)
1414 {
1415         int i;
1416
1417         for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
1418                 dev->ctrl.queue_count--;
1419                 nvme_free_queue(&dev->queues[i]);
1420         }
1421 }
1422
1423 static void nvme_suspend_queue(struct nvme_dev *dev, unsigned int qid)
1424 {
1425         struct nvme_queue *nvmeq = &dev->queues[qid];
1426
1427         if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
1428                 return;
1429
1430         /* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
1431         mb();
1432
1433         nvmeq->dev->online_queues--;
1434         if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
1435                 nvme_quiesce_admin_queue(&nvmeq->dev->ctrl);
1436         if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
1437                 pci_free_irq(to_pci_dev(dev->dev), nvmeq->cq_vector, nvmeq);
1438 }
1439
1440 static void nvme_suspend_io_queues(struct nvme_dev *dev)
1441 {
1442         int i;
1443
1444         for (i = dev->ctrl.queue_count - 1; i > 0; i--)
1445                 nvme_suspend_queue(dev, i);
1446 }
1447
1448 /*
1449  * Called only on a device that has been disabled and after all other threads
1450  * that can check this device's completion queues have synced, except
1451  * nvme_poll(). This is the last chance for the driver to see a natural
1452  * completion before nvme_cancel_request() terminates all incomplete requests.
1453  */
1454 static void nvme_reap_pending_cqes(struct nvme_dev *dev)
1455 {
1456         int i;
1457
1458         for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
1459                 spin_lock(&dev->queues[i].cq_poll_lock);
1460                 nvme_poll_cq(&dev->queues[i], NULL);
1461                 spin_unlock(&dev->queues[i].cq_poll_lock);
1462         }
1463 }
1464
1465 static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
1466                                 int entry_size)
1467 {
1468         int q_depth = dev->q_depth;
1469         unsigned q_size_aligned = roundup(q_depth * entry_size,
1470                                           NVME_CTRL_PAGE_SIZE);
1471
1472         if (q_size_aligned * nr_io_queues > dev->cmb_size) {
1473                 u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
1474
1475                 mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE);
1476                 q_depth = div_u64(mem_per_q, entry_size);
1477
1478                 /*
1479                  * Ensure the reduced q_depth is above some threshold where it
1480                  * would be better to map queues in system memory with the
1481                  * original depth
1482                  */
1483                 if (q_depth < 64)
1484                         return -ENOMEM;
1485         }
1486
1487         return q_depth;
1488 }
1489
1490 static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
1491                                 int qid)
1492 {
1493         struct pci_dev *pdev = to_pci_dev(dev->dev);
1494
1495         if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
1496                 nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq));
1497                 if (nvmeq->sq_cmds) {
1498                         nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
1499                                                         nvmeq->sq_cmds);
1500                         if (nvmeq->sq_dma_addr) {
1501                                 set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
1502                                 return 0;
1503                         }
1504
1505                         pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq));
1506                 }
1507         }
1508
1509         nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq),
1510                                 &nvmeq->sq_dma_addr, GFP_KERNEL);
1511         if (!nvmeq->sq_cmds)
1512                 return -ENOMEM;
1513         return 0;
1514 }
1515
1516 static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
1517 {
1518         struct nvme_queue *nvmeq = &dev->queues[qid];
1519
1520         if (dev->ctrl.queue_count > qid)
1521                 return 0;
1522
1523         nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
1524         nvmeq->q_depth = depth;
1525         nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
1526                                          &nvmeq->cq_dma_addr, GFP_KERNEL);
1527         if (!nvmeq->cqes)
1528                 goto free_nvmeq;
1529
1530         if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
1531                 goto free_cqdma;
1532
1533         nvmeq->dev = dev;
1534         spin_lock_init(&nvmeq->sq_lock);
1535         spin_lock_init(&nvmeq->cq_poll_lock);
1536         nvmeq->cq_head = 0;
1537         nvmeq->cq_phase = 1;
1538         nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
1539         nvmeq->qid = qid;
1540         dev->ctrl.queue_count++;
1541
1542         return 0;
1543
1544  free_cqdma:
1545         dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
1546                           nvmeq->cq_dma_addr);
1547  free_nvmeq:
1548         return -ENOMEM;
1549 }
1550
1551 static int queue_request_irq(struct nvme_queue *nvmeq)
1552 {
1553         struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
1554         int nr = nvmeq->dev->ctrl.instance;
1555
1556         if (use_threaded_interrupts) {
1557                 return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
1558                                 nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
1559         } else {
1560                 return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
1561                                 NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
1562         }
1563 }
1564
/*
 * Reset the software state of @nvmeq before it is (re)used as queue @qid:
 * rewind the SQ tail and CQ head, restore the initial phase, recompute the
 * doorbell pointer, clear the completion entries, initialise the doorbell
 * buffer state and count the queue as online.
 */
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
        struct nvme_dev *dev = nvmeq->dev;

        nvmeq->sq_tail = 0;
        nvmeq->last_sq_tail = 0;
        nvmeq->cq_head = 0;
        /* CQ memory is zeroed below, so the first valid entry has phase 1 */
        nvmeq->cq_phase = 1;
        nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
        memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
        nvme_dbbuf_init(dev, nvmeq, qid);
        dev->online_queues++;
        wmb(); /* ensure the first interrupt sees the initialization */
}
1579
1580 /*
1581  * Try getting shutdown_lock while setting up IO queues.
1582  */
1583 static int nvme_setup_io_queues_trylock(struct nvme_dev *dev)
1584 {
1585         /*
1586          * Give up if the lock is being held by nvme_dev_disable.
1587          */
1588         if (!mutex_trylock(&dev->shutdown_lock))
1589                 return -ENODEV;
1590
1591         /*
1592          * Controller is in wrong state, fail early.
1593          */
1594         if (dev->ctrl.state != NVME_CTRL_CONNECTING) {
1595                 mutex_unlock(&dev->shutdown_lock);
1596                 return -ENODEV;
1597         }
1598
1599         return 0;
1600 }
1601
/*
 * Create I/O queue @qid on the controller and bring it online on the host:
 * issue the create-CQ and create-SQ admin commands, initialise the queue
 * state under shutdown_lock, request its interrupt (unless @polled) and mark
 * the queue NVMEQ_ENABLED.
 *
 * Returns 0 on success. Otherwise returns the failing step's result, which
 * may be a negative errno or a positive value returned by the admin command
 * helpers.
 */
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
{
        struct nvme_dev *dev = nvmeq->dev;
        int result;
        u16 vector = 0;

        clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);

        /*
         * A queue's vector matches the queue identifier unless the controller
         * has only one vector available.
         */
        if (!polled)
                vector = dev->num_vecs == 1 ? 0 : qid;
        else
                set_bit(NVMEQ_POLLED, &nvmeq->flags);

        result = adapter_alloc_cq(dev, qid, nvmeq, vector);
        if (result)
                return result;

        /*
         * A negative result returns without deleting the CQ created above;
         * only a positive result goes through release_cq.
         */
        result = adapter_alloc_sq(dev, qid, nvmeq);
        if (result < 0)
                return result;
        if (result)
                goto release_cq;

        nvmeq->cq_vector = vector;

        /* On success shutdown_lock is held until the unlock calls below. */
        result = nvme_setup_io_queues_trylock(dev);
        if (result)
                return result;
        nvme_init_queue(nvmeq, qid);
        if (!polled) {
                result = queue_request_irq(nvmeq);
                if (result < 0)
                        goto release_sq;
        }

        set_bit(NVMEQ_ENABLED, &nvmeq->flags);
        mutex_unlock(&dev->shutdown_lock);
        return result;

release_sq:
        /* undo the online_queues increment done by nvme_init_queue() */
        dev->online_queues--;
        mutex_unlock(&dev->shutdown_lock);
        adapter_delete_sq(dev, qid);
release_cq:
        adapter_delete_cq(dev, qid);
        return result;
}
1653
/*
 * blk-mq operations for the admin queue. Unlike nvme_mq_ops this table has
 * no batched submission, commit, queue mapping or poll callbacks.
 */
static const struct blk_mq_ops nvme_mq_admin_ops = {
        .queue_rq       = nvme_queue_rq,
        .complete       = nvme_pci_complete_rq,
        .init_hctx      = nvme_admin_init_hctx,
        .init_request   = nvme_pci_init_request,
        .timeout        = nvme_timeout,
};
1661
/*
 * blk-mq operations for the I/O queues: single and batched submission,
 * completion, doorbell commit, hctx/request initialisation, queue mapping,
 * timeout handling and polling.
 */
static const struct blk_mq_ops nvme_mq_ops = {
        .queue_rq       = nvme_queue_rq,
        .queue_rqs      = nvme_queue_rqs,
        .complete       = nvme_pci_complete_rq,
        .commit_rqs     = nvme_commit_rqs,
        .init_hctx      = nvme_init_hctx,
        .init_request   = nvme_pci_init_request,
        .map_queues     = nvme_pci_map_queues,
        .timeout        = nvme_timeout,
        .poll           = nvme_poll,
};
1673
1674 static void nvme_dev_remove_admin(struct nvme_dev *dev)
1675 {
1676         if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
1677                 /*
1678                  * If the controller was reset during removal, it's possible
1679                  * user requests may be waiting on a stopped queue. Start the
1680                  * queue to flush these to completion.
1681                  */
1682                 nvme_unquiesce_admin_queue(&dev->ctrl);
1683                 nvme_remove_admin_tag_set(&dev->ctrl);
1684         }
1685 }
1686
1687 static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
1688 {
1689         return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
1690 }
1691
1692 static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
1693 {
1694         struct pci_dev *pdev = to_pci_dev(dev->dev);
1695
1696         if (size <= dev->bar_mapped_size)
1697                 return 0;
1698         if (size > pci_resource_len(pdev, 0))
1699                 return -ENOMEM;
1700         if (dev->bar)
1701                 iounmap(dev->bar);
1702         dev->bar = ioremap(pci_resource_start(pdev, 0), size);
1703         if (!dev->bar) {
1704                 dev->bar_mapped_size = 0;
1705                 return -ENOMEM;
1706         }
1707         dev->bar_mapped_size = size;
1708         dev->dbs = dev->bar + NVME_REG_DBS;
1709
1710         return 0;
1711 }
1712
/*
 * Bring up the admin queue: map enough of the BAR for the admin doorbells,
 * detect subsystem reset support, disable the controller, allocate queue 0,
 * program the admin queue registers, enable the controller and finally
 * request the admin interrupt.
 *
 * Returns 0 on success or the error from the first step that failed.
 */
static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
{
        int result;
        u32 aqa;
        struct nvme_queue *nvmeq;

        /* only the admin queue doorbells are needed at this point */
        result = nvme_remap_bar(dev, db_bar_size(dev, 0));
        if (result < 0)
                return result;

        /* NSSRC is only consulted when the version register is >= 1.1 */
        dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
                                NVME_CAP_NSSRC(dev->ctrl.cap) : 0;

        /* if NSSRO is currently set, write it back to the status register */
        if (dev->subsystem &&
            (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
                writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);

        /*
         * If the device has been passed off to us in an enabled state, just
         * clear the enabled bit.  The spec says we should set the 'shutdown
         * notification bits', but doing so may cause the device to complete
         * commands to the admin queue ... and we don't know what memory that
         * might be pointing at!
         */
        result = nvme_disable_ctrl(&dev->ctrl, false);
        if (result < 0)
                return result;

        result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
        if (result)
                return result;

        dev->ctrl.numa_node = dev_to_node(dev->dev);

        nvmeq = &dev->queues[0];
        /* the same (depth - 1) value goes into both 16-bit halves of AQA */
        aqa = nvmeq->q_depth - 1;
        aqa |= aqa << 16;

        writel(aqa, dev->bar + NVME_REG_AQA);
        lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
        lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);

        result = nvme_enable_ctrl(&dev->ctrl);
        if (result)
                return result;

        /* the admin queue uses interrupt vector 0 */
        nvmeq->cq_vector = 0;
        nvme_init_queue(nvmeq, 0);
        result = queue_request_irq(nvmeq);
        if (result) {
                /* undo the online_queues increment of nvme_init_queue() */
                dev->online_queues--;
                return result;
        }

        set_bit(NVMEQ_ENABLED, &nvmeq->flags);
        return result;
}
1770
1771 static int nvme_create_io_queues(struct nvme_dev *dev)
1772 {
1773         unsigned i, max, rw_queues;
1774         int ret = 0;
1775
1776         for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
1777                 if (nvme_alloc_queue(dev, i, dev->q_depth)) {
1778                         ret = -ENOMEM;
1779                         break;
1780                 }
1781         }
1782
1783         max = min(dev->max_qid, dev->ctrl.queue_count - 1);
1784         if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
1785                 rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
1786                                 dev->io_queues[HCTX_TYPE_READ];
1787         } else {
1788                 rw_queues = max;
1789         }
1790
1791         for (i = dev->online_queues; i <= max; i++) {
1792                 bool polled = i > rw_queues;
1793
1794                 ret = nvme_create_queue(&dev->queues[i], i, polled);
1795                 if (ret)
1796                         break;
1797         }
1798
1799         /*
1800          * Ignore failing Create SQ/CQ commands, we can continue with less
1801          * than the desired amount of queues, and even a controller without
1802          * I/O queues can still be used to issue admin commands.  This might
1803          * be useful to upgrade a buggy firmware for example.
1804          */
1805         return ret >= 0 ? 0 : ret;
1806 }
1807
1808 static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
1809 {
1810         u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;
1811
1812         return 1ULL << (12 + 4 * szu);
1813 }
1814
1815 static u32 nvme_cmb_size(struct nvme_dev *dev)
1816 {
1817         return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
1818 }
1819
1820 static void nvme_map_cmb(struct nvme_dev *dev)
1821 {
1822         u64 size, offset;
1823         resource_size_t bar_size;
1824         struct pci_dev *pdev = to_pci_dev(dev->dev);
1825         int bar;
1826
1827         if (dev->cmb_size)
1828                 return;
1829
1830         if (NVME_CAP_CMBS(dev->ctrl.cap))
1831                 writel(NVME_CMBMSC_CRE, dev->bar + NVME_REG_CMBMSC);
1832
1833         dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
1834         if (!dev->cmbsz)
1835                 return;
1836         dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
1837
1838         size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
1839         offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
1840         bar = NVME_CMB_BIR(dev->cmbloc);
1841         bar_size = pci_resource_len(pdev, bar);
1842
1843         if (offset > bar_size)
1844                 return;
1845
1846         /*
1847          * Tell the controller about the host side address mapping the CMB,
1848          * and enable CMB decoding for the NVMe 1.4+ scheme:
1849          */
1850         if (NVME_CAP_CMBS(dev->ctrl.cap)) {
1851                 hi_lo_writeq(NVME_CMBMSC_CRE | NVME_CMBMSC_CMSE |
1852                              (pci_bus_address(pdev, bar) + offset),
1853                              dev->bar + NVME_REG_CMBMSC);
1854         }
1855
1856         /*
1857          * Controllers may support a CMB size larger than their BAR,
1858          * for example, due to being behind a bridge. Reduce the CMB to
1859          * the reported size of the BAR
1860          */
1861         if (size > bar_size - offset)
1862                 size = bar_size - offset;
1863
1864         if (pci_p2pdma_add_resource(pdev, bar, size, offset)) {
1865                 dev_warn(dev->ctrl.device,
1866                          "failed to register the CMB\n");
1867                 return;
1868         }
1869
1870         dev->cmb_size = size;
1871         dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS);
1872
1873         if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
1874                         (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
1875                 pci_p2pmem_publish(pdev, true);
1876
1877         nvme_update_attrs(dev);
1878 }
1879
1880 static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
1881 {
1882         u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT;
1883         u64 dma_addr = dev->host_mem_descs_dma;
1884         struct nvme_command c = { };
1885         int ret;
1886
1887         c.features.opcode       = nvme_admin_set_features;
1888         c.features.fid          = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
1889         c.features.dword11      = cpu_to_le32(bits);
1890         c.features.dword12      = cpu_to_le32(host_mem_size);
1891         c.features.dword13      = cpu_to_le32(lower_32_bits(dma_addr));
1892         c.features.dword14      = cpu_to_le32(upper_32_bits(dma_addr));
1893         c.features.dword15      = cpu_to_le32(dev->nr_host_mem_descs);
1894
1895         ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
1896         if (ret) {
1897                 dev_warn(dev->ctrl.device,
1898                          "failed to set host mem (err %d, flags %#x).\n",
1899                          ret, bits);
1900         } else
1901                 dev->hmb = bits & NVME_HOST_MEM_ENABLE;
1902
1903         return ret;
1904 }
1905
1906 static void nvme_free_host_mem(struct nvme_dev *dev)
1907 {
1908         int i;
1909
1910         for (i = 0; i < dev->nr_host_mem_descs; i++) {
1911                 struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
1912                 size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE;
1913
1914                 dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i],
1915                                le64_to_cpu(desc->addr),
1916                                DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
1917         }
1918
1919         kfree(dev->host_mem_desc_bufs);
1920         dev->host_mem_desc_bufs = NULL;
1921         dma_free_coherent(dev->dev,
1922                         dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
1923                         dev->host_mem_descs, dev->host_mem_descs_dma);
1924         dev->host_mem_descs = NULL;
1925         dev->nr_host_mem_descs = 0;
1926 }
1927
1928 static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
1929                 u32 chunk_size)
1930 {
1931         struct nvme_host_mem_buf_desc *descs;
1932         u32 max_entries, len;
1933         dma_addr_t descs_dma;
1934         int i = 0;
1935         void **bufs;
1936         u64 size, tmp;
1937
1938         tmp = (preferred + chunk_size - 1);
1939         do_div(tmp, chunk_size);
1940         max_entries = tmp;
1941
1942         if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
1943                 max_entries = dev->ctrl.hmmaxd;
1944
1945         descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs),
1946                                    &descs_dma, GFP_KERNEL);
1947         if (!descs)
1948                 goto out;
1949
1950         bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
1951         if (!bufs)
1952                 goto out_free_descs;
1953
1954         for (size = 0; size < preferred && i < max_entries; size += len) {
1955                 dma_addr_t dma_addr;
1956
1957                 len = min_t(u64, chunk_size, preferred - size);
1958                 bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL,
1959                                 DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
1960                 if (!bufs[i])
1961                         break;
1962
1963                 descs[i].addr = cpu_to_le64(dma_addr);
1964                 descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE);
1965                 i++;
1966         }
1967
1968         if (!size)
1969                 goto out_free_bufs;
1970
1971         dev->nr_host_mem_descs = i;
1972         dev->host_mem_size = size;
1973         dev->host_mem_descs = descs;
1974         dev->host_mem_descs_dma = descs_dma;
1975         dev->host_mem_desc_bufs = bufs;
1976         return 0;
1977
1978 out_free_bufs:
1979         while (--i >= 0) {
1980                 size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE;
1981
1982                 dma_free_attrs(dev->dev, size, bufs[i],
1983                                le64_to_cpu(descs[i].addr),
1984                                DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
1985         }
1986
1987         kfree(bufs);
1988 out_free_descs:
1989         dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
1990                         descs_dma);
1991 out:
1992         dev->host_mem_descs = NULL;
1993         return -ENOMEM;
1994 }
1995
1996 static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
1997 {
1998         u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
1999         u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
2000         u64 chunk_size;
2001
2002         /* start big and work our way down */
2003         for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) {
2004                 if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
2005                         if (!min || dev->host_mem_size >= min)
2006                                 return 0;
2007                         nvme_free_host_mem(dev);
2008                 }
2009         }
2010
2011         return -ENOMEM;
2012 }
2013
2014 static int nvme_setup_host_mem(struct nvme_dev *dev)
2015 {
2016         u64 max = (u64)max_host_mem_size_mb * SZ_1M;
2017         u64 preferred = (u64)dev->ctrl.hmpre * 4096;
2018         u64 min = (u64)dev->ctrl.hmmin * 4096;
2019         u32 enable_bits = NVME_HOST_MEM_ENABLE;
2020         int ret;
2021
2022         if (!dev->ctrl.hmpre)
2023                 return 0;
2024
2025         preferred = min(preferred, max);
2026         if (min > max) {
2027                 dev_warn(dev->ctrl.device,
2028                         "min host memory (%lld MiB) above limit (%d MiB).\n",
2029                         min >> ilog2(SZ_1M), max_host_mem_size_mb);
2030                 nvme_free_host_mem(dev);
2031                 return 0;
2032         }
2033
2034         /*
2035          * If we already have a buffer allocated check if we can reuse it.
2036          */
2037         if (dev->host_mem_descs) {
2038                 if (dev->host_mem_size >= min)
2039                         enable_bits |= NVME_HOST_MEM_RETURN;
2040                 else
2041                         nvme_free_host_mem(dev);
2042         }
2043
2044         if (!dev->host_mem_descs) {
2045                 if (nvme_alloc_host_mem(dev, min, preferred)) {
2046                         dev_warn(dev->ctrl.device,
2047                                 "failed to allocate host memory buffer.\n");
2048                         return 0; /* controller must work without HMB */
2049                 }
2050
2051                 dev_info(dev->ctrl.device,
2052                         "allocated %lld MiB host memory buffer.\n",
2053                         dev->host_mem_size >> ilog2(SZ_1M));
2054         }
2055
2056         ret = nvme_set_host_mem(dev, enable_bits);
2057         if (ret)
2058                 nvme_free_host_mem(dev);
2059         return ret;
2060 }
2061
2062 static ssize_t cmb_show(struct device *dev, struct device_attribute *attr,
2063                 char *buf)
2064 {
2065         struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
2066
2067         return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz  : x%08x\n",
2068                        ndev->cmbloc, ndev->cmbsz);
2069 }
2070 static DEVICE_ATTR_RO(cmb);
2071
2072 static ssize_t cmbloc_show(struct device *dev, struct device_attribute *attr,
2073                 char *buf)
2074 {
2075         struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
2076
2077         return sysfs_emit(buf, "%u\n", ndev->cmbloc);
2078 }
2079 static DEVICE_ATTR_RO(cmbloc);
2080
2081 static ssize_t cmbsz_show(struct device *dev, struct device_attribute *attr,
2082                 char *buf)
2083 {
2084         struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
2085
2086         return sysfs_emit(buf, "%u\n", ndev->cmbsz);
2087 }
2088 static DEVICE_ATTR_RO(cmbsz);
2089
2090 static ssize_t hmb_show(struct device *dev, struct device_attribute *attr,
2091                         char *buf)
2092 {
2093         struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
2094
2095         return sysfs_emit(buf, "%d\n", ndev->hmb);
2096 }
2097
2098 static ssize_t hmb_store(struct device *dev, struct device_attribute *attr,
2099                          const char *buf, size_t count)
2100 {
2101         struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
2102         bool new;
2103         int ret;
2104
2105         if (kstrtobool(buf, &new) < 0)
2106                 return -EINVAL;
2107
2108         if (new == ndev->hmb)
2109                 return count;
2110
2111         if (new) {
2112                 ret = nvme_setup_host_mem(ndev);
2113         } else {
2114                 ret = nvme_set_host_mem(ndev, 0);
2115                 if (!ret)
2116                         nvme_free_host_mem(ndev);
2117         }
2118
2119         if (ret < 0)
2120                 return ret;
2121
2122         return count;
2123 }
2124 static DEVICE_ATTR_RW(hmb);
2125
2126 static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj,
2127                 struct attribute *a, int n)
2128 {
2129         struct nvme_ctrl *ctrl =
2130                 dev_get_drvdata(container_of(kobj, struct device, kobj));
2131         struct nvme_dev *dev = to_nvme_dev(ctrl);
2132
2133         if (a == &dev_attr_cmb.attr ||
2134             a == &dev_attr_cmbloc.attr ||
2135             a == &dev_attr_cmbsz.attr) {
2136                 if (!dev->cmbsz)
2137                         return 0;
2138         }
2139         if (a == &dev_attr_hmb.attr && !ctrl->hmpre)
2140                 return 0;
2141
2142         return a->mode;
2143 }
2144
/*
 * PCI-transport sysfs attributes.  Which of them are actually shown is
 * decided per attribute by nvme_pci_attrs_are_visible().
 */
static struct attribute *nvme_pci_attrs[] = {
        &dev_attr_cmb.attr,
        &dev_attr_cmbloc.attr,
        &dev_attr_cmbsz.attr,
        &dev_attr_hmb.attr,
        NULL,
};
2152
/* Attribute group binding nvme_pci_attrs to its visibility callback. */
static const struct attribute_group nvme_pci_dev_attrs_group = {
        .attrs          = nvme_pci_attrs,
        .is_visible     = nvme_pci_attrs_are_visible,
};
2157
/*
 * NULL-terminated list of attribute groups: nvme_dev_attrs_group followed
 * by the PCI-specific group defined in this file.
 */
static const struct attribute_group *nvme_pci_dev_attr_groups[] = {
        &nvme_dev_attrs_group,
        &nvme_pci_dev_attrs_group,
        NULL,
};
2163
2164 static void nvme_update_attrs(struct nvme_dev *dev)
2165 {
2166         sysfs_update_group(&dev->ctrl.device->kobj, &nvme_pci_dev_attrs_group);
2167 }
2168
2169 /*
2170  * nirqs is the number of interrupts available for write and read
2171  * queues. The core already reserved an interrupt for the admin queue.
2172  */
2173 static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
2174 {
2175         struct nvme_dev *dev = affd->priv;
2176         unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues;
2177
2178         /*
2179          * If there is no interrupt available for queues, ensure that
2180          * the default queue is set to 1. The affinity set size is
2181          * also set to one, but the irq core ignores it for this case.
2182          *
2183          * If only one interrupt is available or 'write_queue' == 0, combine
2184          * write and read queues.
2185          *
2186          * If 'write_queues' > 0, ensure it leaves room for at least one read
2187          * queue.
2188          */
2189         if (!nrirqs) {
2190                 nrirqs = 1;
2191                 nr_read_queues = 0;
2192         } else if (nrirqs == 1 || !nr_write_queues) {
2193                 nr_read_queues = 0;
2194         } else if (nr_write_queues >= nrirqs) {
2195                 nr_read_queues = 1;
2196         } else {
2197                 nr_read_queues = nrirqs - nr_write_queues;
2198         }
2199
2200         dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
2201         affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
2202         dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
2203         affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
2204         affd->nr_sets = nr_read_queues ? 2 : 1;
2205 }
2206
2207 static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
2208 {
2209         struct pci_dev *pdev = to_pci_dev(dev->dev);
2210         struct irq_affinity affd = {
2211                 .pre_vectors    = 1,
2212                 .calc_sets      = nvme_calc_irq_sets,
2213                 .priv           = dev,
2214         };
2215         unsigned int irq_queues, poll_queues;
2216
2217         /*
2218          * Poll queues don't need interrupts, but we need at least one I/O queue
2219          * left over for non-polled I/O.
2220          */
2221         poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1);
2222         dev->io_queues[HCTX_TYPE_POLL] = poll_queues;
2223
2224         /*
2225          * Initialize for the single interrupt case, will be updated in
2226          * nvme_calc_irq_sets().
2227          */
2228         dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
2229         dev->io_queues[HCTX_TYPE_READ] = 0;
2230
2231         /*
2232          * We need interrupts for the admin queue and each non-polled I/O queue,
2233          * but some Apple controllers require all queues to use the first
2234          * vector.
2235          */
2236         irq_queues = 1;
2237         if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR))
2238                 irq_queues += (nr_io_queues - poll_queues);
2239         return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
2240                               PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
2241 }
2242
2243 static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
2244 {
2245         /*
2246          * If tags are shared with admin queue (Apple bug), then
2247          * make sure we only use one IO queue.
2248          */
2249         if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
2250                 return 1;
2251         return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
2252 }
2253
/*
 * Negotiate the number of I/O queues with the controller, size the BAR
 * mapping and interrupt vectors for them, and create the queues.
 *
 * Runs partly under dev->shutdown_lock (taken via
 * nvme_setup_io_queues_trylock()).  If fewer queues come online than
 * max_qid allows, the I/O queues are deleted and the vector setup is
 * retried with the smaller count.
 *
 * Returns 0 on success (including the case of a controller offering no I/O
 * queues) or a negative error code.
 */
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
        struct nvme_queue *adminq = &dev->queues[0];
        struct pci_dev *pdev = to_pci_dev(dev->dev);
        unsigned int nr_io_queues;
        unsigned long size;
        int result;

        /*
         * Sample the module parameters once at reset time so that we have
         * stable values to work with.
         */
        dev->nr_write_queues = write_queues;
        dev->nr_poll_queues = poll_queues;

        nr_io_queues = dev->nr_allocated_queues - 1;
        result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
        if (result < 0)
                return result;

        if (nr_io_queues == 0)
                return 0;

        /*
         * Free the IRQ resources as soon as the NVMEQ_ENABLED bit
         * transitions from set to unset. If there were a window before the
         * IRQ is truly freed, pci_free_irq_vectors() jumping into this
         * window would crash. Take the lock to avoid racing with
         * pci_free_irq_vectors() in the nvme_dev_disable() path.
         */
        result = nvme_setup_io_queues_trylock(dev);
        if (result)
                return result;
        if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags))
                pci_free_irq(pdev, 0, adminq);

        /* Submission queues in the CMB may force a smaller queue depth. */
        if (dev->cmb_use_sqes) {
                result = nvme_cmb_qdepth(dev, nr_io_queues,
                                sizeof(struct nvme_command));
                if (result > 0) {
                        dev->q_depth = result;
                        dev->ctrl.sqsize = result - 1;
                } else {
                        dev->cmb_use_sqes = false;
                }
        }

        /*
         * Map enough of the BAR to reach every doorbell; drop one queue at
         * a time until the mapping succeeds.
         */
        do {
                size = db_bar_size(dev, nr_io_queues);
                result = nvme_remap_bar(dev, size);
                if (!result)
                        break;
                if (!--nr_io_queues) {
                        result = -ENOMEM;
                        goto out_unlock;
                }
        } while (1);
        /* The remap may have moved the BAR mapping. */
        adminq->q_db = dev->dbs;

 retry:
        /* Deregister the admin queue's interrupt */
        if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags))
                pci_free_irq(pdev, 0, adminq);

        /*
         * If we enable msix early due to not intx, disable it again before
         * setting up the full range we need.
         */
        pci_free_irq_vectors(pdev);

        result = nvme_setup_irqs(dev, nr_io_queues);
        if (result <= 0) {
                result = -EIO;
                goto out_unlock;
        }

        /* One vector serves the admin queue; polled queues need none. */
        dev->num_vecs = result;
        result = max(result - 1, 1);
        dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];

        /*
         * Should investigate if there's a performance win from allocating
         * more queues than interrupt vectors; it might allow the submission
         * path to scale better, even if the receive path is limited by the
         * number of interrupts.
         */
        result = queue_request_irq(adminq);
        if (result)
                goto out_unlock;
        set_bit(NVMEQ_ENABLED, &adminq->flags);
        mutex_unlock(&dev->shutdown_lock);

        result = nvme_create_io_queues(dev);
        if (result || dev->online_queues < 2)
                return result;

        /*
         * Fewer queues came online than we set up vectors for: tear the
         * I/O queues down and redo the setup with the achievable count.
         */
        if (dev->online_queues - 1 < dev->max_qid) {
                nr_io_queues = dev->online_queues - 1;
                nvme_delete_io_queues(dev);
                result = nvme_setup_io_queues_trylock(dev);
                if (result)
                        return result;
                nvme_suspend_io_queues(dev);
                goto retry;
        }
        dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
                                        dev->io_queues[HCTX_TYPE_DEFAULT],
                                        dev->io_queues[HCTX_TYPE_READ],
                                        dev->io_queues[HCTX_TYPE_POLL]);
        return 0;
out_unlock:
        mutex_unlock(&dev->shutdown_lock);
        return result;
}
2368
2369 static enum rq_end_io_ret nvme_del_queue_end(struct request *req,
2370                                              blk_status_t error)
2371 {
2372         struct nvme_queue *nvmeq = req->end_io_data;
2373
2374         blk_mq_free_request(req);
2375         complete(&nvmeq->delete_done);
2376         return RQ_END_IO_NONE;
2377 }
2378
2379 static enum rq_end_io_ret nvme_del_cq_end(struct request *req,
2380                                           blk_status_t error)
2381 {
2382         struct nvme_queue *nvmeq = req->end_io_data;
2383
2384         if (error)
2385                 set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
2386
2387         return nvme_del_queue_end(req, error);
2388 }
2389
2390 static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
2391 {
2392         struct request_queue *q = nvmeq->dev->ctrl.admin_q;
2393         struct request *req;
2394         struct nvme_command cmd = { };
2395
2396         cmd.delete_queue.opcode = opcode;
2397         cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);
2398
2399         req = blk_mq_alloc_request(q, nvme_req_op(&cmd), BLK_MQ_REQ_NOWAIT);
2400         if (IS_ERR(req))
2401                 return PTR_ERR(req);
2402         nvme_init_request(req, &cmd);
2403
2404         if (opcode == nvme_admin_delete_cq)
2405                 req->end_io = nvme_del_cq_end;
2406         else
2407                 req->end_io = nvme_del_queue_end;
2408         req->end_io_data = nvmeq;
2409
2410         init_completion(&nvmeq->delete_done);
2411         blk_execute_rq_nowait(req, false);
2412         return 0;
2413 }
2414
/*
 * Send @opcode (Delete SQ or Delete CQ) for every online I/O queue and wait
 * for the completions.
 *
 * Returns true when all outstanding deletions completed, false as soon as
 * one wait times out.
 */
static bool __nvme_delete_io_queues(struct nvme_dev *dev, u8 opcode)
{
        int nr_queues = dev->online_queues - 1, sent = 0;
        unsigned long timeout;

 retry:
        /* Every (re)submission round starts with a fresh timeout budget. */
        timeout = NVME_ADMIN_TIMEOUT;
        /*
         * Walk from the highest queue id downwards, stopping at the first
         * command that cannot be submitted right now.
         */
        while (nr_queues > 0) {
                if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
                        break;
                nr_queues--;
                sent++;
        }
        while (sent) {
                /* Wait on the queue at index nr_queues + sent. */
                struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent];

                /* The remaining time is carried over to the next wait. */
                timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
                                timeout);
                if (timeout == 0)
                        return false;

                sent--;
                /*
                 * Some queues have not been sent yet: after this completion
                 * go back and try to submit more deletions.
                 */
                if (nr_queues)
                        goto retry;
        }
        return true;
}
2442
2443 static void nvme_delete_io_queues(struct nvme_dev *dev)
2444 {
2445         if (__nvme_delete_io_queues(dev, nvme_admin_delete_sq))
2446                 __nvme_delete_io_queues(dev, nvme_admin_delete_cq);
2447 }
2448
2449 static unsigned int nvme_pci_nr_maps(struct nvme_dev *dev)
2450 {
2451         if (dev->io_queues[HCTX_TYPE_POLL])
2452                 return 3;
2453         if (dev->io_queues[HCTX_TYPE_READ])
2454                 return 2;
2455         return 1;
2456 }
2457
2458 static void nvme_pci_update_nr_queues(struct nvme_dev *dev)
2459 {
2460         blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
2461         /* free previously allocated queues that are no longer usable */
2462         nvme_free_queues(dev, dev->online_queues);
2463 }
2464
2465 static int nvme_pci_enable(struct nvme_dev *dev)
2466 {
2467         int result = -ENOMEM;
2468         struct pci_dev *pdev = to_pci_dev(dev->dev);
2469
2470         if (pci_enable_device_mem(pdev))
2471                 return result;
2472
2473         pci_set_master(pdev);
2474
2475         if (readl(dev->bar + NVME_REG_CSTS) == -1) {
2476                 result = -ENODEV;
2477                 goto disable;
2478         }
2479
2480         /*
2481          * Some devices and/or platforms don't advertise or work with INTx
2482          * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
2483          * adjust this later.
2484          */
2485         result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
2486         if (result < 0)
2487                 goto disable;
2488
2489         dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
2490
2491         dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1,
2492                                 io_queue_depth);
2493         dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
2494         dev->dbs = dev->bar + 4096;
2495
2496         /*
2497          * Some Apple controllers require a non-standard SQE size.
2498          * Interestingly they also seem to ignore the CC:IOSQES register
2499          * so we don't bother updating it here.
2500          */
2501         if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
2502                 dev->io_sqes = 7;
2503         else
2504                 dev->io_sqes = NVME_NVM_IOSQES;
2505
2506         /*
2507          * Temporary fix for the Apple controller found in the MacBook8,1 and
2508          * some MacBook7,1 to avoid controller resets and data loss.
2509          */
2510         if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
2511                 dev->q_depth = 2;
2512                 dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
2513                         "set queue depth=%u to work around controller resets\n",
2514                         dev->q_depth);
2515         } else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
2516                    (pdev->device == 0xa821 || pdev->device == 0xa822) &&
2517                    NVME_CAP_MQES(dev->ctrl.cap) == 0) {
2518                 dev->q_depth = 64;
2519                 dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, "
2520                         "set queue depth=%u\n", dev->q_depth);
2521         }
2522
2523         /*
2524          * Controllers with the shared tags quirk need the IO queue to be
2525          * big enough so that we get 32 tags for the admin queue
2526          */
2527         if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
2528             (dev->q_depth < (NVME_AQ_DEPTH + 2))) {
2529                 dev->q_depth = NVME_AQ_DEPTH + 2;
2530                 dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
2531                          dev->q_depth);
2532         }
2533         dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
2534
2535         nvme_map_cmb(dev);
2536
2537         pci_save_state(pdev);
2538
2539         result = nvme_pci_configure_admin_queue(dev);
2540         if (result)
2541                 goto free_irq;
2542         return result;
2543
2544  free_irq:
2545         pci_free_irq_vectors(pdev);
2546  disable:
2547         pci_disable_device(pdev);
2548         return result;
2549 }
2550
2551 static void nvme_dev_unmap(struct nvme_dev *dev)
2552 {
2553         if (dev->bar)
2554                 iounmap(dev->bar);
2555         pci_release_mem_regions(to_pci_dev(dev->dev));
2556 }
2557
2558 static bool nvme_pci_ctrl_is_dead(struct nvme_dev *dev)
2559 {
2560         struct pci_dev *pdev = to_pci_dev(dev->dev);
2561         u32 csts;
2562
2563         if (!pci_is_enabled(pdev) || !pci_device_is_present(pdev))
2564                 return true;
2565         if (pdev->error_state != pci_channel_io_normal)
2566                 return true;
2567
2568         csts = readl(dev->bar + NVME_REG_CSTS);
2569         return (csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY);
2570 }
2571
/*
 * Quiesce and tear down the controller's queues and PCI resources.
 *
 * @shutdown selects a full shutdown: in-flight requests get a chance to
 * complete first, the controller is disabled with the shutdown flag, and
 * the queues are unquiesced at the end so that entered requests fail out
 * instead of waiting forever.
 *
 * The whole sequence runs under dev->shutdown_lock.
 */
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
        struct pci_dev *pdev = to_pci_dev(dev->dev);
        bool dead;

        mutex_lock(&dev->shutdown_lock);
        /* Sampled once; decides below whether the hardware is touched. */
        dead = nvme_pci_ctrl_is_dead(dev);
        if (dev->ctrl.state == NVME_CTRL_LIVE ||
            dev->ctrl.state == NVME_CTRL_RESETTING) {
                if (pci_is_enabled(pdev))
                        nvme_start_freeze(&dev->ctrl);
                /*
                 * Give the controller a chance to complete all entered requests
                 * if doing a safe shutdown.
                 */
                if (!dead && shutdown)
                        nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
        }

        nvme_quiesce_io_queues(&dev->ctrl);

        /* Only talk to the controller while it is still responsive. */
        if (!dead && dev->ctrl.queue_count > 0) {
                nvme_delete_io_queues(dev);
                nvme_disable_ctrl(&dev->ctrl, shutdown);
                nvme_poll_irqdisable(&dev->queues[0]);
        }
        nvme_suspend_io_queues(dev);
        nvme_suspend_queue(dev, 0);
        pci_free_irq_vectors(pdev);
        if (pci_is_enabled(pdev))
                pci_disable_device(pdev);
        nvme_reap_pending_cqes(dev);

        /* Cancel whatever is still outstanding on both tag sets. */
        nvme_cancel_tagset(&dev->ctrl);
        nvme_cancel_admin_tagset(&dev->ctrl);

        /*
         * The driver will not be starting up queues again if shutting down so
         * must flush all entered requests to their failed completion to avoid
         * deadlocking blk-mq hot-cpu notifier.
         */
        if (shutdown) {
                nvme_unquiesce_io_queues(&dev->ctrl);
                if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
                        nvme_unquiesce_admin_queue(&dev->ctrl);
        }
        mutex_unlock(&dev->shutdown_lock);
}
2620
2621 static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
2622 {
2623         if (!nvme_wait_reset(&dev->ctrl))
2624                 return -EBUSY;
2625         nvme_dev_disable(dev, shutdown);
2626         return 0;
2627 }
2628
2629 static int nvme_setup_prp_pools(struct nvme_dev *dev)
2630 {
2631         dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
2632                                                 NVME_CTRL_PAGE_SIZE,
2633                                                 NVME_CTRL_PAGE_SIZE, 0);
2634         if (!dev->prp_page_pool)
2635                 return -ENOMEM;
2636
2637         /* Optimisation for I/Os between 4k and 128k */
2638         dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
2639                                                 256, 256, 0);
2640         if (!dev->prp_small_pool) {
2641                 dma_pool_destroy(dev->prp_page_pool);
2642                 return -ENOMEM;
2643         }
2644         return 0;
2645 }
2646
2647 static void nvme_release_prp_pools(struct nvme_dev *dev)
2648 {
2649         dma_pool_destroy(dev->prp_page_pool);
2650         dma_pool_destroy(dev->prp_small_pool);
2651 }
2652
2653 static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
2654 {
2655         size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS;
2656
2657         dev->iod_mempool = mempool_create_node(1,
2658                         mempool_kmalloc, mempool_kfree,
2659                         (void *)alloc_size, GFP_KERNEL,
2660                         dev_to_node(dev->dev));
2661         if (!dev->iod_mempool)
2662                 return -ENOMEM;
2663         return 0;
2664 }
2665
2666 static void nvme_free_tagset(struct nvme_dev *dev)
2667 {
2668         if (dev->tagset.tags)
2669                 nvme_remove_io_tag_set(&dev->ctrl);
2670         dev->ctrl.tagset = NULL;
2671 }
2672
2673 /* pairs with nvme_pci_alloc_dev */
2674 static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
2675 {
2676         struct nvme_dev *dev = to_nvme_dev(ctrl);
2677
2678         nvme_free_tagset(dev);
2679         put_device(dev->dev);
2680         kfree(dev->queues);
2681         kfree(dev);
2682 }
2683
2684 static void nvme_reset_work(struct work_struct *work)
2685 {
2686         struct nvme_dev *dev =
2687                 container_of(work, struct nvme_dev, ctrl.reset_work);
2688         bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
2689         int result;
2690
2691         if (dev->ctrl.state != NVME_CTRL_RESETTING) {
2692                 dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
2693                          dev->ctrl.state);
2694                 return;
2695         }
2696
2697         /*
2698          * If we're called to reset a live controller first shut it down before
2699          * moving on.
2700          */
2701         if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
2702                 nvme_dev_disable(dev, false);
2703         nvme_sync_queues(&dev->ctrl);
2704
2705         mutex_lock(&dev->shutdown_lock);
2706         result = nvme_pci_enable(dev);
2707         if (result)
2708                 goto out_unlock;
2709         nvme_unquiesce_admin_queue(&dev->ctrl);
2710         mutex_unlock(&dev->shutdown_lock);
2711
2712         /*
2713          * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
2714          * initializing procedure here.
2715          */
2716         if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
2717                 dev_warn(dev->ctrl.device,
2718                         "failed to mark controller CONNECTING\n");
2719                 result = -EBUSY;
2720                 goto out;
2721         }
2722
2723         result = nvme_init_ctrl_finish(&dev->ctrl, was_suspend);
2724         if (result)
2725                 goto out;
2726
2727         nvme_dbbuf_dma_alloc(dev);
2728
2729         result = nvme_setup_host_mem(dev);
2730         if (result < 0)
2731                 goto out;
2732
2733         result = nvme_setup_io_queues(dev);
2734         if (result)
2735                 goto out;
2736
2737         /*
2738          * Freeze and update the number of I/O queues as thos might have
2739          * changed.  If there are no I/O queues left after this reset, keep the
2740          * controller around but remove all namespaces.
2741          */
2742         if (dev->online_queues > 1) {
2743                 nvme_unquiesce_io_queues(&dev->ctrl);
2744                 nvme_wait_freeze(&dev->ctrl);
2745                 nvme_pci_update_nr_queues(dev);
2746                 nvme_dbbuf_set(dev);
2747                 nvme_unfreeze(&dev->ctrl);
2748         } else {
2749                 dev_warn(dev->ctrl.device, "IO queues lost\n");
2750                 nvme_mark_namespaces_dead(&dev->ctrl);
2751                 nvme_unquiesce_io_queues(&dev->ctrl);
2752                 nvme_remove_namespaces(&dev->ctrl);
2753                 nvme_free_tagset(dev);
2754         }
2755
2756         /*
2757          * If only admin queue live, keep it to do further investigation or
2758          * recovery.
2759          */
2760         if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
2761                 dev_warn(dev->ctrl.device,
2762                         "failed to mark controller live state\n");
2763                 result = -ENODEV;
2764                 goto out;
2765         }
2766
2767         nvme_start_ctrl(&dev->ctrl);
2768         return;
2769
2770  out_unlock:
2771         mutex_unlock(&dev->shutdown_lock);
2772  out:
2773         /*
2774          * Set state to deleting now to avoid blocking nvme_wait_reset(), which
2775          * may be holding this pci_dev's device lock.
2776          */
2777         dev_warn(dev->ctrl.device, "Disabling device after reset failure: %d\n",
2778                  result);
2779         nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
2780         nvme_dev_disable(dev, true);
2781         nvme_mark_namespaces_dead(&dev->ctrl);
2782         nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
2783 }
2784
2785 static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
2786 {
2787         *val = readl(to_nvme_dev(ctrl)->bar + off);
2788         return 0;
2789 }
2790
2791 static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
2792 {
2793         writel(val, to_nvme_dev(ctrl)->bar + off);
2794         return 0;
2795 }
2796
2797 static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
2798 {
2799         *val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off);
2800         return 0;
2801 }
2802
2803 static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
2804 {
2805         struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);
2806
2807         return snprintf(buf, size, "%s\n", dev_name(&pdev->dev));
2808 }
2809
2810 static void nvme_pci_print_device_info(struct nvme_ctrl *ctrl)
2811 {
2812         struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);
2813         struct nvme_subsystem *subsys = ctrl->subsys;
2814
2815         dev_err(ctrl->device,
2816                 "VID:DID %04x:%04x model:%.*s firmware:%.*s\n",
2817                 pdev->vendor, pdev->device,
2818                 nvme_strlen(subsys->model, sizeof(subsys->model)),
2819                 subsys->model, nvme_strlen(subsys->firmware_rev,
2820                                            sizeof(subsys->firmware_rev)),
2821                 subsys->firmware_rev);
2822 }
2823
2824 static bool nvme_pci_supports_pci_p2pdma(struct nvme_ctrl *ctrl)
2825 {
2826         struct nvme_dev *dev = to_nvme_dev(ctrl);
2827
2828         return dma_pci_p2pdma_supported(dev->dev);
2829 }
2830
2831 static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
2832         .name                   = "pcie",
2833         .module                 = THIS_MODULE,
2834         .flags                  = NVME_F_METADATA_SUPPORTED,
2835         .dev_attr_groups        = nvme_pci_dev_attr_groups,
2836         .reg_read32             = nvme_pci_reg_read32,
2837         .reg_write32            = nvme_pci_reg_write32,
2838         .reg_read64             = nvme_pci_reg_read64,
2839         .free_ctrl              = nvme_pci_free_ctrl,
2840         .submit_async_event     = nvme_pci_submit_async_event,
2841         .get_address            = nvme_pci_get_address,
2842         .print_device_info      = nvme_pci_print_device_info,
2843         .supports_pci_p2pdma    = nvme_pci_supports_pci_p2pdma,
2844 };
2845
2846 static int nvme_dev_map(struct nvme_dev *dev)
2847 {
2848         struct pci_dev *pdev = to_pci_dev(dev->dev);
2849
2850         if (pci_request_mem_regions(pdev, "nvme"))
2851                 return -ENODEV;
2852
2853         if (nvme_remap_bar(dev, NVME_REG_DBS + 4096))
2854                 goto release;
2855
2856         return 0;
2857   release:
2858         pci_release_mem_regions(pdev);
2859         return -ENODEV;
2860 }
2861
2862 static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
2863 {
2864         if (pdev->vendor == 0x144d && pdev->device == 0xa802) {
2865                 /*
2866                  * Several Samsung devices seem to drop off the PCIe bus
2867                  * randomly when APST is on and uses the deepest sleep state.
2868                  * This has been observed on a Samsung "SM951 NVMe SAMSUNG
2869                  * 256GB", a "PM951 NVMe SAMSUNG 512GB", and a "Samsung SSD
2870                  * 950 PRO 256GB", but it seems to be restricted to two Dell
2871                  * laptops.
2872                  */
2873                 if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") &&
2874                     (dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") ||
2875                      dmi_match(DMI_PRODUCT_NAME, "Precision 5510")))
2876                         return NVME_QUIRK_NO_DEEPEST_PS;
2877         } else if (pdev->vendor == 0x144d && pdev->device == 0xa804) {
2878                 /*
2879                  * Samsung SSD 960 EVO drops off the PCIe bus after system
2880                  * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as
2881                  * within few minutes after bootup on a Coffee Lake board -
2882                  * ASUS PRIME Z370-A
2883                  */
2884                 if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") &&
2885                     (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") ||
2886                      dmi_match(DMI_BOARD_NAME, "PRIME Z370-A")))
2887                         return NVME_QUIRK_NO_APST;
2888         } else if ((pdev->vendor == 0x144d && (pdev->device == 0xa801 ||
2889                     pdev->device == 0xa808 || pdev->device == 0xa809)) ||
2890                    (pdev->vendor == 0x1e0f && pdev->device == 0x0001)) {
2891                 /*
2892                  * Forcing to use host managed nvme power settings for
2893                  * lowest idle power with quick resume latency on
2894                  * Samsung and Toshiba SSDs based on suspend behavior
2895                  * on Coffee Lake board for LENOVO C640
2896                  */
2897                 if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) &&
2898                      dmi_match(DMI_BOARD_NAME, "LNVNB161216"))
2899                         return NVME_QUIRK_SIMPLE_SUSPEND;
2900         }
2901
2902         return 0;
2903 }
2904
2905 static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
2906                 const struct pci_device_id *id)
2907 {
2908         unsigned long quirks = id->driver_data;
2909         int node = dev_to_node(&pdev->dev);
2910         struct nvme_dev *dev;
2911         int ret = -ENOMEM;
2912
2913         if (node == NUMA_NO_NODE)
2914                 set_dev_node(&pdev->dev, first_memory_node);
2915
2916         dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
2917         if (!dev)
2918                 return ERR_PTR(-ENOMEM);
2919         INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
2920         mutex_init(&dev->shutdown_lock);
2921
2922         dev->nr_write_queues = write_queues;
2923         dev->nr_poll_queues = poll_queues;
2924         dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1;
2925         dev->queues = kcalloc_node(dev->nr_allocated_queues,
2926                         sizeof(struct nvme_queue), GFP_KERNEL, node);
2927         if (!dev->queues)
2928                 goto out_free_dev;
2929
2930         dev->dev = get_device(&pdev->dev);
2931
2932         quirks |= check_vendor_combination_bug(pdev);
2933         if (!noacpi && acpi_storage_d3(&pdev->dev)) {
2934                 /*
2935                  * Some systems use a bios work around to ask for D3 on
2936                  * platforms that support kernel managed suspend.
2937                  */
2938                 dev_info(&pdev->dev,
2939                          "platform quirk: setting simple suspend\n");
2940                 quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
2941         }
2942         ret = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
2943                              quirks);
2944         if (ret)
2945                 goto out_put_device;
2946
2947         if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48)
2948                 dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(48));
2949         else
2950                 dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
2951         dma_set_min_align_mask(&pdev->dev, NVME_CTRL_PAGE_SIZE - 1);
2952         dma_set_max_seg_size(&pdev->dev, 0xffffffff);
2953
2954         /*
2955          * Limit the max command size to prevent iod->sg allocations going
2956          * over a single page.
2957          */
2958         dev->ctrl.max_hw_sectors = min_t(u32,
2959                 NVME_MAX_KB_SZ << 1, dma_opt_mapping_size(&pdev->dev) >> 9);
2960         dev->ctrl.max_segments = NVME_MAX_SEGS;
2961
2962         /*
2963          * There is no support for SGLs for metadata (yet), so we are limited to
2964          * a single integrity segment for the separate metadata pointer.
2965          */
2966         dev->ctrl.max_integrity_segments = 1;
2967         return dev;
2968
2969 out_put_device:
2970         put_device(dev->dev);
2971         kfree(dev->queues);
2972 out_free_dev:
2973         kfree(dev);
2974         return ERR_PTR(ret);
2975 }
2976
2977 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
2978 {
2979         struct nvme_dev *dev;
2980         int result = -ENOMEM;
2981
2982         dev = nvme_pci_alloc_dev(pdev, id);
2983         if (IS_ERR(dev))
2984                 return PTR_ERR(dev);
2985
2986         result = nvme_dev_map(dev);
2987         if (result)
2988                 goto out_uninit_ctrl;
2989
2990         result = nvme_setup_prp_pools(dev);
2991         if (result)
2992                 goto out_dev_unmap;
2993
2994         result = nvme_pci_alloc_iod_mempool(dev);
2995         if (result)
2996                 goto out_release_prp_pools;
2997
2998         dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
2999
3000         result = nvme_pci_enable(dev);
3001         if (result)
3002                 goto out_release_iod_mempool;
3003
3004         result = nvme_alloc_admin_tag_set(&dev->ctrl, &dev->admin_tagset,
3005                                 &nvme_mq_admin_ops, sizeof(struct nvme_iod));
3006         if (result)
3007                 goto out_disable;
3008
3009         /*
3010          * Mark the controller as connecting before sending admin commands to
3011          * allow the timeout handler to do the right thing.
3012          */
3013         if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
3014                 dev_warn(dev->ctrl.device,
3015                         "failed to mark controller CONNECTING\n");
3016                 result = -EBUSY;
3017                 goto out_disable;
3018         }
3019
3020         result = nvme_init_ctrl_finish(&dev->ctrl, false);
3021         if (result)
3022                 goto out_disable;
3023
3024         nvme_dbbuf_dma_alloc(dev);
3025
3026         result = nvme_setup_host_mem(dev);
3027         if (result < 0)
3028                 goto out_disable;
3029
3030         result = nvme_setup_io_queues(dev);
3031         if (result)
3032                 goto out_disable;
3033
3034         if (dev->online_queues > 1) {
3035                 nvme_alloc_io_tag_set(&dev->ctrl, &dev->tagset, &nvme_mq_ops,
3036                                 nvme_pci_nr_maps(dev), sizeof(struct nvme_iod));
3037                 nvme_dbbuf_set(dev);
3038         }
3039
3040         if (!dev->ctrl.tagset)
3041                 dev_warn(dev->ctrl.device, "IO queues not created\n");
3042
3043         if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
3044                 dev_warn(dev->ctrl.device,
3045                         "failed to mark controller live state\n");
3046                 result = -ENODEV;
3047                 goto out_disable;
3048         }
3049
3050         pci_set_drvdata(pdev, dev);
3051
3052         nvme_start_ctrl(&dev->ctrl);
3053         nvme_put_ctrl(&dev->ctrl);
3054         flush_work(&dev->ctrl.scan_work);
3055         return 0;
3056
3057 out_disable:
3058         nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
3059         nvme_dev_disable(dev, true);
3060         nvme_free_host_mem(dev);
3061         nvme_dev_remove_admin(dev);
3062         nvme_dbbuf_dma_free(dev);
3063         nvme_free_queues(dev, 0);
3064 out_release_iod_mempool:
3065         mempool_destroy(dev->iod_mempool);
3066 out_release_prp_pools:
3067         nvme_release_prp_pools(dev);
3068 out_dev_unmap:
3069         nvme_dev_unmap(dev);
3070 out_uninit_ctrl:
3071         nvme_uninit_ctrl(&dev->ctrl);
3072         return result;
3073 }
3074
3075 static void nvme_reset_prepare(struct pci_dev *pdev)
3076 {
3077         struct nvme_dev *dev = pci_get_drvdata(pdev);
3078
3079         /*
3080          * We don't need to check the return value from waiting for the reset
3081          * state as pci_dev device lock is held, making it impossible to race
3082          * with ->remove().
3083          */
3084         nvme_disable_prepare_reset(dev, false);
3085         nvme_sync_queues(&dev->ctrl);
3086 }
3087
3088 static void nvme_reset_done(struct pci_dev *pdev)
3089 {
3090         struct nvme_dev *dev = pci_get_drvdata(pdev);
3091
3092         if (!nvme_try_sched_reset(&dev->ctrl))
3093                 flush_work(&dev->ctrl.reset_work);
3094 }
3095
3096 static void nvme_shutdown(struct pci_dev *pdev)
3097 {
3098         struct nvme_dev *dev = pci_get_drvdata(pdev);
3099
3100         nvme_disable_prepare_reset(dev, true);
3101 }
3102
3103 /*
3104  * The driver's remove may be called on a device in a partially initialized
3105  * state. This function must not have any dependencies on the device state in
3106  * order to proceed.
3107  */
3108 static void nvme_remove(struct pci_dev *pdev)
3109 {
3110         struct nvme_dev *dev = pci_get_drvdata(pdev);
3111
3112         nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
3113         pci_set_drvdata(pdev, NULL);
3114
3115         if (!pci_device_is_present(pdev)) {
3116                 nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
3117                 nvme_dev_disable(dev, true);
3118         }
3119
3120         flush_work(&dev->ctrl.reset_work);
3121         nvme_stop_ctrl(&dev->ctrl);
3122         nvme_remove_namespaces(&dev->ctrl);
3123         nvme_dev_disable(dev, true);
3124         nvme_free_host_mem(dev);
3125         nvme_dev_remove_admin(dev);
3126         nvme_dbbuf_dma_free(dev);
3127         nvme_free_queues(dev, 0);
3128         mempool_destroy(dev->iod_mempool);
3129         nvme_release_prp_pools(dev);
3130         nvme_dev_unmap(dev);
3131         nvme_uninit_ctrl(&dev->ctrl);
3132 }
3133
3134 #ifdef CONFIG_PM_SLEEP
3135 static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps)
3136 {
3137         return nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps);
3138 }
3139
3140 static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps)
3141 {
3142         return nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL);
3143 }
3144
3145 static int nvme_resume(struct device *dev)
3146 {
3147         struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
3148         struct nvme_ctrl *ctrl = &ndev->ctrl;
3149
3150         if (ndev->last_ps == U32_MAX ||
3151             nvme_set_power_state(ctrl, ndev->last_ps) != 0)
3152                 goto reset;
3153         if (ctrl->hmpre && nvme_setup_host_mem(ndev))
3154                 goto reset;
3155
3156         return 0;
3157 reset:
3158         return nvme_try_sched_reset(ctrl);
3159 }
3160
3161 static int nvme_suspend(struct device *dev)
3162 {
3163         struct pci_dev *pdev = to_pci_dev(dev);
3164         struct nvme_dev *ndev = pci_get_drvdata(pdev);
3165         struct nvme_ctrl *ctrl = &ndev->ctrl;
3166         int ret = -EBUSY;
3167
3168         ndev->last_ps = U32_MAX;
3169
3170         /*
3171          * The platform does not remove power for a kernel managed suspend so
3172          * use host managed nvme power settings for lowest idle power if
3173          * possible. This should have quicker resume latency than a full device
3174          * shutdown.  But if the firmware is involved after the suspend or the
3175          * device does not support any non-default power states, shut down the
3176          * device fully.
3177          *
3178          * If ASPM is not enabled for the device, shut down the device and allow
3179          * the PCI bus layer to put it into D3 in order to take the PCIe link
3180          * down, so as to allow the platform to achieve its minimum low-power
3181          * state (which may not be possible if the link is up).
3182          */
3183         if (pm_suspend_via_firmware() || !ctrl->npss ||
3184             !pcie_aspm_enabled(pdev) ||
3185             (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
3186                 return nvme_disable_prepare_reset(ndev, true);
3187
3188         nvme_start_freeze(ctrl);
3189         nvme_wait_freeze(ctrl);
3190         nvme_sync_queues(ctrl);
3191
3192         if (ctrl->state != NVME_CTRL_LIVE)
3193                 goto unfreeze;
3194
3195         /*
3196          * Host memory access may not be successful in a system suspend state,
3197          * but the specification allows the controller to access memory in a
3198          * non-operational power state.
3199          */
3200         if (ndev->hmb) {
3201                 ret = nvme_set_host_mem(ndev, 0);
3202                 if (ret < 0)
3203                         goto unfreeze;
3204         }
3205
3206         ret = nvme_get_power_state(ctrl, &ndev->last_ps);
3207         if (ret < 0)
3208                 goto unfreeze;
3209
3210         /*
3211          * A saved state prevents pci pm from generically controlling the
3212          * device's power. If we're using protocol specific settings, we don't
3213          * want pci interfering.
3214          */
3215         pci_save_state(pdev);
3216
3217         ret = nvme_set_power_state(ctrl, ctrl->npss);
3218         if (ret < 0)
3219                 goto unfreeze;
3220
3221         if (ret) {
3222                 /* discard the saved state */
3223                 pci_load_saved_state(pdev, NULL);
3224
3225                 /*
3226                  * Clearing npss forces a controller reset on resume. The
3227                  * correct value will be rediscovered then.
3228                  */
3229                 ret = nvme_disable_prepare_reset(ndev, true);
3230                 ctrl->npss = 0;
3231         }
3232 unfreeze:
3233         nvme_unfreeze(ctrl);
3234         return ret;
3235 }
3236
3237 static int nvme_simple_suspend(struct device *dev)
3238 {
3239         struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
3240
3241         return nvme_disable_prepare_reset(ndev, true);
3242 }
3243
3244 static int nvme_simple_resume(struct device *dev)
3245 {
3246         struct pci_dev *pdev = to_pci_dev(dev);
3247         struct nvme_dev *ndev = pci_get_drvdata(pdev);
3248
3249         return nvme_try_sched_reset(&ndev->ctrl);
3250 }
3251
3252 static const struct dev_pm_ops nvme_dev_pm_ops = {
3253         .suspend        = nvme_suspend,
3254         .resume         = nvme_resume,
3255         .freeze         = nvme_simple_suspend,
3256         .thaw           = nvme_simple_resume,
3257         .poweroff       = nvme_simple_suspend,
3258         .restore        = nvme_simple_resume,
3259 };
3260 #endif /* CONFIG_PM_SLEEP */
3261
3262 static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
3263                                                 pci_channel_state_t state)
3264 {
3265         struct nvme_dev *dev = pci_get_drvdata(pdev);
3266
3267         /*
3268          * A frozen channel requires a reset. When detected, this method will
3269          * shutdown the controller to quiesce. The controller will be restarted
3270          * after the slot reset through driver's slot_reset callback.
3271          */
3272         switch (state) {
3273         case pci_channel_io_normal:
3274                 return PCI_ERS_RESULT_CAN_RECOVER;
3275         case pci_channel_io_frozen:
3276                 dev_warn(dev->ctrl.device,
3277                         "frozen state error detected, reset controller\n");
3278                 nvme_dev_disable(dev, false);
3279                 return PCI_ERS_RESULT_NEED_RESET;
3280         case pci_channel_io_perm_failure:
3281                 dev_warn(dev->ctrl.device,
3282                         "failure state error detected, request disconnect\n");
3283                 return PCI_ERS_RESULT_DISCONNECT;
3284         }
3285         return PCI_ERS_RESULT_NEED_RESET;
3286 }
3287
3288 static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
3289 {
3290         struct nvme_dev *dev = pci_get_drvdata(pdev);
3291
3292         dev_info(dev->ctrl.device, "restart after slot reset\n");
3293         pci_restore_state(pdev);
3294         nvme_reset_ctrl(&dev->ctrl);
3295         return PCI_ERS_RESULT_RECOVERED;
3296 }
3297
3298 static void nvme_error_resume(struct pci_dev *pdev)
3299 {
3300         struct nvme_dev *dev = pci_get_drvdata(pdev);
3301
3302         flush_work(&dev->ctrl.reset_work);
3303 }
3304
3305 static const struct pci_error_handlers nvme_err_handler = {
3306         .error_detected = nvme_error_detected,
3307         .slot_reset     = nvme_slot_reset,
3308         .resume         = nvme_error_resume,
3309         .reset_prepare  = nvme_reset_prepare,
3310         .reset_done     = nvme_reset_done,
3311 };
3312
3313 static const struct pci_device_id nvme_id_table[] = {
3314         { PCI_VDEVICE(INTEL, 0x0953),   /* Intel 750/P3500/P3600/P3700 */
3315                 .driver_data = NVME_QUIRK_STRIPE_SIZE |
3316                                 NVME_QUIRK_DEALLOCATE_ZEROES, },
3317         { PCI_VDEVICE(INTEL, 0x0a53),   /* Intel P3520 */
3318                 .driver_data = NVME_QUIRK_STRIPE_SIZE |
3319                                 NVME_QUIRK_DEALLOCATE_ZEROES, },
3320         { PCI_VDEVICE(INTEL, 0x0a54),   /* Intel P4500/P4600 */
3321                 .driver_data = NVME_QUIRK_STRIPE_SIZE |
3322                                 NVME_QUIRK_DEALLOCATE_ZEROES |
3323                                 NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3324         { PCI_VDEVICE(INTEL, 0x0a55),   /* Dell Express Flash P4600 */
3325                 .driver_data = NVME_QUIRK_STRIPE_SIZE |
3326                                 NVME_QUIRK_DEALLOCATE_ZEROES, },
3327         { PCI_VDEVICE(INTEL, 0xf1a5),   /* Intel 600P/P3100 */
3328                 .driver_data = NVME_QUIRK_NO_DEEPEST_PS |
3329                                 NVME_QUIRK_MEDIUM_PRIO_SQ |
3330                                 NVME_QUIRK_NO_TEMP_THRESH_CHANGE |
3331                                 NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3332         { PCI_VDEVICE(INTEL, 0xf1a6),   /* Intel 760p/Pro 7600p */
3333                 .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3334         { PCI_VDEVICE(INTEL, 0x5845),   /* Qemu emulated controller */
3335                 .driver_data = NVME_QUIRK_IDENTIFY_CNS |
3336                                 NVME_QUIRK_DISABLE_WRITE_ZEROES |
3337                                 NVME_QUIRK_BOGUS_NID, },
3338         { PCI_VDEVICE(REDHAT, 0x0010),  /* Qemu emulated controller */
3339                 .driver_data = NVME_QUIRK_BOGUS_NID, },
3340         { PCI_DEVICE(0x126f, 0x2263),   /* Silicon Motion unidentified */
3341                 .driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
3342                                 NVME_QUIRK_BOGUS_NID, },
3343         { PCI_DEVICE(0x1bb1, 0x0100),   /* Seagate Nytro Flash Storage */
3344                 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
3345                                 NVME_QUIRK_NO_NS_DESC_LIST, },
3346         { PCI_DEVICE(0x1c58, 0x0003),   /* HGST adapter */
3347                 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3348         { PCI_DEVICE(0x1c58, 0x0023),   /* WDC SN200 adapter */
3349                 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3350         { PCI_DEVICE(0x1c5f, 0x0540),   /* Memblaze Pblaze4 adapter */
3351                 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3352         { PCI_DEVICE(0x144d, 0xa821),   /* Samsung PM1725 */
3353                 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3354         { PCI_DEVICE(0x144d, 0xa822),   /* Samsung PM1725a */
3355                 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
3356                                 NVME_QUIRK_DISABLE_WRITE_ZEROES|
3357                                 NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3358         { PCI_DEVICE(0x1987, 0x5012),   /* Phison E12 */
3359                 .driver_data = NVME_QUIRK_BOGUS_NID, },
3360         { PCI_DEVICE(0x1987, 0x5016),   /* Phison E16 */
3361                 .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN |
3362                                 NVME_QUIRK_BOGUS_NID, },
3363         { PCI_DEVICE(0x1987, 0x5019),  /* phison E19 */
3364                 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3365         { PCI_DEVICE(0x1987, 0x5021),   /* Phison E21 */
3366                 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3367         { PCI_DEVICE(0x1b4b, 0x1092),   /* Lexar 256 GB SSD */
3368                 .driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
3369                                 NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3370         { PCI_DEVICE(0x1cc1, 0x33f8),   /* ADATA IM2P33F8ABR1 1 TB */
3371                 .driver_data = NVME_QUIRK_BOGUS_NID, },
3372         { PCI_DEVICE(0x10ec, 0x5762),   /* ADATA SX6000LNP */
3373                 .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN |
3374                                 NVME_QUIRK_BOGUS_NID, },
3375         { PCI_DEVICE(0x10ec, 0x5763),  /* ADATA SX6000PNP */
3376                 .driver_data = NVME_QUIRK_BOGUS_NID, },
3377         { PCI_DEVICE(0x1cc1, 0x8201),   /* ADATA SX8200PNP 512GB */
3378                 .driver_data = NVME_QUIRK_NO_DEEPEST_PS |
3379                                 NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3380          { PCI_DEVICE(0x1344, 0x5407), /* Micron Technology Inc NVMe SSD */
3381                 .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN },
3382          { PCI_DEVICE(0x1344, 0x6001),   /* Micron Nitro NVMe */
3383                  .driver_data = NVME_QUIRK_BOGUS_NID, },
3384         { PCI_DEVICE(0x1c5c, 0x1504),   /* SK Hynix PC400 */
3385                 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3386         { PCI_DEVICE(0x1c5c, 0x174a),   /* SK Hynix P31 SSD */
3387                 .driver_data = NVME_QUIRK_BOGUS_NID, },
3388         { PCI_DEVICE(0x15b7, 0x2001),   /*  Sandisk Skyhawk */
3389                 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3390         { PCI_DEVICE(0x1d97, 0x2263),   /* SPCC */
3391                 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3392         { PCI_DEVICE(0x144d, 0xa80b),   /* Samsung PM9B1 256G and 512G */
3393                 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3394         { PCI_DEVICE(0x144d, 0xa809),   /* Samsung MZALQ256HBJD 256G */
3395                 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3396         { PCI_DEVICE(0x1cc4, 0x6303),   /* UMIS RPJTJ512MGE1QDY 512G */
3397                 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3398         { PCI_DEVICE(0x1cc4, 0x6302),   /* UMIS RPJTJ256MGE1QDY 256G */
3399                 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3400         { PCI_DEVICE(0x2646, 0x2262),   /* KINGSTON SKC2000 NVMe SSD */
3401                 .driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
3402         { PCI_DEVICE(0x2646, 0x2263),   /* KINGSTON A2000 NVMe SSD  */
3403                 .driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
3404         { PCI_DEVICE(0x2646, 0x5013),   /* Kingston KC3000, Kingston FURY Renegade */
3405                 .driver_data = NVME_QUIRK_NO_SECONDARY_TEMP_THRESH, },
3406         { PCI_DEVICE(0x2646, 0x5018),   /* KINGSTON OM8SFP4xxxxP OS21012 NVMe SSD */
3407                 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3408         { PCI_DEVICE(0x2646, 0x5016),   /* KINGSTON OM3PGP4xxxxP OS21011 NVMe SSD */
3409                 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3410         { PCI_DEVICE(0x2646, 0x501A),   /* KINGSTON OM8PGP4xxxxP OS21005 NVMe SSD */
3411                 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3412         { PCI_DEVICE(0x2646, 0x501B),   /* KINGSTON OM8PGP4xxxxQ OS21005 NVMe SSD */
3413                 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3414         { PCI_DEVICE(0x2646, 0x501E),   /* KINGSTON OM3PGP4xxxxQ OS21011 NVMe SSD */
3415                 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3416         { PCI_DEVICE(0x1f40, 0x5236),   /* Netac Technologies Co. NV7000 NVMe SSD */
3417                 .driver_data = NVME_QUIRK_BOGUS_NID, },
3418         { PCI_DEVICE(0x1e4B, 0x1001),   /* MAXIO MAP1001 */
3419                 .driver_data = NVME_QUIRK_BOGUS_NID, },
3420         { PCI_DEVICE(0x1e4B, 0x1002),   /* MAXIO MAP1002 */
3421                 .driver_data = NVME_QUIRK_BOGUS_NID, },
3422         { PCI_DEVICE(0x1e4B, 0x1202),   /* MAXIO MAP1202 */
3423                 .driver_data = NVME_QUIRK_BOGUS_NID, },
3424         { PCI_DEVICE(0x1cc1, 0x5350),   /* ADATA XPG GAMMIX S50 */
3425                 .driver_data = NVME_QUIRK_BOGUS_NID, },
3426         { PCI_DEVICE(0x1dbe, 0x5236),   /* ADATA XPG GAMMIX S70 */
3427                 .driver_data = NVME_QUIRK_BOGUS_NID, },
3428         { PCI_DEVICE(0x1e49, 0x0021),   /* ZHITAI TiPro5000 NVMe SSD */
3429                 .driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
3430         { PCI_DEVICE(0x1e49, 0x0041),   /* ZHITAI TiPro7000 NVMe SSD */
3431                 .driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
3432         { PCI_DEVICE(0xc0a9, 0x540a),   /* Crucial P2 */
3433                 .driver_data = NVME_QUIRK_BOGUS_NID, },
3434         { PCI_DEVICE(0x1d97, 0x2263), /* Lexar NM610 */
3435                 .driver_data = NVME_QUIRK_BOGUS_NID, },
3436         { PCI_DEVICE(0x1d97, 0x2269), /* Lexar NM760 */
3437                 .driver_data = NVME_QUIRK_BOGUS_NID, },
3438         { PCI_DEVICE(0x1e4b, 0x1602), /* HS-SSD-FUTURE 2048G  */
3439                 .driver_data = NVME_QUIRK_BOGUS_NID, },
3440         { PCI_DEVICE(0x10ec, 0x5765), /* TEAMGROUP MP33 2TB SSD */
3441                 .driver_data = NVME_QUIRK_BOGUS_NID, },
3442         { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061),
3443                 .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
3444         { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065),
3445                 .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
3446         { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x8061),
3447                 .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
3448         { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd00),
3449                 .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
3450         { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd01),
3451                 .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
3452         { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02),
3453                 .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
3454         { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
3455                 .driver_data = NVME_QUIRK_SINGLE_VECTOR },
3456         { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
3457         { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
3458                 .driver_data = NVME_QUIRK_SINGLE_VECTOR |
3459                                 NVME_QUIRK_128_BYTES_SQES |
3460                                 NVME_QUIRK_SHARED_TAGS |
3461                                 NVME_QUIRK_SKIP_CID_GEN |
3462                                 NVME_QUIRK_IDENTIFY_CNS },
3463         { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
3464         { 0, }
3465 };
/*
 * Export nvme_id_table as this module's PCI device table so the entries
 * above are recorded in the module's device-matching information.
 */
MODULE_DEVICE_TABLE(pci, nvme_id_table);
3467
3468 static struct pci_driver nvme_driver = {
3469         .name           = "nvme",
3470         .id_table       = nvme_id_table,
3471         .probe          = nvme_probe,
3472         .remove         = nvme_remove,
3473         .shutdown       = nvme_shutdown,
3474         .driver         = {
3475                 .probe_type     = PROBE_PREFER_ASYNCHRONOUS,
3476 #ifdef CONFIG_PM_SLEEP
3477                 .pm             = &nvme_dev_pm_ops,
3478 #endif
3479         },
3480         .sriov_configure = pci_sriov_configure_simple,
3481         .err_handler    = &nvme_err_handler,
3482 };
3483
3484 static int __init nvme_init(void)
3485 {
3486         BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
3487         BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
3488         BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
3489         BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
3490         BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE);
3491         BUILD_BUG_ON(sizeof(struct scatterlist) * NVME_MAX_SEGS > PAGE_SIZE);
3492         BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS);
3493
3494         return pci_register_driver(&nvme_driver);
3495 }
3496
3497 static void __exit nvme_exit(void)
3498 {
3499         pci_unregister_driver(&nvme_driver);
3500         flush_workqueue(nvme_wq);
3501 }
3502
3503 MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
3504 MODULE_LICENSE("GPL");
3505 MODULE_VERSION("1.0");
3506 module_init(nvme_init);
3507 module_exit(nvme_exit);