nvme: revalidate zone bitmaps in nvme_update_ns_info
[linux-2.6-microblaze.git] drivers/nvme/host/core.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVM Express device driver
4  * Copyright (c) 2011-2014, Intel Corporation.
5  */
6
7 #include <linux/blkdev.h>
8 #include <linux/blk-mq.h>
9 #include <linux/compat.h>
10 #include <linux/delay.h>
11 #include <linux/errno.h>
12 #include <linux/hdreg.h>
13 #include <linux/kernel.h>
14 #include <linux/module.h>
15 #include <linux/backing-dev.h>
16 #include <linux/list_sort.h>
17 #include <linux/slab.h>
18 #include <linux/types.h>
19 #include <linux/pr.h>
20 #include <linux/ptrace.h>
21 #include <linux/nvme_ioctl.h>
22 #include <linux/pm_qos.h>
23 #include <asm/unaligned.h>
24
25 #include "nvme.h"
26 #include "fabrics.h"
27
28 #define CREATE_TRACE_POINTS
29 #include "trace.h"
30
31 #define NVME_MINORS             (1U << MINORBITS)
32
33 unsigned int admin_timeout = 60;
34 module_param(admin_timeout, uint, 0644);
35 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
36 EXPORT_SYMBOL_GPL(admin_timeout);
37
38 unsigned int nvme_io_timeout = 30;
39 module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
40 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
41 EXPORT_SYMBOL_GPL(nvme_io_timeout);
42
43 static unsigned char shutdown_timeout = 5;
44 module_param(shutdown_timeout, byte, 0644);
45 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
46
47 static u8 nvme_max_retries = 5;
48 module_param_named(max_retries, nvme_max_retries, byte, 0644);
49 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
50
51 static unsigned long default_ps_max_latency_us = 100000;
52 module_param(default_ps_max_latency_us, ulong, 0644);
53 MODULE_PARM_DESC(default_ps_max_latency_us,
54                  "max power saving latency for new devices; use PM QOS to change per device");
55
56 static bool force_apst;
57 module_param(force_apst, bool, 0644);
58 MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
59
60 static bool streams;
61 module_param(streams, bool, 0644);
62 MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
63
64 /*
65  * nvme_wq - hosts nvme related works that are not reset or delete
66  * nvme_reset_wq - hosts nvme reset works
67  * nvme_delete_wq - hosts nvme delete works
68  *
69  * nvme_wq will host works such as scan, aen handling, fw activation,
70  * keep-alive, periodic reconnects etc. nvme_reset_wq
71  * runs reset works which also flush works hosted on nvme_wq for
72  * serialization purposes. nvme_delete_wq hosts controller deletion
73  * works which flush reset works for serialization.
74  */
75 struct workqueue_struct *nvme_wq;
76 EXPORT_SYMBOL_GPL(nvme_wq);
77
78 struct workqueue_struct *nvme_reset_wq;
79 EXPORT_SYMBOL_GPL(nvme_reset_wq);
80
81 struct workqueue_struct *nvme_delete_wq;
82 EXPORT_SYMBOL_GPL(nvme_delete_wq);
83
84 static LIST_HEAD(nvme_subsystems);
85 static DEFINE_MUTEX(nvme_subsystems_lock);
86
87 static DEFINE_IDA(nvme_instance_ida);
88 static dev_t nvme_chr_devt;
89 static struct class *nvme_class;
90 static struct class *nvme_subsys_class;
91
92 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
93 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
94                                            unsigned nsid);
95
96 static void nvme_update_bdev_size(struct gendisk *disk)
97 {
98         struct block_device *bdev = bdget_disk(disk, 0);
99
100         if (bdev) {
101                 bd_set_nr_sectors(bdev, get_capacity(disk));
102                 bdput(bdev);
103         }
104 }
105
106 /*
107  * Prepare a queue for teardown.
108  *
109  * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
110  * the capacity to 0 after that to avoid blocking dispatchers that may be
111  * holding bd_mutex.  This will end buffered writers dirtying pages that can't
112  * be synced.
113  */
114 static void nvme_set_queue_dying(struct nvme_ns *ns)
115 {
116         if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
117                 return;
118
119         blk_set_queue_dying(ns->queue);
120         blk_mq_unquiesce_queue(ns->queue);
121
122         set_capacity(ns->disk, 0);
123         nvme_update_bdev_size(ns->disk);
124 }
125
126 static void nvme_queue_scan(struct nvme_ctrl *ctrl)
127 {
128         /*
129          * Only queue new scan work when admin and IO queues are both alive
130          */
131         if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
132                 queue_work(nvme_wq, &ctrl->scan_work);
133 }
134
135 /*
136  * Use this function to proceed with scheduling reset_work for a controller
137  * that had previously been set to the resetting state. This is intended for
138  * code paths that can't be interrupted by other reset attempts. A hot removal
139  * may prevent this from succeeding.
140  */
141 int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
142 {
143         if (ctrl->state != NVME_CTRL_RESETTING)
144                 return -EBUSY;
145         if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
146                 return -EBUSY;
147         return 0;
148 }
149 EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
150
151 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
152 {
153         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
154                 return -EBUSY;
155         if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
156                 return -EBUSY;
157         return 0;
158 }
159 EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
160
161 int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
162 {
163         int ret;
164
165         ret = nvme_reset_ctrl(ctrl);
166         if (!ret) {
167                 flush_work(&ctrl->reset_work);
168                 if (ctrl->state != NVME_CTRL_LIVE)
169                         ret = -ENETRESET;
170         }
171
172         return ret;
173 }
174 EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
175
176 static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
177 {
178         dev_info(ctrl->device,
179                  "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn);
180
181         flush_work(&ctrl->reset_work);
182         nvme_stop_ctrl(ctrl);
183         nvme_remove_namespaces(ctrl);
184         ctrl->ops->delete_ctrl(ctrl);
185         nvme_uninit_ctrl(ctrl);
186 }
187
188 static void nvme_delete_ctrl_work(struct work_struct *work)
189 {
190         struct nvme_ctrl *ctrl =
191                 container_of(work, struct nvme_ctrl, delete_work);
192
193         nvme_do_delete_ctrl(ctrl);
194 }
195
196 int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
197 {
198         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
199                 return -EBUSY;
200         if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
201                 return -EBUSY;
202         return 0;
203 }
204 EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
205
206 static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
207 {
208         /*
209          * Keep a reference until nvme_do_delete_ctrl() completes,
210          * since ->delete_ctrl can free the controller.
211          */
212         nvme_get_ctrl(ctrl);
213         if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
214                 nvme_do_delete_ctrl(ctrl);
215         nvme_put_ctrl(ctrl);
216 }
217
218 static blk_status_t nvme_error_status(u16 status)
219 {
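            /* only the Status Code and Status Code Type matter here; mask off the CRD, More and DNR bits */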
220         switch (status & 0x7ff) {
221         case NVME_SC_SUCCESS:
222                 return BLK_STS_OK;
223         case NVME_SC_CAP_EXCEEDED:
224                 return BLK_STS_NOSPC;
225         case NVME_SC_LBA_RANGE:
226         case NVME_SC_CMD_INTERRUPTED:
227         case NVME_SC_NS_NOT_READY:
228                 return BLK_STS_TARGET;
229         case NVME_SC_BAD_ATTRIBUTES:
230         case NVME_SC_ONCS_NOT_SUPPORTED:
231         case NVME_SC_INVALID_OPCODE:
232         case NVME_SC_INVALID_FIELD:
233         case NVME_SC_INVALID_NS:
234                 return BLK_STS_NOTSUPP;
235         case NVME_SC_WRITE_FAULT:
236         case NVME_SC_READ_ERROR:
237         case NVME_SC_UNWRITTEN_BLOCK:
238         case NVME_SC_ACCESS_DENIED:
239         case NVME_SC_READ_ONLY:
240         case NVME_SC_COMPARE_FAILED:
241                 return BLK_STS_MEDIUM;
242         case NVME_SC_GUARD_CHECK:
243         case NVME_SC_APPTAG_CHECK:
244         case NVME_SC_REFTAG_CHECK:
245         case NVME_SC_INVALID_PI:
246                 return BLK_STS_PROTECTION;
247         case NVME_SC_RESERVATION_CONFLICT:
248                 return BLK_STS_NEXUS;
249         case NVME_SC_HOST_PATH_ERROR:
250                 return BLK_STS_TRANSPORT;
251         default:
252                 return BLK_STS_IOERR;
253         }
254 }
255
256 static void nvme_retry_req(struct request *req)
257 {
258         struct nvme_ns *ns = req->q->queuedata;
259         unsigned long delay = 0;
260         u16 crd;
261
262         /* The mask and shift result must be <= 3 */
263         crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
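            /* crdt[] holds Command Retry Delay Times in units of 100 ms, so this is a delay in milliseconds */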
264         if (ns && crd)
265                 delay = ns->ctrl->crdt[crd - 1] * 100;
266
267         nvme_req(req)->retries++;
268         blk_mq_requeue_request(req, false);
269         blk_mq_delay_kick_requeue_list(req->q, delay);
270 }
271
272 enum nvme_disposition {
273         COMPLETE,
274         RETRY,
275         FAILOVER,
276 };
277
278 static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
279 {
280         if (likely(nvme_req(req)->status == 0))
281                 return COMPLETE;
282
283         if (blk_noretry_request(req) ||
284             (nvme_req(req)->status & NVME_SC_DNR) ||
285             nvme_req(req)->retries >= nvme_max_retries)
286                 return COMPLETE;
287
288         if (req->cmd_flags & REQ_NVME_MPATH) {
289                 if (nvme_is_path_error(nvme_req(req)->status) ||
290                     blk_queue_dying(req->q))
291                         return FAILOVER;
292         } else {
293                 if (blk_queue_dying(req->q))
294                         return COMPLETE;
295         }
296
297         return RETRY;
298 }
299
300 static inline void nvme_end_req(struct request *req)
301 {
302         blk_status_t status = nvme_error_status(nvme_req(req)->status);
303
304         if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
305             req_op(req) == REQ_OP_ZONE_APPEND)
306                 req->__sector = nvme_lba_to_sect(req->q->queuedata,
307                         le64_to_cpu(nvme_req(req)->result.u64));
308
309         nvme_trace_bio_complete(req, status);
310         blk_mq_end_request(req, status);
311 }
312
313 void nvme_complete_rq(struct request *req)
314 {
315         trace_nvme_complete_rq(req);
316         nvme_cleanup_cmd(req);
317
318         if (nvme_req(req)->ctrl->kas)
319                 nvme_req(req)->ctrl->comp_seen = true;
320
321         switch (nvme_decide_disposition(req)) {
322         case COMPLETE:
323                 nvme_end_req(req);
324                 return;
325         case RETRY:
326                 nvme_retry_req(req);
327                 return;
328         case FAILOVER:
329                 nvme_failover_req(req);
330                 return;
331         }
332 }
333 EXPORT_SYMBOL_GPL(nvme_complete_rq);
334
335 bool nvme_cancel_request(struct request *req, void *data, bool reserved)
336 {
337         dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
338                                 "Cancelling I/O %d", req->tag);
339
340         /* don't abort a request that has already completed */
341         if (blk_mq_request_completed(req))
342                 return true;
343
344         nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
345         blk_mq_complete_request(req);
346         return true;
347 }
348 EXPORT_SYMBOL_GPL(nvme_cancel_request);
349
350 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
351                 enum nvme_ctrl_state new_state)
352 {
353         enum nvme_ctrl_state old_state;
354         unsigned long flags;
355         bool changed = false;
356
357         spin_lock_irqsave(&ctrl->lock, flags);
358
359         old_state = ctrl->state;
360         switch (new_state) {
361         case NVME_CTRL_LIVE:
362                 switch (old_state) {
363                 case NVME_CTRL_NEW:
364                 case NVME_CTRL_RESETTING:
365                 case NVME_CTRL_CONNECTING:
366                         changed = true;
367                         fallthrough;
368                 default:
369                         break;
370                 }
371                 break;
372         case NVME_CTRL_RESETTING:
373                 switch (old_state) {
374                 case NVME_CTRL_NEW:
375                 case NVME_CTRL_LIVE:
376                         changed = true;
377                         fallthrough;
378                 default:
379                         break;
380                 }
381                 break;
382         case NVME_CTRL_CONNECTING:
383                 switch (old_state) {
384                 case NVME_CTRL_NEW:
385                 case NVME_CTRL_RESETTING:
386                         changed = true;
387                         fallthrough;
388                 default:
389                         break;
390                 }
391                 break;
392         case NVME_CTRL_DELETING:
393                 switch (old_state) {
394                 case NVME_CTRL_LIVE:
395                 case NVME_CTRL_RESETTING:
396                 case NVME_CTRL_CONNECTING:
397                         changed = true;
398                         fallthrough;
399                 default:
400                         break;
401                 }
402                 break;
403         case NVME_CTRL_DELETING_NOIO:
404                 switch (old_state) {
405                 case NVME_CTRL_DELETING:
406                 case NVME_CTRL_DEAD:
407                         changed = true;
408                         fallthrough;
409                 default:
410                         break;
411                 }
412                 break;
413         case NVME_CTRL_DEAD:
414                 switch (old_state) {
415                 case NVME_CTRL_DELETING:
416                         changed = true;
417                         fallthrough;
418                 default:
419                         break;
420                 }
421                 break;
422         default:
423                 break;
424         }
425
426         if (changed) {
427                 ctrl->state = new_state;
428                 wake_up_all(&ctrl->state_wq);
429         }
430
431         spin_unlock_irqrestore(&ctrl->lock, flags);
432         if (changed && ctrl->state == NVME_CTRL_LIVE)
433                 nvme_kick_requeue_lists(ctrl);
434         return changed;
435 }
436 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
437
438 /*
439  * Returns true for sink states that can't ever transition back to live.
440  */
441 static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
442 {
443         switch (ctrl->state) {
444         case NVME_CTRL_NEW:
445         case NVME_CTRL_LIVE:
446         case NVME_CTRL_RESETTING:
447         case NVME_CTRL_CONNECTING:
448                 return false;
449         case NVME_CTRL_DELETING:
450         case NVME_CTRL_DELETING_NOIO:
451         case NVME_CTRL_DEAD:
452                 return true;
453         default:
454                 WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
455                 return true;
456         }
457 }
458
459 /*
460  * Waits for the controller state to be resetting, or returns false if it is
461  * not possible to ever transition to that state.
462  */
463 bool nvme_wait_reset(struct nvme_ctrl *ctrl)
464 {
465         wait_event(ctrl->state_wq,
466                    nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
467                    nvme_state_terminal(ctrl));
468         return ctrl->state == NVME_CTRL_RESETTING;
469 }
470 EXPORT_SYMBOL_GPL(nvme_wait_reset);
471
472 static void nvme_free_ns_head(struct kref *ref)
473 {
474         struct nvme_ns_head *head =
475                 container_of(ref, struct nvme_ns_head, ref);
476
477         nvme_mpath_remove_disk(head);
478         ida_simple_remove(&head->subsys->ns_ida, head->instance);
479         cleanup_srcu_struct(&head->srcu);
480         nvme_put_subsystem(head->subsys);
481         kfree(head);
482 }
483
484 static void nvme_put_ns_head(struct nvme_ns_head *head)
485 {
486         kref_put(&head->ref, nvme_free_ns_head);
487 }
488
489 static void nvme_free_ns(struct kref *kref)
490 {
491         struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
492
493         if (ns->ndev)
494                 nvme_nvm_unregister(ns);
495
496         put_disk(ns->disk);
497         nvme_put_ns_head(ns->head);
498         nvme_put_ctrl(ns->ctrl);
499         kfree(ns);
500 }
501
502 void nvme_put_ns(struct nvme_ns *ns)
503 {
504         kref_put(&ns->kref, nvme_free_ns);
505 }
506 EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
507
508 static inline void nvme_clear_nvme_request(struct request *req)
509 {
510         if (!(req->rq_flags & RQF_DONTPREP)) {
511                 nvme_req(req)->retries = 0;
512                 nvme_req(req)->flags = 0;
513                 req->rq_flags |= RQF_DONTPREP;
514         }
515 }
516
517 struct request *nvme_alloc_request(struct request_queue *q,
518                 struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid)
519 {
520         unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
521         struct request *req;
522
523         if (qid == NVME_QID_ANY) {
524                 req = blk_mq_alloc_request(q, op, flags);
525         } else {
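                    /* qid 1 is the first I/O queue; hardware context indices are 0-based */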
526                 req = blk_mq_alloc_request_hctx(q, op, flags,
527                                 qid ? qid - 1 : 0);
528         }
529         if (IS_ERR(req))
530                 return req;
531
532         req->cmd_flags |= REQ_FAILFAST_DRIVER;
533         nvme_clear_nvme_request(req);
534         nvme_req(req)->cmd = cmd;
535
536         return req;
537 }
538 EXPORT_SYMBOL_GPL(nvme_alloc_request);
539
540 static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
541 {
542         struct nvme_command c;
543
544         memset(&c, 0, sizeof(c));
545
546         c.directive.opcode = nvme_admin_directive_send;
547         c.directive.nsid = cpu_to_le32(NVME_NSID_ALL);
548         c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
549         c.directive.dtype = NVME_DIR_IDENTIFY;
550         c.directive.tdtype = NVME_DIR_STREAMS;
551         c.directive.endir = enable ? NVME_DIR_ENDIR : 0;
552
553         return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0);
554 }
555
556 static int nvme_disable_streams(struct nvme_ctrl *ctrl)
557 {
558         return nvme_toggle_streams(ctrl, false);
559 }
560
561 static int nvme_enable_streams(struct nvme_ctrl *ctrl)
562 {
563         return nvme_toggle_streams(ctrl, true);
564 }
565
566 static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
567                                   struct streams_directive_params *s, u32 nsid)
568 {
569         struct nvme_command c;
570
571         memset(&c, 0, sizeof(c));
572         memset(s, 0, sizeof(*s));
573
574         c.directive.opcode = nvme_admin_directive_recv;
575         c.directive.nsid = cpu_to_le32(nsid);
576         c.directive.numd = cpu_to_le32(nvme_bytes_to_numd(sizeof(*s)));
577         c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
578         c.directive.dtype = NVME_DIR_STREAMS;
579
580         return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s));
581 }
582
583 static int nvme_configure_directives(struct nvme_ctrl *ctrl)
584 {
585         struct streams_directive_params s;
586         int ret;
587
588         if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
589                 return 0;
590         if (!streams)
591                 return 0;
592
593         ret = nvme_enable_streams(ctrl);
594         if (ret)
595                 return ret;
596
597         ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL);
598         if (ret)
599                 goto out_disable_stream;
600
601         ctrl->nssa = le16_to_cpu(s.nssa);
602         if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) {
603                 dev_info(ctrl->device, "too few streams (%u) available\n",
604                                         ctrl->nssa);
605                 goto out_disable_stream;
606         }
607
608         ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
609         dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
610         return 0;
611
612 out_disable_stream:
613         nvme_disable_streams(ctrl);
614         return ret;
615 }
616
617 /*
618  * Check if 'req' has a write hint associated with it. If it does, assign
619  * a valid namespace stream to the write.
620  */
621 static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
622                                      struct request *req, u16 *control,
623                                      u32 *dsmgmt)
624 {
625         enum rw_hint streamid = req->write_hint;
626
627         if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE)
628                 streamid = 0;
629         else {
630                 streamid--;
631                 if (WARN_ON_ONCE(streamid > ctrl->nr_streams))
632                         return;
633
634                 *control |= NVME_RW_DTYPE_STREAMS;
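                    /* the stream ID is carried in the Directive Specific field (upper 16 bits of dword 13) */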
635                 *dsmgmt |= streamid << 16;
636         }
637
638         if (streamid < ARRAY_SIZE(req->q->write_hints))
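            /* account per-stream traffic in 512-byte sectors, hence the shift by 9 */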
639                 req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
640 }
641
642 static void nvme_setup_passthrough(struct request *req,
643                 struct nvme_command *cmd)
644 {
645         memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
646         /* passthru commands should let the driver set the SGL flags */
647         cmd->common.flags &= ~NVME_CMD_SGL_ALL;
648 }
649
650 static inline void nvme_setup_flush(struct nvme_ns *ns,
651                 struct nvme_command *cmnd)
652 {
653         cmnd->common.opcode = nvme_cmd_flush;
654         cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
655 }
656
657 static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
658                 struct nvme_command *cmnd)
659 {
660         unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
661         struct nvme_dsm_range *range;
662         struct bio *bio;
663
664         /*
665          * Some devices do not consider the DSM 'Number of Ranges' field when
666          * determining how much data to DMA.  Always allocate memory for the
667          * maximum number of segments to prevent the device reading beyond the end of the buffer.
668          */
669         static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;
670
671         range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
672         if (!range) {
673                 /*
674                  * If we fail to allocate our range, fall back to the controller
675                  * discard page. If that's also busy, it's safe to return
676                  * busy, as we know we can make progress once that's freed.
677                  */
678                 if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
679                         return BLK_STS_RESOURCE;
680
681                 range = page_address(ns->ctrl->discard_page);
682         }
683
684         __rq_for_each_bio(bio, req) {
685                 u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
686                 u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
687
688                 if (n < segments) {
689                         range[n].cattr = cpu_to_le32(0);
690                         range[n].nlb = cpu_to_le32(nlb);
691                         range[n].slba = cpu_to_le64(slba);
692                 }
693                 n++;
694         }
695
696         if (WARN_ON_ONCE(n != segments)) {
697                 if (virt_to_page(range) == ns->ctrl->discard_page)
698                         clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
699                 else
700                         kfree(range);
701                 return BLK_STS_IOERR;
702         }
703
704         cmnd->dsm.opcode = nvme_cmd_dsm;
705         cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
706         cmnd->dsm.nr = cpu_to_le32(segments - 1);
707         cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
708
709         req->special_vec.bv_page = virt_to_page(range);
710         req->special_vec.bv_offset = offset_in_page(range);
711         req->special_vec.bv_len = alloc_size;
712         req->rq_flags |= RQF_SPECIAL_PAYLOAD;
713
714         return BLK_STS_OK;
715 }
716
717 static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
718                 struct request *req, struct nvme_command *cmnd)
719 {
720         if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
721                 return nvme_setup_discard(ns, req, cmnd);
722
723         cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
724         cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
725         cmnd->write_zeroes.slba =
726                 cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
727         cmnd->write_zeroes.length =
728                 cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
729         cmnd->write_zeroes.control = 0;
730         return BLK_STS_OK;
731 }
732
733 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
734                 struct request *req, struct nvme_command *cmnd,
735                 enum nvme_opcode op)
736 {
737         struct nvme_ctrl *ctrl = ns->ctrl;
738         u16 control = 0;
739         u32 dsmgmt = 0;
740
741         if (req->cmd_flags & REQ_FUA)
742                 control |= NVME_RW_FUA;
743         if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
744                 control |= NVME_RW_LR;
745
746         if (req->cmd_flags & REQ_RAHEAD)
747                 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
748
749         cmnd->rw.opcode = op;
750         cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
751         cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
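            /* NLB is a 0's based value, hence the - 1 below */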
752         cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
753
754         if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
755                 nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
756
757         if (ns->ms) {
758                 /*
759                  * If formatted with metadata, the block layer always provides a
760                  * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled.  Else
761                  * we enable the PRACT bit for protection information or set the
762                  * namespace capacity to zero to prevent any I/O.
763                  */
764                 if (!blk_integrity_rq(req)) {
765                         if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
766                                 return BLK_STS_NOTSUPP;
767                         control |= NVME_RW_PRINFO_PRACT;
768                 }
769
770                 switch (ns->pi_type) {
771                 case NVME_NS_DPS_PI_TYPE3:
772                         control |= NVME_RW_PRINFO_PRCHK_GUARD;
773                         break;
774                 case NVME_NS_DPS_PI_TYPE1:
775                 case NVME_NS_DPS_PI_TYPE2:
776                         control |= NVME_RW_PRINFO_PRCHK_GUARD |
777                                         NVME_RW_PRINFO_PRCHK_REF;
778                         if (op == nvme_cmd_zone_append)
779                                 control |= NVME_RW_APPEND_PIREMAP;
780                         cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
781                         break;
782                 }
783         }
784
785         cmnd->rw.control = cpu_to_le16(control);
786         cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
787         return BLK_STS_OK;
788 }
789
790 void nvme_cleanup_cmd(struct request *req)
791 {
792         if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
793                 struct nvme_ns *ns = req->rq_disk->private_data;
794                 struct page *page = req->special_vec.bv_page;
795
796                 if (page == ns->ctrl->discard_page)
797                         clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
798                 else
799                         kfree(page_address(page) + req->special_vec.bv_offset);
800         }
801 }
802 EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
803
804 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
805                 struct nvme_command *cmd)
806 {
807         blk_status_t ret = BLK_STS_OK;
808
809         nvme_clear_nvme_request(req);
810
811         memset(cmd, 0, sizeof(*cmd));
812         switch (req_op(req)) {
813         case REQ_OP_DRV_IN:
814         case REQ_OP_DRV_OUT:
815                 nvme_setup_passthrough(req, cmd);
816                 break;
817         case REQ_OP_FLUSH:
818                 nvme_setup_flush(ns, cmd);
819                 break;
820         case REQ_OP_ZONE_RESET_ALL:
821         case REQ_OP_ZONE_RESET:
822                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
823                 break;
824         case REQ_OP_ZONE_OPEN:
825                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
826                 break;
827         case REQ_OP_ZONE_CLOSE:
828                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
829                 break;
830         case REQ_OP_ZONE_FINISH:
831                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
832                 break;
833         case REQ_OP_WRITE_ZEROES:
834                 ret = nvme_setup_write_zeroes(ns, req, cmd);
835                 break;
836         case REQ_OP_DISCARD:
837                 ret = nvme_setup_discard(ns, req, cmd);
838                 break;
839         case REQ_OP_READ:
840                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
841                 break;
842         case REQ_OP_WRITE:
843                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
844                 break;
845         case REQ_OP_ZONE_APPEND:
846                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
847                 break;
848         default:
849                 WARN_ON_ONCE(1);
850                 return BLK_STS_IOERR;
851         }
852
853         cmd->common.command_id = req->tag;
854         trace_nvme_setup_cmd(req, cmd);
855         return ret;
856 }
857 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
858
859 static void nvme_end_sync_rq(struct request *rq, blk_status_t error)
860 {
861         struct completion *waiting = rq->end_io_data;
862
863         rq->end_io_data = NULL;
864         complete(waiting);
865 }
866
867 static void nvme_execute_rq_polled(struct request_queue *q,
868                 struct gendisk *bd_disk, struct request *rq, int at_head)
869 {
870         DECLARE_COMPLETION_ONSTACK(wait);
871
872         WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags));
873
874         rq->cmd_flags |= REQ_HIPRI;
875         rq->end_io_data = &wait;
876         blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq);
877
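            /* busy-poll the completion queue rather than sleeping on an interrupt */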
878         while (!completion_done(&wait)) {
879                 blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true);
880                 cond_resched();
881         }
882 }
883
884 /*
885  * Returns 0 on success.  If the result is negative, it's a Linux error code;
886  * if the result is positive, it's an NVM Express status code
887  */
888 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
889                 union nvme_result *result, void *buffer, unsigned bufflen,
890                 unsigned timeout, int qid, int at_head,
891                 blk_mq_req_flags_t flags, bool poll)
892 {
893         struct request *req;
894         int ret;
895
896         req = nvme_alloc_request(q, cmd, flags, qid);
897         if (IS_ERR(req))
898                 return PTR_ERR(req);
899
900         req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
901
902         if (buffer && bufflen) {
903                 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
904                 if (ret)
905                         goto out;
906         }
907
908         if (poll)
909                 nvme_execute_rq_polled(req->q, NULL, req, at_head);
910         else
911                 blk_execute_rq(req->q, NULL, req, at_head);
912         if (result)
913                 *result = nvme_req(req)->result;
914         if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
915                 ret = -EINTR;
916         else
917                 ret = nvme_req(req)->status;
918  out:
919         blk_mq_free_request(req);
920         return ret;
921 }
922 EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
923
924 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
925                 void *buffer, unsigned bufflen)
926 {
927         return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
928                         NVME_QID_ANY, 0, 0, false);
929 }
930 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
931
932 static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
933                 unsigned len, u32 seed, bool write)
934 {
935         struct bio_integrity_payload *bip;
936         int ret = -ENOMEM;
937         void *buf;
938
939         buf = kmalloc(len, GFP_KERNEL);
940         if (!buf)
941                 goto out;
942
943         ret = -EFAULT;
944         if (write && copy_from_user(buf, ubuf, len))
945                 goto out_free_meta;
946
947         bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
948         if (IS_ERR(bip)) {
949                 ret = PTR_ERR(bip);
950                 goto out_free_meta;
951         }
952
953         bip->bip_iter.bi_size = len;
954         bip->bip_iter.bi_sector = seed;
955         ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
956                         offset_in_page(buf));
957         if (ret == len)
958                 return buf;
959         ret = -ENOMEM;
960 out_free_meta:
961         kfree(buf);
962 out:
963         return ERR_PTR(ret);
964 }
965
966 static u32 nvme_known_admin_effects(u8 opcode)
967 {
968         switch (opcode) {
969         case nvme_admin_format_nvm:
970                 return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC |
971                         NVME_CMD_EFFECTS_CSE_MASK;
972         case nvme_admin_sanitize_nvm:
973                 return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK;
974         default:
975                 break;
976         }
977         return 0;
978 }
979
980 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
981 {
982         u32 effects = 0;
983
984         if (ns) {
985                 if (ns->head->effects)
986                         effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
987                 if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
988                         dev_warn(ctrl->device,
989                                  "IO command:%02x has unhandled effects:%08x\n",
990                                  opcode, effects);
991                 return 0;
992         }
993
994         if (ctrl->effects)
995                 effects = le32_to_cpu(ctrl->effects->acs[opcode]);
996         effects |= nvme_known_admin_effects(opcode);
997
998         return effects;
999 }
1000 EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
1001
1002 static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1003                                u8 opcode)
1004 {
1005         u32 effects = nvme_command_effects(ctrl, ns, opcode);
1006
1007         /*
1008          * For simplicity, IO to all namespaces is quiesced even if the command
1009          * effects say only one namespace is affected.
1010          */
1011         if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1012                 mutex_lock(&ctrl->scan_lock);
1013                 mutex_lock(&ctrl->subsys->lock);
1014                 nvme_mpath_start_freeze(ctrl->subsys);
1015                 nvme_mpath_wait_freeze(ctrl->subsys);
1016                 nvme_start_freeze(ctrl);
1017                 nvme_wait_freeze(ctrl);
1018         }
1019         return effects;
1020 }
1021
1022 static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
1023 {
1024         if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1025                 nvme_unfreeze(ctrl);
1026                 nvme_mpath_unfreeze(ctrl->subsys);
1027                 mutex_unlock(&ctrl->subsys->lock);
1028                 nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
1029                 mutex_unlock(&ctrl->scan_lock);
1030         }
1031         if (effects & NVME_CMD_EFFECTS_CCC)
1032                 nvme_init_identify(ctrl);
1033         if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
1034                 nvme_queue_scan(ctrl);
1035                 flush_work(&ctrl->scan_work);
1036         }
1037 }
1038
1039 void nvme_execute_passthru_rq(struct request *rq)
1040 {
1041         struct nvme_command *cmd = nvme_req(rq)->cmd;
1042         struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
1043         struct nvme_ns *ns = rq->q->queuedata;
1044         struct gendisk *disk = ns ? ns->disk : NULL;
1045         u32 effects;
1046
1047         effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
1048         blk_execute_rq(rq->q, disk, rq, 0);
1049         nvme_passthru_end(ctrl, effects);
1050 }
1051 EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
1052
1053 static int nvme_submit_user_cmd(struct request_queue *q,
1054                 struct nvme_command *cmd, void __user *ubuffer,
1055                 unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
1056                 u32 meta_seed, u64 *result, unsigned timeout)
1057 {
1058         bool write = nvme_is_write(cmd);
1059         struct nvme_ns *ns = q->queuedata;
1060         struct gendisk *disk = ns ? ns->disk : NULL;
1061         struct request *req;
1062         struct bio *bio = NULL;
1063         void *meta = NULL;
1064         int ret;
1065
1066         req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
1067         if (IS_ERR(req))
1068                 return PTR_ERR(req);
1069
1070         req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
1071         nvme_req(req)->flags |= NVME_REQ_USERCMD;
1072
1073         if (ubuffer && bufflen) {
1074                 ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
1075                                 GFP_KERNEL);
1076                 if (ret)
1077                         goto out;
1078                 bio = req->bio;
1079                 bio->bi_disk = disk;
1080                 if (disk && meta_buffer && meta_len) {
1081                         meta = nvme_add_user_metadata(bio, meta_buffer, meta_len,
1082                                         meta_seed, write);
1083                         if (IS_ERR(meta)) {
1084                                 ret = PTR_ERR(meta);
1085                                 goto out_unmap;
1086                         }
1087                         req->cmd_flags |= REQ_INTEGRITY;
1088                 }
1089         }
1090
1091         nvme_execute_passthru_rq(req);
1092         if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
1093                 ret = -EINTR;
1094         else
1095                 ret = nvme_req(req)->status;
1096         if (result)
1097                 *result = le64_to_cpu(nvme_req(req)->result.u64);
1098         if (meta && !ret && !write) {
1099                 if (copy_to_user(meta_buffer, meta, meta_len))
1100                         ret = -EFAULT;
1101         }
1102         kfree(meta);
1103  out_unmap:
1104         if (bio)
1105                 blk_rq_unmap_user(bio);
1106  out:
1107         blk_mq_free_request(req);
1108         return ret;
1109 }
1110
1111 static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
1112 {
1113         struct nvme_ctrl *ctrl = rq->end_io_data;
1114         unsigned long flags;
1115         bool startka = false;
1116
1117         blk_mq_free_request(rq);
1118
1119         if (status) {
1120                 dev_err(ctrl->device,
1121                         "failed nvme_keep_alive_end_io error=%d\n",
1122                                 status);
1123                 return;
1124         }
1125
1126         ctrl->comp_seen = false;
1127         spin_lock_irqsave(&ctrl->lock, flags);
1128         if (ctrl->state == NVME_CTRL_LIVE ||
1129             ctrl->state == NVME_CTRL_CONNECTING)
1130                 startka = true;
1131         spin_unlock_irqrestore(&ctrl->lock, flags);
1132         if (startka)
1133                 queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
1134 }
1135
1136 static int nvme_keep_alive(struct nvme_ctrl *ctrl)
1137 {
1138         struct request *rq;
1139
1140         rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, BLK_MQ_REQ_RESERVED,
1141                         NVME_QID_ANY);
1142         if (IS_ERR(rq))
1143                 return PTR_ERR(rq);
1144
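            /* ctrl->kato is in seconds; allow the command one full keep-alive interval (in jiffies) */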
1145         rq->timeout = ctrl->kato * HZ;
1146         rq->end_io_data = ctrl;
1147
1148         blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io);
1149
1150         return 0;
1151 }
1152
1153 static void nvme_keep_alive_work(struct work_struct *work)
1154 {
1155         struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
1156                         struct nvme_ctrl, ka_work);
1157         bool comp_seen = ctrl->comp_seen;
1158
1159         if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
1160                 dev_dbg(ctrl->device,
1161                         "reschedule traffic based keep-alive timer\n");
1162                 ctrl->comp_seen = false;
1163                 queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
1164                 return;
1165         }
1166
1167         if (nvme_keep_alive(ctrl)) {
1168                 /* allocation failure, reset the controller */
1169                 dev_err(ctrl->device, "keep-alive failed\n");
1170                 nvme_reset_ctrl(ctrl);
1171                 return;
1172         }
1173 }
1174
1175 static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
1176 {
1177         if (unlikely(ctrl->kato == 0))
1178                 return;
1179
1180         queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
1181 }
1182
1183 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
1184 {
1185         if (unlikely(ctrl->kato == 0))
1186                 return;
1187
1188         cancel_delayed_work_sync(&ctrl->ka_work);
1189 }
1190 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
1191
1192 /*
1193  * In NVMe 1.0 the CNS field was just a binary controller or namespace
1194  * flag, so sending any newer CNS value is likely to fail.
1195  * Qemu unfortunately had that bug while reporting 1.1 version compliance
1196  * (but not for any later version).
1197  */
1198 static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl)
1199 {
1200         if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)
1201                 return ctrl->vs < NVME_VS(1, 2, 0);
1202         return ctrl->vs < NVME_VS(1, 1, 0);
1203 }
1204
1205 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
1206 {
1207         struct nvme_command c = { };
1208         int error;
1209
1210         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1211         c.identify.opcode = nvme_admin_identify;
1212         c.identify.cns = NVME_ID_CNS_CTRL;
1213
1214         *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
1215         if (!*id)
1216                 return -ENOMEM;
1217
1218         error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
1219                         sizeof(struct nvme_id_ctrl));
1220         if (error)
1221                 kfree(*id);
1222         return error;
1223 }
1224
1225 static bool nvme_multi_css(struct nvme_ctrl *ctrl)
1226 {
1227         return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
1228 }
1229
1230 static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
1231                 struct nvme_ns_id_desc *cur, bool *csi_seen)
1232 {
1233         const char *warn_str = "ctrl returned bogus length:";
1234         void *data = cur;
1235
1236         switch (cur->nidt) {
1237         case NVME_NIDT_EUI64:
1238                 if (cur->nidl != NVME_NIDT_EUI64_LEN) {
1239                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
1240                                  warn_str, cur->nidl);
1241                         return -1;
1242                 }
1243                 memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
1244                 return NVME_NIDT_EUI64_LEN;
1245         case NVME_NIDT_NGUID:
1246                 if (cur->nidl != NVME_NIDT_NGUID_LEN) {
1247                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
1248                                  warn_str, cur->nidl);
1249                         return -1;
1250                 }
1251                 memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
1252                 return NVME_NIDT_NGUID_LEN;
1253         case NVME_NIDT_UUID:
1254                 if (cur->nidl != NVME_NIDT_UUID_LEN) {
1255                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
1256                                  warn_str, cur->nidl);
1257                         return -1;
1258                 }
1259                 uuid_copy(&ids->uuid, data + sizeof(*cur));
1260                 return NVME_NIDT_UUID_LEN;
1261         case NVME_NIDT_CSI:
1262                 if (cur->nidl != NVME_NIDT_CSI_LEN) {
1263                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
1264                                  warn_str, cur->nidl);
1265                         return -1;
1266                 }
1267                 memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
1268                 *csi_seen = true;
1269                 return NVME_NIDT_CSI_LEN;
1270         default:
1271                 /* Skip unknown types */
1272                 return cur->nidl;
1273         }
1274 }
1275
1276 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
1277                 struct nvme_ns_ids *ids)
1278 {
1279         struct nvme_command c = { };
1280         bool csi_seen = false;
1281         int status, pos, len;
1282         void *data;
1283
1284         if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
1285                 return 0;
1286
1287         c.identify.opcode = nvme_admin_identify;
1288         c.identify.nsid = cpu_to_le32(nsid);
1289         c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
1290
1291         data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
1292         if (!data)
1293                 return -ENOMEM;
1294
1295         status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
1296                                       NVME_IDENTIFY_DATA_SIZE);
1297         if (status) {
1298                 dev_warn(ctrl->device,
1299                         "Identify Descriptors failed (%d)\n", status);
1300                 goto free_data;
1301         }
1302
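            /* walk the returned descriptor list; a descriptor with nidl == 0 terminates it */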
1303         for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
1304                 struct nvme_ns_id_desc *cur = data + pos;
1305
1306                 if (cur->nidl == 0)
1307                         break;
1308
1309                 len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
1310                 if (len < 0)
1311                         break;
1312
1313                 len += sizeof(*cur);
1314         }
1315
1316         if (nvme_multi_css(ctrl) && !csi_seen) {
1317                 dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
1318                          nsid);
1319                 status = -EINVAL;
1320         }
1321
1322 free_data:
1323         kfree(data);
1324         return status;
1325 }
1326
1327 static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
1328 {
1329         struct nvme_command c = { };
1330
1331         c.identify.opcode = nvme_admin_identify;
1332         c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
1333         c.identify.nsid = cpu_to_le32(nsid);
1334         return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list,
1335                                     NVME_IDENTIFY_DATA_SIZE);
1336 }
1337
1338 static int nvme_identify_ns(struct nvme_ctrl *ctrl,
1339                 unsigned nsid, struct nvme_id_ns **id)
1340 {
1341         struct nvme_command c = { };
1342         int error;
1343
1344         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1345         c.identify.opcode = nvme_admin_identify;
1346         c.identify.nsid = cpu_to_le32(nsid);
1347         c.identify.cns = NVME_ID_CNS_NS;
1348
1349         *id = kmalloc(sizeof(**id), GFP_KERNEL);
1350         if (!*id)
1351                 return -ENOMEM;
1352
1353         error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
1354         if (error) {
1355                 dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
1356                 goto out_free_id;
1357         }
1358
1359         error = -ENODEV;
1360         if ((*id)->ncap == 0) /* namespace not allocated or attached */
1361                 goto out_free_id;
1362         return 0;
1363
1364 out_free_id:
1365         kfree(*id);
1366         return error;
1367 }
1368
1369 static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
1370                 unsigned int dword11, void *buffer, size_t buflen, u32 *result)
1371 {
1372         union nvme_result res = { 0 };
1373         struct nvme_command c;
1374         int ret;
1375
1376         memset(&c, 0, sizeof(c));
1377         c.features.opcode = op;
1378         c.features.fid = cpu_to_le32(fid);
1379         c.features.dword11 = cpu_to_le32(dword11);
1380
1381         ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
1382                         buffer, buflen, 0, NVME_QID_ANY, 0, 0, false);
1383         if (ret >= 0 && result)
1384                 *result = le32_to_cpu(res.u32);
1385         return ret;
1386 }
1387
1388 int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
1389                       unsigned int dword11, void *buffer, size_t buflen,
1390                       u32 *result)
1391 {
1392         return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
1393                              buflen, result);
1394 }
1395 EXPORT_SYMBOL_GPL(nvme_set_features);
1396
1397 int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
1398                       unsigned int dword11, void *buffer, size_t buflen,
1399                       u32 *result)
1400 {
1401         return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
1402                              buflen, result);
1403 }
1404 EXPORT_SYMBOL_GPL(nvme_get_features);
1405
1406 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
1407 {
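            /* NSQR (low 16 bits) and NCQR (high 16 bits) are both 0's based values */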
1408         u32 q_count = (*count - 1) | ((*count - 1) << 16);
1409         u32 result;
1410         int status, nr_io_queues;
1411
1412         status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
1413                         &result);
1414         if (status < 0)
1415                 return status;
1416
1417         /*
1418          * Degraded controllers might return an error when setting the queue
1419          * count.  We still want to be able to bring them online and offer
1420          * access to the admin queue, as that might be the only way to fix them up.
1421          */
1422         if (status > 0) {
1423                 dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
1424                 *count = 0;
1425         } else {
1426                 nr_io_queues = min(result & 0xffff, result >> 16) + 1;
1427                 *count = min(*count, nr_io_queues);
1428         }
1429
1430         return 0;
1431 }
1432 EXPORT_SYMBOL_GPL(nvme_set_queue_count);
1433
1434 #define NVME_AEN_SUPPORTED \
1435         (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
1436          NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
1437
1438 static void nvme_enable_aen(struct nvme_ctrl *ctrl)
1439 {
1440         u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED;
1441         int status;
1442
1443         if (!supported_aens)
1444                 return;
1445
1446         status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens,
1447                         NULL, 0, &result);
1448         if (status)
1449                 dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
1450                          supported_aens);
1451
1452         queue_work(nvme_wq, &ctrl->async_event_work);
1453 }
1454
1455 /*
1456  * Convert integer values from ioctl structures to user pointers, silently
1457  * ignoring the upper bits in the compat case to match the behaviour of 32-bit
1458  * kernels.
1459  */
1460 static void __user *nvme_to_user_ptr(uintptr_t ptrval)
1461 {
1462         if (in_compat_syscall())
1463                 ptrval = (compat_uptr_t)ptrval;
1464         return (void __user *)ptrval;
1465 }
1466
1467 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
1468 {
1469         struct nvme_user_io io;
1470         struct nvme_command c;
1471         unsigned length, meta_len;
1472         void __user *metadata;
1473
1474         if (copy_from_user(&io, uio, sizeof(io)))
1475                 return -EFAULT;
1476         if (io.flags)
1477                 return -EINVAL;
1478
1479         switch (io.opcode) {
1480         case nvme_cmd_write:
1481         case nvme_cmd_read:
1482         case nvme_cmd_compare:
1483                 break;
1484         default:
1485                 return -EINVAL;
1486         }
1487
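            /* io.nblocks is a 0's based count, so nblocks + 1 logical blocks are transferred */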
1488         length = (io.nblocks + 1) << ns->lba_shift;
1489         meta_len = (io.nblocks + 1) * ns->ms;
1490         metadata = nvme_to_user_ptr(io.metadata);
1491
1492         if (ns->features & NVME_NS_EXT_LBAS) {
1493                 length += meta_len;
1494                 meta_len = 0;
1495         } else if (meta_len) {
1496                 if ((io.metadata & 3) || !io.metadata)
1497                         return -EINVAL;
1498         }
1499
1500         memset(&c, 0, sizeof(c));
1501         c.rw.opcode = io.opcode;
1502         c.rw.flags = io.flags;
1503         c.rw.nsid = cpu_to_le32(ns->head->ns_id);
1504         c.rw.slba = cpu_to_le64(io.slba);
1505         c.rw.length = cpu_to_le16(io.nblocks);
1506         c.rw.control = cpu_to_le16(io.control);
1507         c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
1508         c.rw.reftag = cpu_to_le32(io.reftag);
1509         c.rw.apptag = cpu_to_le16(io.apptag);
1510         c.rw.appmask = cpu_to_le16(io.appmask);
1511
1512         return nvme_submit_user_cmd(ns->queue, &c,
1513                         nvme_to_user_ptr(io.addr), length,
1514                         metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
1515 }
1516
1517 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1518                         struct nvme_passthru_cmd __user *ucmd)
1519 {
1520         struct nvme_passthru_cmd cmd;
1521         struct nvme_command c;
1522         unsigned timeout = 0;
1523         u64 result;
1524         int status;
1525
1526         if (!capable(CAP_SYS_ADMIN))
1527                 return -EACCES;
1528         if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
1529                 return -EFAULT;
1530         if (cmd.flags)
1531                 return -EINVAL;
1532
1533         memset(&c, 0, sizeof(c));
1534         c.common.opcode = cmd.opcode;
1535         c.common.flags = cmd.flags;
1536         c.common.nsid = cpu_to_le32(cmd.nsid);
1537         c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1538         c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1539         c.common.cdw10 = cpu_to_le32(cmd.cdw10);
1540         c.common.cdw11 = cpu_to_le32(cmd.cdw11);
1541         c.common.cdw12 = cpu_to_le32(cmd.cdw12);
1542         c.common.cdw13 = cpu_to_le32(cmd.cdw13);
1543         c.common.cdw14 = cpu_to_le32(cmd.cdw14);
1544         c.common.cdw15 = cpu_to_le32(cmd.cdw15);
1545
1546         if (cmd.timeout_ms)
1547                 timeout = msecs_to_jiffies(cmd.timeout_ms);
1548
1549         status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
1550                         nvme_to_user_ptr(cmd.addr), cmd.data_len,
1551                         nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
1552                         0, &result, timeout);
1553
1554         if (status >= 0) {
1555                 if (put_user(result, &ucmd->result))
1556                         return -EFAULT;
1557         }
1558
1559         return status;
1560 }
1561
1562 static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1563                         struct nvme_passthru_cmd64 __user *ucmd)
1564 {
1565         struct nvme_passthru_cmd64 cmd;
1566         struct nvme_command c;
1567         unsigned timeout = 0;
1568         int status;
1569
1570         if (!capable(CAP_SYS_ADMIN))
1571                 return -EACCES;
1572         if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
1573                 return -EFAULT;
1574         if (cmd.flags)
1575                 return -EINVAL;
1576
1577         memset(&c, 0, sizeof(c));
1578         c.common.opcode = cmd.opcode;
1579         c.common.flags = cmd.flags;
1580         c.common.nsid = cpu_to_le32(cmd.nsid);
1581         c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1582         c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1583         c.common.cdw10 = cpu_to_le32(cmd.cdw10);
1584         c.common.cdw11 = cpu_to_le32(cmd.cdw11);
1585         c.common.cdw12 = cpu_to_le32(cmd.cdw12);
1586         c.common.cdw13 = cpu_to_le32(cmd.cdw13);
1587         c.common.cdw14 = cpu_to_le32(cmd.cdw14);
1588         c.common.cdw15 = cpu_to_le32(cmd.cdw15);
1589
1590         if (cmd.timeout_ms)
1591                 timeout = msecs_to_jiffies(cmd.timeout_ms);
1592
1593         status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
1594                         nvme_to_user_ptr(cmd.addr), cmd.data_len,
1595                         nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
1596                         0, &cmd.result, timeout);
1597
1598         if (status >= 0) {
1599                 if (put_user(cmd.result, &ucmd->result))
1600                         return -EFAULT;
1601         }
1602
1603         return status;
1604 }
1605
1606 /*
1607  * Issue ioctl requests on the first available path.  Note that unlike normal
1608  * block layer requests we will not retry a failed request on another controller.
1609  */
1610 struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
1611                 struct nvme_ns_head **head, int *srcu_idx)
1612 {
1613 #ifdef CONFIG_NVME_MULTIPATH
1614         if (disk->fops == &nvme_ns_head_ops) {
1615                 struct nvme_ns *ns;
1616
1617                 *head = disk->private_data;
1618                 *srcu_idx = srcu_read_lock(&(*head)->srcu);
1619                 ns = nvme_find_path(*head);
1620                 if (!ns)
1621                         srcu_read_unlock(&(*head)->srcu, *srcu_idx);
1622                 return ns;
1623         }
1624 #endif
1625         *head = NULL;
1626         *srcu_idx = -1;
1627         return disk->private_data;
1628 }
1629
1630 void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
1631 {
1632         if (head)
1633                 srcu_read_unlock(&head->srcu, idx);
1634 }
1635
1636 static bool is_ctrl_ioctl(unsigned int cmd)
1637 {
1638         if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD)
1639                 return true;
1640         if (is_sed_ioctl(cmd))
1641                 return true;
1642         return false;
1643 }
1644
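     /*
      * Controller ioctls take their own reference on the controller and
      * release the namespace SRCU reference early, so that a passthrough
      * command which deletes the namespace we entered through cannot
      * deadlock against this ioctl path.
      */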
1645 static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
1646                                   void __user *argp,
1647                                   struct nvme_ns_head *head,
1648                                   int srcu_idx)
1649 {
1650         struct nvme_ctrl *ctrl = ns->ctrl;
1651         int ret;
1652
1653         nvme_get_ctrl(ns->ctrl);
1654         nvme_put_ns_from_disk(head, srcu_idx);
1655
1656         switch (cmd) {
1657         case NVME_IOCTL_ADMIN_CMD:
1658                 ret = nvme_user_cmd(ctrl, NULL, argp);
1659                 break;
1660         case NVME_IOCTL_ADMIN64_CMD:
1661                 ret = nvme_user_cmd64(ctrl, NULL, argp);
1662                 break;
1663         default:
1664                 ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
1665                 break;
1666         }
1667         nvme_put_ctrl(ctrl);
1668         return ret;
1669 }
1670
1671 static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
1672                 unsigned int cmd, unsigned long arg)
1673 {
1674         struct nvme_ns_head *head = NULL;
1675         void __user *argp = (void __user *)arg;
1676         struct nvme_ns *ns;
1677         int srcu_idx, ret;
1678
1679         ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
1680         if (unlikely(!ns))
1681                 return -EWOULDBLOCK;
1682
1683         /*
1684          * Handle ioctls that apply to the controller instead of the namespace
1685          * separately and drop the ns SRCU reference early.  This avoids a
1686          * deadlock when deleting namespaces using the passthrough interface.
1687          */
1688         if (is_ctrl_ioctl(cmd))
1689                 return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
1690
1691         switch (cmd) {
1692         case NVME_IOCTL_ID:
1693                 force_successful_syscall_return();
1694                 ret = ns->head->ns_id;
1695                 break;
1696         case NVME_IOCTL_IO_CMD:
1697                 ret = nvme_user_cmd(ns->ctrl, ns, argp);
1698                 break;
1699         case NVME_IOCTL_SUBMIT_IO:
1700                 ret = nvme_submit_io(ns, argp);
1701                 break;
1702         case NVME_IOCTL_IO64_CMD:
1703                 ret = nvme_user_cmd64(ns->ctrl, ns, argp);
1704                 break;
1705         default:
1706                 if (ns->ndev)
1707                         ret = nvme_nvm_ioctl(ns, cmd, arg);
1708                 else
1709                         ret = -ENOTTY;
1710         }
1711
1712         nvme_put_ns_from_disk(head, srcu_idx);
1713         return ret;
1714 }
1715
1716 #ifdef CONFIG_COMPAT
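     /*
      * 32-bit layout of struct nvme_user_io.  Without the packed attribute a
      * 64-bit build pads the structure differently than a 32-bit build, so
      * sizeof() -- and therefore the _IOW() ioctl number derived from it --
      * would not match what 32-bit programs compute.
      */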
1717 struct nvme_user_io32 {
1718         __u8    opcode;
1719         __u8    flags;
1720         __u16   control;
1721         __u16   nblocks;
1722         __u16   rsvd;
1723         __u64   metadata;
1724         __u64   addr;
1725         __u64   slba;
1726         __u32   dsmgmt;
1727         __u32   reftag;
1728         __u16   apptag;
1729         __u16   appmask;
1730 } __attribute__((__packed__));
1731
1732 #define NVME_IOCTL_SUBMIT_IO32  _IOW('N', 0x42, struct nvme_user_io32)
1733
1734 static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
1735                 unsigned int cmd, unsigned long arg)
1736 {
1737         /*
1738          * NVME_IOCTL_SUBMIT_IO is encoded differently for 32-bit
1739          * programs and a 64-bit kernel.  The cause is that
1740          * sizeof(struct nvme_user_io), which is part of the
1741          * NVME_IOCTL_SUBMIT_IO encoding, is not the same when built
1742          * by a 32-bit compiler and by a 64-bit compiler.
1743          * NVME_IOCTL_SUBMIT_IO32 lets a 64-bit kernel handle
1744          * NVME_IOCTL_SUBMIT_IO as issued by 32-bit programs.
1745          * All other ioctl numbers are identical between 32-bit and
1746          * 64-bit builds, so they need no special handling here.
1747          */
1748         if (cmd == NVME_IOCTL_SUBMIT_IO32)
1749                 return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg);
1750
1751         return nvme_ioctl(bdev, mode, cmd, arg);
1752 }
1753 #else
1754 #define nvme_compat_ioctl       NULL
1755 #endif /* CONFIG_COMPAT */
1756
1757 static int nvme_open(struct block_device *bdev, fmode_t mode)
1758 {
1759         struct nvme_ns *ns = bdev->bd_disk->private_data;
1760
1761 #ifdef CONFIG_NVME_MULTIPATH
1762         /* should never be called due to GENHD_FL_HIDDEN */
1763         if (WARN_ON_ONCE(ns->head->disk))
1764                 goto fail;
1765 #endif
1766         if (!kref_get_unless_zero(&ns->kref))
1767                 goto fail;
1768         if (!try_module_get(ns->ctrl->ops->module))
1769                 goto fail_put_ns;
1770
1771         return 0;
1772
1773 fail_put_ns:
1774         nvme_put_ns(ns);
1775 fail:
1776         return -ENXIO;
1777 }
1778
1779 static void nvme_release(struct gendisk *disk, fmode_t mode)
1780 {
1781         struct nvme_ns *ns = disk->private_data;
1782
1783         module_put(ns->ctrl->ops->module);
1784         nvme_put_ns(ns);
1785 }
1786
1787 static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1788 {
1789         /* some standard values */
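             /*
              * 64 heads * 32 sectors/track = 2048 (1 << 11) sectors per
              * cylinder, hence the shift by 11 when deriving the cylinder
              * count from the disk capacity below.
              */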
1790         geo->heads = 1 << 6;
1791         geo->sectors = 1 << 5;
1792         geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
1793         return 0;
1794 }
1795
1796 #ifdef CONFIG_BLK_DEV_INTEGRITY
1797 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
1798                                 u32 max_integrity_segments)
1799 {
1800         struct blk_integrity integrity;
1801
1802         memset(&integrity, 0, sizeof(integrity));
1803         switch (pi_type) {
1804         case NVME_NS_DPS_PI_TYPE3:
1805                 integrity.profile = &t10_pi_type3_crc;
1806                 integrity.tag_size = sizeof(u16) + sizeof(u32);
1807                 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1808                 break;
1809         case NVME_NS_DPS_PI_TYPE1:
1810         case NVME_NS_DPS_PI_TYPE2:
1811                 integrity.profile = &t10_pi_type1_crc;
1812                 integrity.tag_size = sizeof(u16);
1813                 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1814                 break;
1815         default:
1816                 integrity.profile = NULL;
1817                 break;
1818         }
1819         integrity.tuple_size = ms;
1820         blk_integrity_register(disk, &integrity);
1821         blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
1822 }
1823 #else
1824 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
1825                                 u32 max_integrity_segments)
1826 {
1827 }
1828 #endif /* CONFIG_BLK_DEV_INTEGRITY */
1829
1830 static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
1831 {
1832         struct nvme_ctrl *ctrl = ns->ctrl;
1833         struct request_queue *queue = disk->queue;
1834         u32 size = queue_logical_block_size(queue);
1835
1836         if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) {
1837                 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue);
1838                 return;
1839         }
1840
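             /*
              * With streams enabled, scale the discard granularity from a
              * single logical block up to the stream write size times the
              * stream granularity size reported by the controller.
              */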
1841         if (ctrl->nr_streams && ns->sws && ns->sgs)
1842                 size *= ns->sws * ns->sgs;
1843
1844         BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
1845                         NVME_DSM_MAX_RANGES);
1846
1847         queue->limits.discard_alignment = 0;
1848         queue->limits.discard_granularity = size;
1849
1850         /* If discard is already enabled, don't reset queue limits */
1851         if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue))
1852                 return;
1853
1854         blk_queue_max_discard_sectors(queue, UINT_MAX);
1855         blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
1856
1857         if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
1858                 blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
1859 }
1860
1861 static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
1862 {
1863         u64 max_blocks;
1864
1865         if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) ||
1866             (ns->ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
1867                 return;
1868         /*
1869          * The NVMe spec states that MDTS does not apply to Write Zeroes:
1870          * "The restriction does not apply to commands that do not
1871          * transfer data between the host and the controller (e.g.,
1872          * Write Uncorrectable or Write Zeroes command)."
1873          * To be cautious, nevertheless cap Write Zeroes at the
1874          * controller's max_hw_sectors value, which is derived from the
1875          * controller's MDTS field in nvme_init_identify() when
1876          * available.
1877          */
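             /*
              * USHRT_MAX + 1 is the largest transfer that fits in the 16-bit,
              * 0's based NLB field of the command.
              */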
1878         if (ns->ctrl->max_hw_sectors == UINT_MAX)
1879                 max_blocks = (u64)USHRT_MAX + 1;
1880         else
1881                 max_blocks = ns->ctrl->max_hw_sectors + 1;
1882
1883         blk_queue_max_write_zeroes_sectors(disk->queue,
1884                                            nvme_lba_to_sect(ns, max_blocks));
1885 }
1886
1887 static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
1888                 struct nvme_id_ns *id, struct nvme_ns_ids *ids)
1889 {
1890         memset(ids, 0, sizeof(*ids));
1891
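             /*
              * EUI-64 was added in NVMe 1.1, NGUID in NVMe 1.2, and the
              * namespace identification descriptor list (which also carries
              * the UUID) in NVMe 1.3, hence the version checks below.
              */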
1892         if (ctrl->vs >= NVME_VS(1, 1, 0))
1893                 memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
1894         if (ctrl->vs >= NVME_VS(1, 2, 0))
1895                 memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
1896         if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl))
1897                 return nvme_identify_ns_descs(ctrl, nsid, ids);
1898         return 0;
1899 }
1900
1901 static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
1902 {
1903         return !uuid_is_null(&ids->uuid) ||
1904                 memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) ||
1905                 memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
1906 }
1907
1908 static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
1909 {
1910         return uuid_equal(&a->uuid, &b->uuid) &&
1911                 memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
1912                 memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
1913                 a->csi == b->csi;
1914 }
1915
1916 static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1917                                  u32 *phys_bs, u32 *io_opt)
1918 {
1919         struct streams_directive_params s;
1920         int ret;
1921
1922         if (!ctrl->nr_streams)
1923                 return 0;
1924
1925         ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id);
1926         if (ret)
1927                 return ret;
1928
1929         ns->sws = le32_to_cpu(s.sws);
1930         ns->sgs = le16_to_cpu(s.sgs);
1931
1932         if (ns->sws) {
1933                 *phys_bs = ns->sws * (1 << ns->lba_shift);
1934                 if (ns->sgs)
1935                         *io_opt = *phys_bs * ns->sgs;
1936         }
1937
1938         return 0;
1939 }
1940
1941 static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
1942 {
1943         struct nvme_ctrl *ctrl = ns->ctrl;
1944
1945         /*
1946          * The PI implementation requires the metadata size to be equal to the
1947          * t10 pi tuple size.
1948          */
1949         ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
1950         if (ns->ms == sizeof(struct t10_pi_tuple))
1951                 ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
1952         else
1953                 ns->pi_type = 0;
1954
1955         ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
1956         if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
1957                 return 0;
1958         if (ctrl->ops->flags & NVME_F_FABRICS) {
1959                 /*
1960                  * The NVMe over Fabrics specification only supports metadata as
1961                  * part of the extended data LBA.  We rely on HCA/HBA support to
1962                  * remap the separate metadata buffer from the block layer.
1963                  */
1964                 if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
1965                         return -EINVAL;
1966                 if (ctrl->max_integrity_segments)
1967                         ns->features |=
1968                                 (NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
1969         } else {
1970                 /*
1971                  * For PCIe controllers, we can't easily remap the separate
1972                  * metadata buffer from the block layer and thus require a
1973                  * separate metadata buffer for block layer metadata/PI support.
1974                  * We allow extended LBAs for the passthrough interface, though.
1975                  */
1976                 if (id->flbas & NVME_NS_FLBAS_META_EXT)
1977                         ns->features |= NVME_NS_EXT_LBAS;
1978                 else
1979                         ns->features |= NVME_NS_METADATA_SUPPORTED;
1980         }
1981
1982         return 0;
1983 }
1984
1985 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
1986                 struct request_queue *q)
1987 {
1988         bool vwc = false;
1989
1990         if (ctrl->max_hw_sectors) {
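                     /*
                      * max_hw_sectors is in 512-byte units; dividing by the
                      * number of 512-byte sectors per controller page gives
                      * the number of pages in a maximum-sized transfer, plus
                      * one extra segment in case the data is not page aligned.
                      */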
1991                 u32 max_segments =
1992                         (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
1993
1994                 max_segments = min_not_zero(max_segments, ctrl->max_segments);
1995                 blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
1996                 blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
1997         }
1998         blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
1999         blk_queue_dma_alignment(q, 7);
2000         if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
2001                 vwc = true;
2002         blk_queue_write_cache(q, vwc, vwc);
2003 }
2004
2005 static void nvme_update_disk_info(struct gendisk *disk,
2006                 struct nvme_ns *ns, struct nvme_id_ns *id)
2007 {
2008         sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
2009         unsigned short bs = 1 << ns->lba_shift;
2010         u32 atomic_bs, phys_bs, io_opt = 0;
2011
2012         /*
2013          * The block layer can't support LBA sizes larger than the page size
2014          * yet, so catch this early and don't allow block I/O.
2015          */
2016         if (ns->lba_shift > PAGE_SHIFT) {
2017                 capacity = 0;
2018                 bs = (1 << 9);
2019         }
2020
2021         blk_integrity_unregister(disk);
2022
2023         atomic_bs = phys_bs = bs;
2024         nvme_setup_streams_ns(ns->ctrl, ns, &phys_bs, &io_opt);
2025         if (id->nabo == 0) {
2026                 /*
2027                  * Bit 1 indicates whether NAWUPF is defined for this namespace
2028                  * and whether it should be used instead of AWUPF. If NAWUPF ==
2029                  * 0 then AWUPF must be used instead.
2030                  */
2031                 if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
2032                         atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
2033                 else
2034                         atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
2035         }
2036
2037         if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
2038                 /* NPWG = Namespace Preferred Write Granularity */
2039                 phys_bs = bs * (1 + le16_to_cpu(id->npwg));
2040                 /* NOWS = Namespace Optimal Write Size */
2041                 io_opt = bs * (1 + le16_to_cpu(id->nows));
2042         }
2043
2044         blk_queue_logical_block_size(disk->queue, bs);
2045         /*
2046          * Linux filesystems assume writing a single physical block is
2047          * an atomic operation. Hence limit the physical block size to the
2048          * value of the Atomic Write Unit Power Fail parameter.
2049          */
2050         blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
2051         blk_queue_io_min(disk->queue, phys_bs);
2052         blk_queue_io_opt(disk->queue, io_opt);
2053
2054         /*
2055          * If supported, register a block integrity profile: either a real PI
2056          * profile or plain non-integrity NVMe metadata masquerading as Type 0.
2057          * Otherwise reject block I/O to namespaces with metadata, except when
2058          * the namespace has PI, since the controller can strip/insert it then.
2059          */
2060         if (ns->ms) {
2061                 if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
2062                     (ns->features & NVME_NS_METADATA_SUPPORTED))
2063                         nvme_init_integrity(disk, ns->ms, ns->pi_type,
2064                                             ns->ctrl->max_integrity_segments);
2065                 else if (!nvme_ns_has_pi(ns))
2066                         capacity = 0;
2067         }
2068
2069         set_capacity_revalidate_and_notify(disk, capacity, false);
2070
2071         nvme_config_discard(disk, ns);
2072         nvme_config_write_zeroes(disk, ns);
2073
2074         if (id->nsattr & NVME_NS_ATTR_RO)
2075                 set_disk_ro(disk, true);
2076         else
2077                 set_disk_ro(disk, false);
2078 }
2079
2080 static inline bool nvme_first_scan(struct gendisk *disk)
2081 {
2082         /* nvme_alloc_ns() scans the disk prior to adding it */
2083         return !(disk->flags & GENHD_FL_UP);
2084 }
2085
2086 static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
2087 {
2088         struct nvme_ctrl *ctrl = ns->ctrl;
2089         u32 iob;
2090
2091         if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
2092             is_power_of_2(ctrl->max_hw_sectors))
2093                 iob = ctrl->max_hw_sectors;
2094         else
2095                 iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
2096
2097         if (!iob)
2098                 return;
2099
2100         if (!is_power_of_2(iob)) {
2101                 if (nvme_first_scan(ns->disk))
2102                         pr_warn("%s: ignoring unaligned IO boundary:%u\n",
2103                                 ns->disk->disk_name, iob);
2104                 return;
2105         }
2106
2107         if (blk_queue_is_zoned(ns->disk->queue)) {
2108                 if (nvme_first_scan(ns->disk))
2109                         pr_warn("%s: ignoring zoned namespace IO boundary\n",
2110                                 ns->disk->disk_name);
2111                 return;
2112         }
2113
2114         blk_queue_chunk_sectors(ns->queue, iob);
2115 }
2116
2117 static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
2118 {
2119         unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
2120         struct nvme_ctrl *ctrl = ns->ctrl;
2121         int ret;
2122
2123         blk_mq_freeze_queue(ns->disk->queue);
2124         ns->lba_shift = id->lbaf[lbaf].ds;
2125         nvme_set_queue_limits(ctrl, ns->queue);
2126
2127         switch (ns->head->ids.csi) {
2128         case NVME_CSI_NVM:
2129                 break;
2130         case NVME_CSI_ZNS:
2131                 ret = nvme_update_zone_info(ns, lbaf);
2132                 if (ret) {
2133                         dev_warn(ctrl->device,
2134                                 "failed to add zoned namespace:%u ret:%d\n",
2135                                 ns->head->ns_id, ret);
2136                         goto out_unfreeze;
2137                 }
2138                 break;
2139         default:
2140                 dev_warn(ctrl->device, "unknown csi:%u ns:%u\n",
2141                         ns->head->ids.csi, ns->head->ns_id);
2142                 ret = -ENODEV;
2143                 goto out_unfreeze;
2144         }
2145
2146         ret = nvme_configure_metadata(ns, id);
2147         if (ret)
2148                 goto out_unfreeze;
2149         nvme_set_chunk_sectors(ns, id);
2150         nvme_update_disk_info(ns->disk, ns, id);
2151         blk_mq_unfreeze_queue(ns->disk->queue);
2152
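             /*
              * For zoned namespaces, revalidate the zone bitmaps only after
              * the queue has been unfrozen, since blk_revalidate_disk_zones()
              * issues Report Zones commands on this very queue.
              */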
2153         if (blk_queue_is_zoned(ns->queue)) {
2154                 ret = nvme_revalidate_zones(ns);
2155                 if (ret)
2156                         return ret;
2157         }
2158
2159 #ifdef CONFIG_NVME_MULTIPATH
2160         if (ns->head->disk) {
2161                 blk_mq_freeze_queue(ns->head->disk->queue);
2162                 nvme_update_disk_info(ns->head->disk, ns, id);
2163                 blk_stack_limits(&ns->head->disk->queue->limits,
2164                                  &ns->queue->limits, 0);
2165                 blk_queue_update_readahead(ns->head->disk->queue);
2166                 nvme_update_bdev_size(ns->head->disk);
2167                 blk_mq_unfreeze_queue(ns->head->disk->queue);
2168         }
2169 #endif
2170         return 0;
2171
2172 out_unfreeze:
2173         blk_mq_unfreeze_queue(ns->disk->queue);
2174         return ret;
2175 }
2176
2177 static int nvme_validate_ns(struct nvme_ns *ns)
2178 {
2179         struct nvme_ctrl *ctrl = ns->ctrl;
2180         struct nvme_id_ns *id;
2181         struct nvme_ns_ids ids;
2182         int ret = 0;
2183
2184         if (test_bit(NVME_NS_DEAD, &ns->flags)) {
2185                 set_capacity(ns->disk, 0);
2186                 return -ENODEV;
2187         }
2188
2189         ret = nvme_identify_ns(ctrl, ns->head->ns_id, &id);
2190         if (ret)
2191                 goto out;
2192
2193         ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
2194         if (ret)
2195                 goto free_id;
2196
2197         if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
2198                 dev_err(ctrl->device,
2199                         "identifiers changed for nsid %d\n", ns->head->ns_id);
2200                 ret = -ENODEV;
2201                 goto free_id;
2202         }
2203
2204         ret = nvme_update_ns_info(ns, id);
2205 free_id:
2206         kfree(id);
2207 out:
2208         /*
2209          * Only fail the function if we got a fatal error back from the
2210          * device, otherwise ignore the error and just move on.
2211          */
2212         if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR)))
2213                 ret = 0;
2214         else if (ret > 0)
2215                 ret = blk_status_to_errno(nvme_error_status(ret));
2216         return ret;
2217 }
2218
2219 static char nvme_pr_type(enum pr_type type)
2220 {
2221         switch (type) {
2222         case PR_WRITE_EXCLUSIVE:
2223                 return 1;
2224         case PR_EXCLUSIVE_ACCESS:
2225                 return 2;
2226         case PR_WRITE_EXCLUSIVE_REG_ONLY:
2227                 return 3;
2228         case PR_EXCLUSIVE_ACCESS_REG_ONLY:
2229                 return 4;
2230         case PR_WRITE_EXCLUSIVE_ALL_REGS:
2231                 return 5;
2232         case PR_EXCLUSIVE_ACCESS_ALL_REGS:
2233                 return 6;
2234         default:
2235                 return 0;
2236         }
2237 };
2238
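     /*
      * Persistent reservation commands carry a 16-byte payload: 'key' in
      * bytes 0-7 and 'sa_key' in bytes 8-15, both little endian.  The
      * service-action specific cdw10 bits are set up by the callers below.
      */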
2239 static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
2240                                 u64 key, u64 sa_key, u8 op)
2241 {
2242         struct nvme_ns_head *head = NULL;
2243         struct nvme_ns *ns;
2244         struct nvme_command c;
2245         int srcu_idx, ret;
2246         u8 data[16] = { 0, };
2247
2248         ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
2249         if (unlikely(!ns))
2250                 return -EWOULDBLOCK;
2251
2252         put_unaligned_le64(key, &data[0]);
2253         put_unaligned_le64(sa_key, &data[8]);
2254
2255         memset(&c, 0, sizeof(c));
2256         c.common.opcode = op;
2257         c.common.nsid = cpu_to_le32(ns->head->ns_id);
2258         c.common.cdw10 = cpu_to_le32(cdw10);
2259
2260         ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
2261         nvme_put_ns_from_disk(head, srcu_idx);
2262         return ret;
2263 }
2264
2265 static int nvme_pr_register(struct block_device *bdev, u64 old,
2266                 u64 new, unsigned flags)
2267 {
2268         u32 cdw10;
2269
2270         if (flags & ~PR_FL_IGNORE_KEY)
2271                 return -EOPNOTSUPP;
2272
2273         cdw10 = old ? 2 : 0;
2274         cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
2275         cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
2276         return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
2277 }
2278
2279 static int nvme_pr_reserve(struct block_device *bdev, u64 key,
2280                 enum pr_type type, unsigned flags)
2281 {
2282         u32 cdw10;
2283
2284         if (flags & ~PR_FL_IGNORE_KEY)
2285                 return -EOPNOTSUPP;
2286
2287         cdw10 = nvme_pr_type(type) << 8;
2288         cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
2289         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
2290 }
2291
2292 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
2293                 enum pr_type type, bool abort)
2294 {
2295         u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
2296         return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
2297 }
2298
2299 static int nvme_pr_clear(struct block_device *bdev, u64 key)
2300 {
2301         u32 cdw10 = 1 | (key ? 1 << 3 : 0);
2302         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
2303 }
2304
2305 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
2306 {
2307         u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
2308         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
2309 }
2310
2311 static const struct pr_ops nvme_pr_ops = {
2312         .pr_register    = nvme_pr_register,
2313         .pr_reserve     = nvme_pr_reserve,
2314         .pr_release     = nvme_pr_release,
2315         .pr_preempt     = nvme_pr_preempt,
2316         .pr_clear       = nvme_pr_clear,
2317 };
2318
2319 #ifdef CONFIG_BLK_SED_OPAL
2320 int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
2321                 bool send)
2322 {
2323         struct nvme_ctrl *ctrl = data;
2324         struct nvme_command cmd;
2325
2326         memset(&cmd, 0, sizeof(cmd));
2327         if (send)
2328                 cmd.common.opcode = nvme_admin_security_send;
2329         else
2330                 cmd.common.opcode = nvme_admin_security_recv;
2331         cmd.common.nsid = 0;
2332         cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
2333         cmd.common.cdw11 = cpu_to_le32(len);
2334
2335         return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
2336                                       ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false);
2337 }
2338 EXPORT_SYMBOL_GPL(nvme_sec_submit);
2339 #endif /* CONFIG_BLK_SED_OPAL */
2340
2341 static const struct block_device_operations nvme_fops = {
2342         .owner          = THIS_MODULE,
2343         .ioctl          = nvme_ioctl,
2344         .compat_ioctl   = nvme_compat_ioctl,
2345         .open           = nvme_open,
2346         .release        = nvme_release,
2347         .getgeo         = nvme_getgeo,
2348         .report_zones   = nvme_report_zones,
2349         .pr_ops         = &nvme_pr_ops,
2350 };
2351
2352 #ifdef CONFIG_NVME_MULTIPATH
2353 static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
2354 {
2355         struct nvme_ns_head *head = bdev->bd_disk->private_data;
2356
2357         if (!kref_get_unless_zero(&head->ref))
2358                 return -ENXIO;
2359         return 0;
2360 }
2361
2362 static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
2363 {
2364         nvme_put_ns_head(disk->private_data);
2365 }
2366
2367 const struct block_device_operations nvme_ns_head_ops = {
2368         .owner          = THIS_MODULE,
2369         .submit_bio     = nvme_ns_head_submit_bio,
2370         .open           = nvme_ns_head_open,
2371         .release        = nvme_ns_head_release,
2372         .ioctl          = nvme_ioctl,
2373         .compat_ioctl   = nvme_compat_ioctl,
2374         .getgeo         = nvme_getgeo,
2375         .report_zones   = nvme_report_zones,
2376         .pr_ops         = &nvme_pr_ops,
2377 };
2378 #endif /* CONFIG_NVME_MULTIPATH */
2379
2380 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
2381 {
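             /*
              * CAP.TO is reported in units of 500 ms, so (TO + 1) * HZ / 2
              * converts the worst-case ready timeout to jiffies.
              */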
2382         unsigned long timeout =
2383                 ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
2384         u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
2385         int ret;
2386
2387         while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
2388                 if (csts == ~0)
2389                         return -ENODEV;
2390                 if ((csts & NVME_CSTS_RDY) == bit)
2391                         break;
2392
2393                 usleep_range(1000, 2000);
2394                 if (fatal_signal_pending(current))
2395                         return -EINTR;
2396                 if (time_after(jiffies, timeout)) {
2397                         dev_err(ctrl->device,
2398                                 "Device not ready; aborting %s, CSTS=0x%x\n",
2399                                 enabled ? "initialisation" : "reset", csts);
2400                         return -ENODEV;
2401                 }
2402         }
2403
2404         return ret;
2405 }
2406
2407 /*
2408  * If the device has been passed off to us in an enabled state, just clear
2409  * the enabled bit.  The spec says we should set the 'shutdown notification
2410  * bits', but doing so may cause the device to complete commands to the
2411  * admin queue ... and we don't know what memory that might be pointing at!
2412  */
2413 int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
2414 {
2415         int ret;
2416
2417         ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
2418         ctrl->ctrl_config &= ~NVME_CC_ENABLE;
2419
2420         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2421         if (ret)
2422                 return ret;
2423
2424         if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
2425                 msleep(NVME_QUIRK_DELAY_AMOUNT);
2426
2427         return nvme_wait_ready(ctrl, ctrl->cap, false);
2428 }
2429 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
2430
2431 int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
2432 {
2433         unsigned dev_page_min;
2434         int ret;
2435
2436         ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
2437         if (ret) {
2438                 dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
2439                 return ret;
2440         }
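             /*
              * CAP.MPSMIN encodes the smallest supported memory page size as
              * a power of two relative to 4 KiB, i.e. 2 ^ (12 + MPSMIN) bytes.
              */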
2441         dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
2442
2443         if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
2444                 dev_err(ctrl->device,
2445                         "Minimum device page size %u too large for host (%u)\n",
2446                         1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
2447                 return -ENODEV;
2448         }
2449
2450         if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
2451                 ctrl->ctrl_config = NVME_CC_CSS_CSI;
2452         else
2453                 ctrl->ctrl_config = NVME_CC_CSS_NVM;
2454         ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
2455         ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
2456         ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
2457         ctrl->ctrl_config |= NVME_CC_ENABLE;
2458
2459         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2460         if (ret)
2461                 return ret;
2462         return nvme_wait_ready(ctrl, ctrl->cap, true);
2463 }
2464 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
2465
2466 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
2467 {
2468         unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ);
2469         u32 csts;
2470         int ret;
2471
2472         ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
2473         ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
2474
2475         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2476         if (ret)
2477                 return ret;
2478
2479         while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
2480                 if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
2481                         break;
2482
2483                 msleep(100);
2484                 if (fatal_signal_pending(current))
2485                         return -EINTR;
2486                 if (time_after(jiffies, timeout)) {
2487                         dev_err(ctrl->device,
2488                                 "Device shutdown incomplete; abort shutdown\n");
2489                         return -ENODEV;
2490                 }
2491         }
2492
2493         return ret;
2494 }
2495 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
2496
2497 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
2498 {
2499         __le64 ts;
2500         int ret;
2501
2502         if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
2503                 return 0;
2504
2505         ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
2506         ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
2507                         NULL);
2508         if (ret)
2509                 dev_warn_once(ctrl->device,
2510                         "could not set timestamp (%d)\n", ret);
2511         return ret;
2512 }
2513
2514 static int nvme_configure_acre(struct nvme_ctrl *ctrl)
2515 {
2516         struct nvme_feat_host_behavior *host;
2517         int ret;
2518
2519         /* Don't bother enabling the feature if retry delay is not reported */
2520         if (!ctrl->crdt[0])
2521                 return 0;
2522
2523         host = kzalloc(sizeof(*host), GFP_KERNEL);
2524         if (!host)
2525                 return 0;
2526
2527         host->acre = NVME_ENABLE_ACRE;
2528         ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
2529                                 host, sizeof(*host), NULL);
2530         kfree(host);
2531         return ret;
2532 }
2533
2534 static int nvme_configure_apst(struct nvme_ctrl *ctrl)
2535 {
2536         /*
2537          * APST (Autonomous Power State Transition) lets us program a
2538          * table of power state transitions that the controller will
2539          * perform automatically.  We configure it with a simple
2540          * heuristic: we are willing to spend at most 2% of the time
2541          * transitioning between power states.  Therefore, when running
2542          * in any given state, we will enter the next lower-power
2543          * non-operational state after waiting 50 * (enlat + exlat)
2544          * microseconds, as long as that state's exit latency is under
2545          * the requested maximum latency.
2546          *
2547          * We will not autonomously enter any non-operational state for
2548          * which the total latency exceeds ps_max_latency_us.  Users
2549          * can set ps_max_latency_us to zero to turn off APST.
2550          */
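             /*
              * Illustrative example (arbitrary numbers): a non-operational
              * state with 1000 us entry latency and 1000 us exit latency is
              * idled into after 50 * 2000 us = 100 ms, which is what the
              * round-up division by 20 in the loop below expresses in
              * milliseconds.
              */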
2551
2552         unsigned apste;
2553         struct nvme_feat_auto_pst *table;
2554         u64 max_lat_us = 0;
2555         int max_ps = -1;
2556         int ret;
2557
2558         /*
2559          * If APST isn't supported or if we haven't been initialized yet,
2560          * then don't do anything.
2561          */
2562         if (!ctrl->apsta)
2563                 return 0;
2564
2565         if (ctrl->npss > 31) {
2566                 dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
2567                 return 0;
2568         }
2569
2570         table = kzalloc(sizeof(*table), GFP_KERNEL);
2571         if (!table)
2572                 return 0;
2573
2574         if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
2575                 /* Turn off APST. */
2576                 apste = 0;
2577                 dev_dbg(ctrl->device, "APST disabled\n");
2578         } else {
2579                 __le64 target = cpu_to_le64(0);
2580                 int state;
2581
2582                 /*
2583                  * Walk through all states from lowest- to highest-power.
2584                  * According to the spec, lower-numbered states use more
2585                  * power.  NPSS, despite the name, is the index of the
2586                  * lowest-power state, not the number of states.
2587                  */
2588                 for (state = (int)ctrl->npss; state >= 0; state--) {
2589                         u64 total_latency_us, exit_latency_us, transition_ms;
2590
2591                         if (target)
2592                                 table->entries[state] = target;
2593
2594                         /*
2595                          * Don't allow transitions to the deepest state
2596                          * if it's quirked off.
2597                          */
2598                         if (state == ctrl->npss &&
2599                             (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
2600                                 continue;
2601
2602                         /*
2603                          * Is this state a useful non-operational state for
2604                          * higher-power states to autonomously transition to?
2605                          */
2606                         if (!(ctrl->psd[state].flags &
2607                               NVME_PS_FLAGS_NON_OP_STATE))
2608                                 continue;
2609
2610                         exit_latency_us =
2611                                 (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
2612                         if (exit_latency_us > ctrl->ps_max_latency_us)
2613                                 continue;
2614
2615                         total_latency_us =
2616                                 exit_latency_us +
2617                                 le32_to_cpu(ctrl->psd[state].entry_lat);
2618
2619                         /*
2620                          * This state is good.  Use it as the APST idle
2621                          * target for higher power states.
2622                          */
2623                         transition_ms = total_latency_us + 19;
2624                         do_div(transition_ms, 20);
2625                         if (transition_ms > (1 << 24) - 1)
2626                                 transition_ms = (1 << 24) - 1;
2627
2628                         target = cpu_to_le64((state << 3) |
2629                                              (transition_ms << 8));
2630
2631                         if (max_ps == -1)
2632                                 max_ps = state;
2633
2634                         if (total_latency_us > max_lat_us)
2635                                 max_lat_us = total_latency_us;
2636                 }
2637
2638                 apste = 1;
2639
2640                 if (max_ps == -1) {
2641                         dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
2642                 } else {
2643                         dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
2644                                 max_ps, max_lat_us, (int)sizeof(*table), table);
2645                 }
2646         }
2647
2648         ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
2649                                 table, sizeof(*table), NULL);
2650         if (ret)
2651                 dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
2652
2653         kfree(table);
2654         return ret;
2655 }
2656
2657 static void nvme_set_latency_tolerance(struct device *dev, s32 val)
2658 {
2659         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2660         u64 latency;
2661
2662         switch (val) {
2663         case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
2664         case PM_QOS_LATENCY_ANY:
2665                 latency = U64_MAX;
2666                 break;
2667
2668         default:
2669                 latency = val;
2670         }
2671
2672         if (ctrl->ps_max_latency_us != latency) {
2673                 ctrl->ps_max_latency_us = latency;
2674                 nvme_configure_apst(ctrl);
2675         }
2676 }
2677
2678 struct nvme_core_quirk_entry {
2679         /*
2680          * NVMe model and firmware strings are padded with spaces.  For
2681          * simplicity, strings in the quirk table are padded with NULLs
2682          * instead.
2683          */
2684         u16 vid;
2685         const char *mn;
2686         const char *fr;
2687         unsigned long quirks;
2688 };
2689
2690 static const struct nvme_core_quirk_entry core_quirks[] = {
2691         {
2692                 /*
2693                  * This Toshiba device seems to die using any APST states.  See:
2694                  * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
2695                  */
2696                 .vid = 0x1179,
2697                 .mn = "THNSF5256GPUK TOSHIBA",
2698                 .quirks = NVME_QUIRK_NO_APST,
2699         },
2700         {
2701                 /*
2702                  * This LiteON CL1-3D*-Q11 firmware version has a race
2703                  * condition associated with actions related to suspend to idle.
2704                  * LiteON has resolved the problem in future firmware.
2705                  */
2706                 .vid = 0x14a4,
2707                 .fr = "22301111",
2708                 .quirks = NVME_QUIRK_SIMPLE_SUSPEND,
2709         }
2710 };
2711
2712 /* match is null-terminated but idstr is space-padded. */
2713 static bool string_matches(const char *idstr, const char *match, size_t len)
2714 {
2715         size_t matchlen;
2716
2717         if (!match)
2718                 return true;
2719
2720         matchlen = strlen(match);
2721         WARN_ON_ONCE(matchlen > len);
2722
2723         if (memcmp(idstr, match, matchlen))
2724                 return false;
2725
2726         for (; matchlen < len; matchlen++)
2727                 if (idstr[matchlen] != ' ')
2728                         return false;
2729
2730         return true;
2731 }
2732
2733 static bool quirk_matches(const struct nvme_id_ctrl *id,
2734                           const struct nvme_core_quirk_entry *q)
2735 {
2736         return q->vid == le16_to_cpu(id->vid) &&
2737                 string_matches(id->mn, q->mn, sizeof(id->mn)) &&
2738                 string_matches(id->fr, q->fr, sizeof(id->fr));
2739 }
2740
2741 static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
2742                 struct nvme_id_ctrl *id)
2743 {
2744         size_t nqnlen;
2745         int off;
2746
2747         if (!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
2748                 nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
2749                 if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
2750                         strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
2751                         return;
2752                 }
2753
2754                 if (ctrl->vs >= NVME_VS(1, 2, 1))
2755                         dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
2756         }
2757
2758         /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
2759         off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
2760                         "nqn.2014.08.org.nvmexpress:%04x%04x",
2761                         le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
2762         memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
2763         off += sizeof(id->sn);
2764         memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
2765         off += sizeof(id->mn);
2766         memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
2767 }
2768
2769 static void nvme_release_subsystem(struct device *dev)
2770 {
2771         struct nvme_subsystem *subsys =
2772                 container_of(dev, struct nvme_subsystem, dev);
2773
2774         if (subsys->instance >= 0)
2775                 ida_simple_remove(&nvme_instance_ida, subsys->instance);
2776         kfree(subsys);
2777 }
2778
2779 static void nvme_destroy_subsystem(struct kref *ref)
2780 {
2781         struct nvme_subsystem *subsys =
2782                         container_of(ref, struct nvme_subsystem, ref);
2783
2784         mutex_lock(&nvme_subsystems_lock);
2785         list_del(&subsys->entry);
2786         mutex_unlock(&nvme_subsystems_lock);
2787
2788         ida_destroy(&subsys->ns_ida);
2789         device_del(&subsys->dev);
2790         put_device(&subsys->dev);
2791 }
2792
2793 static void nvme_put_subsystem(struct nvme_subsystem *subsys)
2794 {
2795         kref_put(&subsys->ref, nvme_destroy_subsystem);
2796 }
2797
2798 static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
2799 {
2800         struct nvme_subsystem *subsys;
2801
2802         lockdep_assert_held(&nvme_subsystems_lock);
2803
2804          * Fail matches for discovery subsystems. This results in each
2805          * discovery controller being bound to a unique subsystem.
2806          * in each discovery controller bound to a unique subsystem.
2807          * This avoids issues with validating controller values
2808          * that can only be true when there is a single unique subsystem.
2809          * There may be multiple and completely independent entities
2810          * that provide discovery controllers.
2811          */
2812         if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
2813                 return NULL;
2814
2815         list_for_each_entry(subsys, &nvme_subsystems, entry) {
2816                 if (strcmp(subsys->subnqn, subsysnqn))
2817                         continue;
2818                 if (!kref_get_unless_zero(&subsys->ref))
2819                         continue;
2820                 return subsys;
2821         }
2822
2823         return NULL;
2824 }
2825
2826 #define SUBSYS_ATTR_RO(_name, _mode, _show)                     \
2827         struct device_attribute subsys_attr_##_name = \
2828                 __ATTR(_name, _mode, _show, NULL)
2829
2830 static ssize_t nvme_subsys_show_nqn(struct device *dev,
2831                                     struct device_attribute *attr,
2832                                     char *buf)
2833 {
2834         struct nvme_subsystem *subsys =
2835                 container_of(dev, struct nvme_subsystem, dev);
2836
2837         return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn);
2838 }
2839 static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
2840
2841 #define nvme_subsys_show_str_function(field)                            \
2842 static ssize_t subsys_##field##_show(struct device *dev,                \
2843                             struct device_attribute *attr, char *buf)   \
2844 {                                                                       \
2845         struct nvme_subsystem *subsys =                                 \
2846                 container_of(dev, struct nvme_subsystem, dev);          \
2847         return sprintf(buf, "%.*s\n",                                   \
2848                        (int)sizeof(subsys->field), subsys->field);      \
2849 }                                                                       \
2850 static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show);
2851
2852 nvme_subsys_show_str_function(model);
2853 nvme_subsys_show_str_function(serial);
2854 nvme_subsys_show_str_function(firmware_rev);
2855
2856 static struct attribute *nvme_subsys_attrs[] = {
2857         &subsys_attr_model.attr,
2858         &subsys_attr_serial.attr,
2859         &subsys_attr_firmware_rev.attr,
2860         &subsys_attr_subsysnqn.attr,
2861 #ifdef CONFIG_NVME_MULTIPATH
2862         &subsys_attr_iopolicy.attr,
2863 #endif
2864         NULL,
2865 };
2866
2867 static struct attribute_group nvme_subsys_attrs_group = {
2868         .attrs = nvme_subsys_attrs,
2869 };
2870
2871 static const struct attribute_group *nvme_subsys_attrs_groups[] = {
2872         &nvme_subsys_attrs_group,
2873         NULL,
2874 };
2875
2876 static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
2877                 struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2878 {
2879         struct nvme_ctrl *tmp;
2880
2881         lockdep_assert_held(&nvme_subsystems_lock);
2882
2883         list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
2884                 if (nvme_state_terminal(tmp))
2885                         continue;
2886
2887                 if (tmp->cntlid == ctrl->cntlid) {
2888                         dev_err(ctrl->device,
2889                                 "Duplicate cntlid %u with %s, rejecting\n",
2890                                 ctrl->cntlid, dev_name(tmp->device));
2891                         return false;
2892                 }
2893
2894                 if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
2895                     (ctrl->opts && ctrl->opts->discovery_nqn))
2896                         continue;
2897
2898                 dev_err(ctrl->device,
2899                         "Subsystem does not support multiple controllers\n");
2900                 return false;
2901         }
2902
2903         return true;
2904 }
2905
2906 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2907 {
2908         struct nvme_subsystem *subsys, *found;
2909         int ret;
2910
2911         subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
2912         if (!subsys)
2913                 return -ENOMEM;
2914
2915         subsys->instance = -1;
2916         mutex_init(&subsys->lock);
2917         kref_init(&subsys->ref);
2918         INIT_LIST_HEAD(&subsys->ctrls);
2919         INIT_LIST_HEAD(&subsys->nsheads);
2920         nvme_init_subnqn(subsys, ctrl, id);
2921         memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
2922         memcpy(subsys->model, id->mn, sizeof(subsys->model));
2923         memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
2924         subsys->vendor_id = le16_to_cpu(id->vid);
2925         subsys->cmic = id->cmic;
2926         subsys->awupf = le16_to_cpu(id->awupf);
2927 #ifdef CONFIG_NVME_MULTIPATH
2928         subsys->iopolicy = NVME_IOPOLICY_NUMA;
2929 #endif
2930
2931         subsys->dev.class = nvme_subsys_class;
2932         subsys->dev.release = nvme_release_subsystem;
2933         subsys->dev.groups = nvme_subsys_attrs_groups;
2934         dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
2935         device_initialize(&subsys->dev);
2936
2937         mutex_lock(&nvme_subsystems_lock);
2938         found = __nvme_find_get_subsystem(subsys->subnqn);
2939         if (found) {
2940                 put_device(&subsys->dev);
2941                 subsys = found;
2942
2943                 if (!nvme_validate_cntlid(subsys, ctrl, id)) {
2944                         ret = -EINVAL;
2945                         goto out_put_subsystem;
2946                 }
2947         } else {
2948                 ret = device_add(&subsys->dev);
2949                 if (ret) {
2950                         dev_err(ctrl->device,
2951                                 "failed to register subsystem device.\n");
2952                         put_device(&subsys->dev);
2953                         goto out_unlock;
2954                 }
2955                 ida_init(&subsys->ns_ida);
2956                 list_add_tail(&subsys->entry, &nvme_subsystems);
2957         }
2958
2959         ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
2960                                 dev_name(ctrl->device));
2961         if (ret) {
2962                 dev_err(ctrl->device,
2963                         "failed to create sysfs link from subsystem.\n");
2964                 goto out_put_subsystem;
2965         }
2966
2967         if (!found)
2968                 subsys->instance = ctrl->instance;
2969         ctrl->subsys = subsys;
2970         list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
2971         mutex_unlock(&nvme_subsystems_lock);
2972         return 0;
2973
2974 out_put_subsystem:
2975         nvme_put_subsystem(subsys);
2976 out_unlock:
2977         mutex_unlock(&nvme_subsystems_lock);
2978         return ret;
2979 }
2980
2981 int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
2982                 void *log, size_t size, u64 offset)
2983 {
2984         struct nvme_command c = { };
2985         u32 dwlen = nvme_bytes_to_numd(size);
2986
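             /*
              * The dword count is split across the 16-bit NUMDL (lower) and
              * NUMDU (upper) fields, and the byte offset is likewise split
              * across LPOL and LPOU.
              */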
2987         c.get_log_page.opcode = nvme_admin_get_log_page;
2988         c.get_log_page.nsid = cpu_to_le32(nsid);
2989         c.get_log_page.lid = log_page;
2990         c.get_log_page.lsp = lsp;
2991         c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
2992         c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
2993         c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
2994         c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
2995         c.get_log_page.csi = csi;
2996
2997         return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
2998 }
2999
3000 static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
3001                                 struct nvme_effects_log **log)
3002 {
3003         struct nvme_cel *cel = xa_load(&ctrl->cels, csi);
3004         int ret;
3005
3006         if (cel)
3007                 goto out;
3008
3009         cel = kzalloc(sizeof(*cel), GFP_KERNEL);
3010         if (!cel)
3011                 return -ENOMEM;
3012
3013         ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, csi,
3014                         &cel->log, sizeof(cel->log), 0);
3015         if (ret) {
3016                 kfree(cel);
3017                 return ret;
3018         }
3019
3020         cel->csi = csi;
3021         xa_store(&ctrl->cels, cel->csi, cel, GFP_KERNEL);
3022 out:
3023         *log = &cel->log;
3024         return 0;
3025 }
3026
3027 /*
3028  * Initialize the cached copies of the Identify data and various controller
3029  * registers in our nvme_ctrl structure.  This should be called as soon as
3030  * the admin queue is fully up and running.
3031  */
3032 int nvme_init_identify(struct nvme_ctrl *ctrl)
3033 {
3034         struct nvme_id_ctrl *id;
3035         int ret, page_shift;
3036         u32 max_hw_sectors;
3037         bool prev_apst_enabled;
3038
3039         ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
3040         if (ret) {
3041                 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
3042                 return ret;
3043         }
3044         page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
3045         ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
3046
3047         if (ctrl->vs >= NVME_VS(1, 1, 0))
3048                 ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
3049
3050         ret = nvme_identify_ctrl(ctrl, &id);
3051         if (ret) {
3052                 dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
3053                 return -EIO;
3054         }
3055
3056         if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
3057                 ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
3058                 if (ret < 0)
3059                         goto out_free;
3060         }
3061
3062         if (!(ctrl->ops->flags & NVME_F_FABRICS))
3063                 ctrl->cntlid = le16_to_cpu(id->cntlid);
3064
3065         if (!ctrl->identified) {
3066                 int i;
3067
3068                 ret = nvme_init_subsystem(ctrl, id);
3069                 if (ret)
3070                         goto out_free;
3071
3072                 /*
3073                  * Check for quirks.  Quirks can depend on firmware version,
3074                  * so, in principle, the set of quirks present can change
3075                  * across a reset.  As a possible future enhancement, we
3076                  * could re-scan for quirks every time we reinitialize
3077                  * the device, but we'd have to make sure that the driver
3078                  * behaves intelligently if the quirks change.
3079                  */
3080                 for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
3081                         if (quirk_matches(id, &core_quirks[i]))
3082                                 ctrl->quirks |= core_quirks[i].quirks;
3083                 }
3084         }
3085
3086         if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
3087                 dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
3088                 ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
3089         }
3090
3091         ctrl->crdt[0] = le16_to_cpu(id->crdt1);
3092         ctrl->crdt[1] = le16_to_cpu(id->crdt2);
3093         ctrl->crdt[2] = le16_to_cpu(id->crdt3);
3094
3095         ctrl->oacs = le16_to_cpu(id->oacs);
3096         ctrl->oncs = le16_to_cpu(id->oncs);
3097         ctrl->mtfa = le16_to_cpu(id->mtfa);
3098         ctrl->oaes = le32_to_cpu(id->oaes);
3099         ctrl->wctemp = le16_to_cpu(id->wctemp);
3100         ctrl->cctemp = le16_to_cpu(id->cctemp);
3101
3102         atomic_set(&ctrl->abort_limit, id->acl + 1);
3103         ctrl->vwc = id->vwc;
3104         if (id->mdts)
3105                 max_hw_sectors = 1 << (id->mdts + page_shift - 9);
3106         else
3107                 max_hw_sectors = UINT_MAX;
3108         ctrl->max_hw_sectors =
3109                 min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
3110
3111         nvme_set_queue_limits(ctrl, ctrl->admin_q);
3112         ctrl->sgls = le32_to_cpu(id->sgls);
3113         ctrl->kas = le16_to_cpu(id->kas);
3114         ctrl->max_namespaces = le32_to_cpu(id->mnan);
3115         ctrl->ctratt = le32_to_cpu(id->ctratt);
3116
3117         if (id->rtd3e) {
3118                 /* us -> s */
3119                 u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
3120
3121                 ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
3122                                                  shutdown_timeout, 60);
3123
3124                 if (ctrl->shutdown_timeout != shutdown_timeout)
3125                         dev_info(ctrl->device,
3126                                  "Shutdown timeout set to %u seconds\n",
3127                                  ctrl->shutdown_timeout);
3128         } else
3129                 ctrl->shutdown_timeout = shutdown_timeout;
3130
3131         ctrl->npss = id->npss;
3132         ctrl->apsta = id->apsta;
3133         prev_apst_enabled = ctrl->apst_enabled;
3134         if (ctrl->quirks & NVME_QUIRK_NO_APST) {
3135                 if (force_apst && id->apsta) {
3136                         dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
3137                         ctrl->apst_enabled = true;
3138                 } else {
3139                         ctrl->apst_enabled = false;
3140                 }
3141         } else {
3142                 ctrl->apst_enabled = id->apsta;
3143         }
3144         memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
3145
3146         if (ctrl->ops->flags & NVME_F_FABRICS) {
3147                 ctrl->icdoff = le16_to_cpu(id->icdoff);
3148                 ctrl->ioccsz = le32_to_cpu(id->ioccsz);
3149                 ctrl->iorcsz = le32_to_cpu(id->iorcsz);
3150                 ctrl->maxcmd = le16_to_cpu(id->maxcmd);
3151
3152                 /*
3153                  * In fabrics we need to verify the cntlid matches the
3154                  * admin connect
3155                  */
3156                 if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
3157                         dev_err(ctrl->device,
3158                                 "Mismatching cntlid: Connect %u vs Identify "
3159                                 "%u, rejecting\n",
3160                                 ctrl->cntlid, le16_to_cpu(id->cntlid));
3161                         ret = -EINVAL;
3162                         goto out_free;
3163                 }
3164
3165                 if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
3166                         dev_err(ctrl->device,
3167                                 "keep-alive support is mandatory for fabrics\n");
3168                         ret = -EINVAL;
3169                         goto out_free;
3170                 }
3171         } else {
3172                 ctrl->hmpre = le32_to_cpu(id->hmpre);
3173                 ctrl->hmmin = le32_to_cpu(id->hmmin);
3174                 ctrl->hmminds = le32_to_cpu(id->hmminds);
3175                 ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
3176         }
3177
3178         ret = nvme_mpath_init(ctrl, id);
3179         kfree(id);
3180
3181         if (ret < 0)
3182                 return ret;
3183
3184         if (ctrl->apst_enabled && !prev_apst_enabled)
3185                 dev_pm_qos_expose_latency_tolerance(ctrl->device);
3186         else if (!ctrl->apst_enabled && prev_apst_enabled)
3187                 dev_pm_qos_hide_latency_tolerance(ctrl->device);
3188
3189         ret = nvme_configure_apst(ctrl);
3190         if (ret < 0)
3191                 return ret;
3192
3193         ret = nvme_configure_timestamp(ctrl);
3194         if (ret < 0)
3195                 return ret;
3196
3197         ret = nvme_configure_directives(ctrl);
3198         if (ret < 0)
3199                 return ret;
3200
3201         ret = nvme_configure_acre(ctrl);
3202         if (ret < 0)
3203                 return ret;
3204
3205         if (!ctrl->identified)
3206                 nvme_hwmon_init(ctrl);
3207
3208         ctrl->identified = true;
3209
3210         return 0;
3211
3212 out_free:
3213         kfree(id);
3214         return ret;
3215 }
3216 EXPORT_SYMBOL_GPL(nvme_init_identify);
3217
3218 static int nvme_dev_open(struct inode *inode, struct file *file)
3219 {
3220         struct nvme_ctrl *ctrl =
3221                 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3222
3223         switch (ctrl->state) {
3224         case NVME_CTRL_LIVE:
3225                 break;
3226         default:
3227                 return -EWOULDBLOCK;
3228         }
3229
3230         if (!try_module_get(ctrl->ops->module))
3231                 return -EINVAL;
3232         nvme_get_ctrl(ctrl);
3233
3234         file->private_data = ctrl;
3235         return 0;
3236 }
3237
3238 static int nvme_dev_release(struct inode *inode, struct file *file)
3239 {
3240         struct nvme_ctrl *ctrl =
3241                 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3242
3243         module_put(ctrl->ops->module);
3244         nvme_put_ctrl(ctrl);
3245         return 0;
3246 }
3247
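/*
 * Handle the deprecated NVME_IOCTL_IO_CMD on the controller character
 * device.  The command is forwarded to the sole namespace; if more than
 * one namespace is present the ioctl is rejected with -EINVAL.
 */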
3248 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
3249 {
3250         struct nvme_ns *ns;
3251         int ret;
3252
3253         down_read(&ctrl->namespaces_rwsem);
3254         if (list_empty(&ctrl->namespaces)) {
3255                 ret = -ENOTTY;
3256                 goto out_unlock;
3257         }
3258
3259         ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
3260         if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
3261                 dev_warn(ctrl->device,
3262                         "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
3263                 ret = -EINVAL;
3264                 goto out_unlock;
3265         }
3266
3267         dev_warn(ctrl->device,
3268                 "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
3269         kref_get(&ns->kref);
3270         up_read(&ctrl->namespaces_rwsem);
3271
3272         ret = nvme_user_cmd(ctrl, ns, argp);
3273         nvme_put_ns(ns);
3274         return ret;
3275
3276 out_unlock:
3277         up_read(&ctrl->namespaces_rwsem);
3278         return ret;
3279 }
3280
3281 static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
3282                 unsigned long arg)
3283 {
3284         struct nvme_ctrl *ctrl = file->private_data;
3285         void __user *argp = (void __user *)arg;
3286
3287         switch (cmd) {
3288         case NVME_IOCTL_ADMIN_CMD:
3289                 return nvme_user_cmd(ctrl, NULL, argp);
3290         case NVME_IOCTL_ADMIN64_CMD:
3291                 return nvme_user_cmd64(ctrl, NULL, argp);
3292         case NVME_IOCTL_IO_CMD:
3293                 return nvme_dev_user_cmd(ctrl, argp);
3294         case NVME_IOCTL_RESET:
3295                 dev_warn(ctrl->device, "resetting controller\n");
3296                 return nvme_reset_ctrl_sync(ctrl);
3297         case NVME_IOCTL_SUBSYS_RESET:
3298                 return nvme_reset_subsystem(ctrl);
3299         case NVME_IOCTL_RESCAN:
3300                 nvme_queue_scan(ctrl);
3301                 return 0;
3302         default:
3303                 return -ENOTTY;
3304         }
3305 }
3306
3307 static const struct file_operations nvme_dev_fops = {
3308         .owner          = THIS_MODULE,
3309         .open           = nvme_dev_open,
3310         .release        = nvme_dev_release,
3311         .unlocked_ioctl = nvme_dev_ioctl,
3312         .compat_ioctl   = compat_ptr_ioctl,
3313 };
3314
3315 static ssize_t nvme_sysfs_reset(struct device *dev,
3316                                 struct device_attribute *attr, const char *buf,
3317                                 size_t count)
3318 {
3319         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3320         int ret;
3321
3322         ret = nvme_reset_ctrl_sync(ctrl);
3323         if (ret < 0)
3324                 return ret;
3325         return count;
3326 }
3327 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
3328
3329 static ssize_t nvme_sysfs_rescan(struct device *dev,
3330                                 struct device_attribute *attr, const char *buf,
3331                                 size_t count)
3332 {
3333         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3334
3335         nvme_queue_scan(ctrl);
3336         return count;
3337 }
3338 static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
3339
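/*
 * The ns-id attribute group is shared by the per-controller namespace block
 * devices and the multipath head device.  Dispatch on the disk's fops to
 * locate the nvme_ns_head in either case.
 */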
3340 static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev)
3341 {
3342         struct gendisk *disk = dev_to_disk(dev);
3343
3344         if (disk->fops == &nvme_fops)
3345                 return nvme_get_ns_from_dev(dev)->head;
3346         else
3347                 return disk->private_data;
3348 }
3349
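/*
 * Build a worldwide unique identifier for the namespace: prefer the UUID,
 * then the NGUID, then the EUI-64, and finally fall back to a
 * vendor/serial/model/nsid tuple with trailing padding stripped.
 */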
3350 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
3351                 char *buf)
3352 {
3353         struct nvme_ns_head *head = dev_to_ns_head(dev);
3354         struct nvme_ns_ids *ids = &head->ids;
3355         struct nvme_subsystem *subsys = head->subsys;
3356         int serial_len = sizeof(subsys->serial);
3357         int model_len = sizeof(subsys->model);
3358
3359         if (!uuid_is_null(&ids->uuid))
3360                 return sprintf(buf, "uuid.%pU\n", &ids->uuid);
3361
3362         if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
3363                 return sprintf(buf, "eui.%16phN\n", ids->nguid);
3364
3365         if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
3366                 return sprintf(buf, "eui.%8phN\n", ids->eui64);
3367
3368         while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||
3369                                   subsys->serial[serial_len - 1] == '\0'))
3370                 serial_len--;
3371         while (model_len > 0 && (subsys->model[model_len - 1] == ' ' ||
3372                                  subsys->model[model_len - 1] == '\0'))
3373                 model_len--;
3374
3375         return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
3376                 serial_len, subsys->serial, model_len, subsys->model,
3377                 head->ns_id);
3378 }
3379 static DEVICE_ATTR_RO(wwid);
3380
3381 static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
3382                 char *buf)
3383 {
3384         return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
3385 }
3386 static DEVICE_ATTR_RO(nguid);
3387
3388 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
3389                 char *buf)
3390 {
3391         struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
3392
3393         /* For backward compatibility expose the NGUID to userspace if
3394          * we have no UUID set
3395          */
3396         if (uuid_is_null(&ids->uuid)) {
3397                 printk_ratelimited(KERN_WARNING
3398                                    "No UUID available providing old NGUID\n");
3399                 return sprintf(buf, "%pU\n", ids->nguid);
3400         }
3401         return sprintf(buf, "%pU\n", &ids->uuid);
3402 }
3403 static DEVICE_ATTR_RO(uuid);
3404
3405 static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
3406                 char *buf)
3407 {
3408         return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
3409 }
3410 static DEVICE_ATTR_RO(eui);
3411
3412 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
3413                 char *buf)
3414 {
3415         return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
3416 }
3417 static DEVICE_ATTR_RO(nsid);
3418
3419 static struct attribute *nvme_ns_id_attrs[] = {
3420         &dev_attr_wwid.attr,
3421         &dev_attr_uuid.attr,
3422         &dev_attr_nguid.attr,
3423         &dev_attr_eui.attr,
3424         &dev_attr_nsid.attr,
3425 #ifdef CONFIG_NVME_MULTIPATH
3426         &dev_attr_ana_grpid.attr,
3427         &dev_attr_ana_state.attr,
3428 #endif
3429         NULL,
3430 };
3431
3432 static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
3433                 struct attribute *a, int n)
3434 {
3435         struct device *dev = container_of(kobj, struct device, kobj);
3436         struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
3437
3438         if (a == &dev_attr_uuid.attr) {
3439                 if (uuid_is_null(&ids->uuid) &&
3440                     !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
3441                         return 0;
3442         }
3443         if (a == &dev_attr_nguid.attr) {
3444                 if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
3445                         return 0;
3446         }
3447         if (a == &dev_attr_eui.attr) {
3448                 if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
3449                         return 0;
3450         }
3451 #ifdef CONFIG_NVME_MULTIPATH
3452         if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) {
3453                 if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */
3454                         return 0;
3455                 if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
3456                         return 0;
3457         }
3458 #endif
3459         return a->mode;
3460 }
3461
3462 static const struct attribute_group nvme_ns_id_attr_group = {
3463         .attrs          = nvme_ns_id_attrs,
3464         .is_visible     = nvme_ns_id_attrs_are_visible,
3465 };
3466
3467 const struct attribute_group *nvme_ns_id_attr_groups[] = {
3468         &nvme_ns_id_attr_group,
3469 #ifdef CONFIG_NVM
3470         &nvme_nvm_attr_group,
3471 #endif
3472         NULL,
3473 };
3474
3475 #define nvme_show_str_function(field)                                           \
3476 static ssize_t  field##_show(struct device *dev,                                \
3477                             struct device_attribute *attr, char *buf)           \
3478 {                                                                               \
3479         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);                          \
3480         return sprintf(buf, "%.*s\n",                                           \
3481                 (int)sizeof(ctrl->subsys->field), ctrl->subsys->field);         \
3482 }                                                                               \
3483 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
3484
3485 nvme_show_str_function(model);
3486 nvme_show_str_function(serial);
3487 nvme_show_str_function(firmware_rev);
3488
3489 #define nvme_show_int_function(field)                                           \
3490 static ssize_t  field##_show(struct device *dev,                                \
3491                             struct device_attribute *attr, char *buf)           \
3492 {                                                                               \
3493         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);                          \
3494         return sprintf(buf, "%d\n", ctrl->field);       \
3495 }                                                                               \
3496 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
3497
3498 nvme_show_int_function(cntlid);
3499 nvme_show_int_function(numa_node);
3500 nvme_show_int_function(queue_count);
3501 nvme_show_int_function(sqsize);
3502
3503 static ssize_t nvme_sysfs_delete(struct device *dev,
3504                                 struct device_attribute *attr, const char *buf,
3505                                 size_t count)
3506 {
3507         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3508
3509         if (device_remove_file_self(dev, attr))
3510                 nvme_delete_ctrl_sync(ctrl);
3511         return count;
3512 }
3513 static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
3514
3515 static ssize_t nvme_sysfs_show_transport(struct device *dev,
3516                                          struct device_attribute *attr,
3517                                          char *buf)
3518 {
3519         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3520
3521         return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name);
3522 }
3523 static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
3524
3525 static ssize_t nvme_sysfs_show_state(struct device *dev,
3526                                      struct device_attribute *attr,
3527                                      char *buf)
3528 {
3529         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3530         static const char *const state_name[] = {
3531                 [NVME_CTRL_NEW]         = "new",
3532                 [NVME_CTRL_LIVE]        = "live",
3533                 [NVME_CTRL_RESETTING]   = "resetting",
3534                 [NVME_CTRL_CONNECTING]  = "connecting",
3535                 [NVME_CTRL_DELETING]    = "deleting",
3536                 [NVME_CTRL_DELETING_NOIO] = "deleting (no IO)",
3537                 [NVME_CTRL_DEAD]        = "dead",
3538         };
3539
3540         if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
3541             state_name[ctrl->state])
3542                 return sprintf(buf, "%s\n", state_name[ctrl->state]);
3543
3544         return sprintf(buf, "unknown state\n");
3545 }
3546
3547 static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
3548
3549 static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
3550                                          struct device_attribute *attr,
3551                                          char *buf)
3552 {
3553         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3554
3555         return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn);
3556 }
3557 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
3558
3559 static ssize_t nvme_sysfs_show_hostnqn(struct device *dev,
3560                                         struct device_attribute *attr,
3561                                         char *buf)
3562 {
3563         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3564
3565         return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->opts->host->nqn);
3566 }
3567 static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL);
3568
3569 static ssize_t nvme_sysfs_show_hostid(struct device *dev,
3570                                         struct device_attribute *attr,
3571                                         char *buf)
3572 {
3573         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3574
3575         return snprintf(buf, PAGE_SIZE, "%pU\n", &ctrl->opts->host->id);
3576 }
3577 static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL);
3578
3579 static ssize_t nvme_sysfs_show_address(struct device *dev,
3580                                          struct device_attribute *attr,
3581                                          char *buf)
3582 {
3583         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3584
3585         return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
3586 }
3587 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
3588
3589 static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,
3590                 struct device_attribute *attr, char *buf)
3591 {
3592         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3593         struct nvmf_ctrl_options *opts = ctrl->opts;
3594
3595         if (ctrl->opts->max_reconnects == -1)
3596                 return sprintf(buf, "off\n");
3597         return sprintf(buf, "%d\n",
3598                         opts->max_reconnects * opts->reconnect_delay);
3599 }
3600
3601 static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,
3602                 struct device_attribute *attr, const char *buf, size_t count)
3603 {
3604         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3605         struct nvmf_ctrl_options *opts = ctrl->opts;
3606         int ctrl_loss_tmo, err;
3607
3608         err = kstrtoint(buf, 10, &ctrl_loss_tmo);
3609         if (err)
3610                 return -EINVAL;
3611
3612         if (ctrl_loss_tmo < 0)
3613                 opts->max_reconnects = -1;
3614         else
3615                 opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
3616                                                 opts->reconnect_delay);
3617         return count;
3618 }
3619 static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR,
3620         nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store);
3621
3622 static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,
3623                 struct device_attribute *attr, char *buf)
3624 {
3625         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3626
3627         if (ctrl->opts->reconnect_delay == -1)
3628                 return sprintf(buf, "off\n");
3629         return sprintf(buf, "%d\n", ctrl->opts->reconnect_delay);
3630 }
3631
3632 static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,
3633                 struct device_attribute *attr, const char *buf, size_t count)
3634 {
3635         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3636         unsigned int v;
3637         int err;
3638
3639         err = kstrtou32(buf, 10, &v);
3640         if (err)
3641                 return err;
3642
3643         ctrl->opts->reconnect_delay = v;
3644         return count;
3645 }
3646 static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,
3647         nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store);
3648
3649 static struct attribute *nvme_dev_attrs[] = {
3650         &dev_attr_reset_controller.attr,
3651         &dev_attr_rescan_controller.attr,
3652         &dev_attr_model.attr,
3653         &dev_attr_serial.attr,
3654         &dev_attr_firmware_rev.attr,
3655         &dev_attr_cntlid.attr,
3656         &dev_attr_delete_controller.attr,
3657         &dev_attr_transport.attr,
3658         &dev_attr_subsysnqn.attr,
3659         &dev_attr_address.attr,
3660         &dev_attr_state.attr,
3661         &dev_attr_numa_node.attr,
3662         &dev_attr_queue_count.attr,
3663         &dev_attr_sqsize.attr,
3664         &dev_attr_hostnqn.attr,
3665         &dev_attr_hostid.attr,
3666         &dev_attr_ctrl_loss_tmo.attr,
3667         &dev_attr_reconnect_delay.attr,
3668         NULL
3669 };
3670
3671 static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
3672                 struct attribute *a, int n)
3673 {
3674         struct device *dev = container_of(kobj, struct device, kobj);
3675         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3676
3677         if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
3678                 return 0;
3679         if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
3680                 return 0;
3681         if (a == &dev_attr_hostnqn.attr && !ctrl->opts)
3682                 return 0;
3683         if (a == &dev_attr_hostid.attr && !ctrl->opts)
3684                 return 0;
3685         if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts)
3686                 return 0;
3687         if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts)
3688                 return 0;
3689
3690         return a->mode;
3691 }
3692
3693 static struct attribute_group nvme_dev_attrs_group = {
3694         .attrs          = nvme_dev_attrs,
3695         .is_visible     = nvme_dev_attrs_are_visible,
3696 };
3697
3698 static const struct attribute_group *nvme_dev_attr_groups[] = {
3699         &nvme_dev_attrs_group,
3700         NULL,
3701 };
3702
3703 static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys,
3704                 unsigned nsid)
3705 {
3706         struct nvme_ns_head *h;
3707
3708         lockdep_assert_held(&subsys->lock);
3709
3710         list_for_each_entry(h, &subsys->nsheads, entry) {
3711                 if (h->ns_id == nsid && kref_get_unless_zero(&h->ref))
3712                         return h;
3713         }
3714
3715         return NULL;
3716 }
3717
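/*
 * Namespace identifiers must be unique within a subsystem.  Reject a new
 * namespace head whose identifiers collide with those of an existing head.
 */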
3718 static int __nvme_check_ids(struct nvme_subsystem *subsys,
3719                 struct nvme_ns_head *new)
3720 {
3721         struct nvme_ns_head *h;
3722
3723         lockdep_assert_held(&subsys->lock);
3724
3725         list_for_each_entry(h, &subsys->nsheads, entry) {
3726                 if (nvme_ns_ids_valid(&new->ids) &&
3727                     nvme_ns_ids_equal(&new->ids, &h->ids))
3728                         return -EINVAL;
3729         }
3730
3731         return 0;
3732 }
3733
3734 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
3735                 unsigned nsid, struct nvme_ns_ids *ids)
3736 {
3737         struct nvme_ns_head *head;
3738         size_t size = sizeof(*head);
3739         int ret = -ENOMEM;
3740
3741 #ifdef CONFIG_NVME_MULTIPATH
3742         size += num_possible_nodes() * sizeof(struct nvme_ns *);
3743 #endif
3744
3745         head = kzalloc(size, GFP_KERNEL);
3746         if (!head)
3747                 goto out;
3748         ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL);
3749         if (ret < 0)
3750                 goto out_free_head;
3751         head->instance = ret;
3752         INIT_LIST_HEAD(&head->list);
3753         ret = init_srcu_struct(&head->srcu);
3754         if (ret)
3755                 goto out_ida_remove;
3756         head->subsys = ctrl->subsys;
3757         head->ns_id = nsid;
3758         head->ids = *ids;
3759         kref_init(&head->ref);
3760
3761         ret = __nvme_check_ids(ctrl->subsys, head);
3762         if (ret) {
3763                 dev_err(ctrl->device,
3764                         "duplicate IDs for nsid %d\n", nsid);
3765                 goto out_cleanup_srcu;
3766         }
3767
3768         if (head->ids.csi) {
3769                 ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
3770                 if (ret)
3771                         goto out_cleanup_srcu;
3772         } else
3773                 head->effects = ctrl->effects;
3774
3775         ret = nvme_mpath_alloc_disk(ctrl, head);
3776         if (ret)
3777                 goto out_cleanup_srcu;
3778
3779         list_add_tail(&head->entry, &ctrl->subsys->nsheads);
3780
3781         kref_get(&ctrl->subsys->ref);
3782
3783         return head;
3784 out_cleanup_srcu:
3785         cleanup_srcu_struct(&head->srcu);
3786 out_ida_remove:
3787         ida_simple_remove(&ctrl->subsys->ns_ida, head->instance);
3788 out_free_head:
3789         kfree(head);
3790 out:
3791         if (ret > 0)
3792                 ret = blk_status_to_errno(nvme_error_status(ret));
3793         return ERR_PTR(ret);
3794 }
3795
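/*
 * Attach a namespace to its nvme_ns_head: reuse an existing shared head with
 * matching identifiers, or allocate a fresh one.  A private (unshared)
 * namespace must never end up with more than one path attached.
 */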
3796 static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
3797                 struct nvme_id_ns *id)
3798 {
3799         struct nvme_ctrl *ctrl = ns->ctrl;
3800         bool is_shared = id->nmic & NVME_NS_NMIC_SHARED;
3801         struct nvme_ns_head *head = NULL;
3802         struct nvme_ns_ids ids;
3803         int ret = 0;
3804
3805         ret = nvme_report_ns_ids(ctrl, nsid, id, &ids);
3806         if (ret) {
3807                 if (ret < 0)
3808                         return ret;
3809                 return blk_status_to_errno(nvme_error_status(ret));
3810         }
3811
3812         mutex_lock(&ctrl->subsys->lock);
3813         head = nvme_find_ns_head(ctrl->subsys, nsid);
3814         if (!head) {
3815                 head = nvme_alloc_ns_head(ctrl, nsid, &ids);
3816                 if (IS_ERR(head)) {
3817                         ret = PTR_ERR(head);
3818                         goto out_unlock;
3819                 }
3820                 head->shared = is_shared;
3821         } else {
3822                 ret = -EINVAL;
3823                 if (!is_shared || !head->shared) {
3824                         dev_err(ctrl->device,
3825                                 "Duplicate unshared namespace %d\n", nsid);
3826                         goto out_put_ns_head;
3827                 }
3828                 if (!nvme_ns_ids_equal(&head->ids, &ids)) {
3829                         dev_err(ctrl->device,
3830                                 "IDs don't match for shared namespace %d\n",
3831                                         nsid);
3832                         goto out_put_ns_head;
3833                 }
3834         }
3835
3836         list_add_tail(&ns->siblings, &head->list);
3837         ns->head = head;
3838         mutex_unlock(&ctrl->subsys->lock);
3839         return 0;
3840
3841 out_put_ns_head:
3842         nvme_put_ns_head(head);
3843 out_unlock:
3844         mutex_unlock(&ctrl->subsys->lock);
3845         return ret;
3846 }
3847
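/* list_sort() comparison helper: order namespaces by ascending NSID. */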
3848 static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
3849 {
3850         struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
3851         struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
3852
3853         return nsa->head->ns_id - nsb->head->ns_id;
3854 }
3855
3856 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3857 {
3858         struct nvme_ns *ns, *ret = NULL;
3859
3860         down_read(&ctrl->namespaces_rwsem);
3861         list_for_each_entry(ns, &ctrl->namespaces, list) {
3862                 if (ns->head->ns_id == nsid) {
3863                         if (!kref_get_unless_zero(&ns->kref))
3864                                 continue;
3865                         ret = ns;
3866                         break;
3867                 }
3868                 if (ns->head->ns_id > nsid)
3869                         break;
3870         }
3871         up_read(&ctrl->namespaces_rwsem);
3872         return ret;
3873 }
3874 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
3875
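/*
 * Allocate and register a new namespace: identify it, set up its request
 * queue and gendisk, attach it to (or create) its nvme_ns_head, and add it
 * to the controller's namespace list.
 */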
3876 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3877 {
3878         struct nvme_ns *ns;
3879         struct gendisk *disk;
3880         struct nvme_id_ns *id;
3881         char disk_name[DISK_NAME_LEN];
3882         int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret;
3883
3884         if (nvme_identify_ns(ctrl, nsid, &id))
3885                 return;
3886
3887         ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
3888         if (!ns)
3889                 goto out_free_id;
3890
3891         ns->queue = blk_mq_init_queue(ctrl->tagset);
3892         if (IS_ERR(ns->queue))
3893                 goto out_free_ns;
3894
3895         if (ctrl->opts && ctrl->opts->data_digest)
3896                 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
3897
3898         blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
3899         if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
3900                 blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
3901
3902         ns->queue->queuedata = ns;
3903         ns->ctrl = ctrl;
3904         kref_init(&ns->kref);
3905
3906         ret = nvme_init_ns_head(ns, nsid, id);
3907         if (ret)
3908                 goto out_free_queue;
3909         nvme_set_disk_name(disk_name, ns, ctrl, &flags);
3910
3911         disk = alloc_disk_node(0, node);
3912         if (!disk)
3913                 goto out_unlink_ns;
3914
3915         disk->fops = &nvme_fops;
3916         disk->private_data = ns;
3917         disk->queue = ns->queue;
3918         disk->flags = flags;
3919         memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
3920         ns->disk = disk;
3921
3922         if (nvme_update_ns_info(ns, id))
3923                 goto out_put_disk;
3924
3925         if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
3926                 ret = nvme_nvm_register(ns, disk_name, node);
3927                 if (ret) {
3928                         dev_warn(ctrl->device, "LightNVM init failure\n");
3929                         goto out_put_disk;
3930                 }
3931         }
3932
3933         down_write(&ctrl->namespaces_rwsem);
3934         list_add_tail(&ns->list, &ctrl->namespaces);
3935         up_write(&ctrl->namespaces_rwsem);
3936
3937         nvme_get_ctrl(ctrl);
3938
3939         device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups);
3940
3941         nvme_mpath_add_disk(ns, id);
3942         nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
3943         kfree(id);
3944
3945         return;
3946  out_put_disk:
3947         /* prevent double queue cleanup */
3948         ns->disk->queue = NULL;
3949         put_disk(ns->disk);
3950  out_unlink_ns:
3951         mutex_lock(&ctrl->subsys->lock);
3952         list_del_rcu(&ns->siblings);
3953         if (list_empty(&ns->head->list))
3954                 list_del_init(&ns->head->entry);
3955         mutex_unlock(&ctrl->subsys->lock);
3956         nvme_put_ns_head(ns->head);
3957  out_free_queue:
3958         blk_cleanup_queue(ns->queue);
3959  out_free_ns:
3960         kfree(ns);
3961  out_free_id:
3962         kfree(id);
3963 }
3964
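/*
 * Tear down a namespace: unlink it from its head, wait for concurrent
 * submissions to drain via RCU/SRCU, remove the gendisk, and finally drop
 * it from the controller's namespace list.
 */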
3965 static void nvme_ns_remove(struct nvme_ns *ns)
3966 {
3967         if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
3968                 return;
3969
3970         nvme_fault_inject_fini(&ns->fault_inject);
3971
3972         mutex_lock(&ns->ctrl->subsys->lock);
3973         list_del_rcu(&ns->siblings);
3974         if (list_empty(&ns->head->list))
3975                 list_del_init(&ns->head->entry);
3976         mutex_unlock(&ns->ctrl->subsys->lock);
3977
3978         synchronize_rcu(); /* guarantee not available in head->list */
3979         nvme_mpath_clear_current_path(ns);
3980         synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */
3981
3982         if (ns->disk->flags & GENHD_FL_UP) {
3983                 del_gendisk(ns->disk);
3984                 blk_cleanup_queue(ns->queue);
3985                 if (blk_get_integrity(ns->disk))
3986                         blk_integrity_unregister(ns->disk);
3987         }
3988
3989         down_write(&ns->ctrl->namespaces_rwsem);
3990         list_del_init(&ns->list);
3991         up_write(&ns->ctrl->namespaces_rwsem);
3992
3993         nvme_mpath_check_last_path(ns);
3994         nvme_put_ns(ns);
3995 }
3996
3997 static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
3998 {
3999         struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);
4000
4001         if (ns) {
4002                 nvme_ns_remove(ns);
4003                 nvme_put_ns(ns);
4004         }
4005 }
4006
4007 static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
4008 {
4009         struct nvme_ns *ns;
4010         int ret;
4011
4012         ns = nvme_find_get_ns(ctrl, nsid);
4013         if (!ns) {
4014                 nvme_alloc_ns(ctrl, nsid);
4015                 return;
4016         }
4017
4018         ret = nvme_validate_ns(ns);
4019         revalidate_disk_size(ns->disk, ret == 0);
4020         if (ret)
4021                 nvme_ns_remove(ns);
4022         nvme_put_ns(ns);
4023 }
4024
4025 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
4026                                         unsigned nsid)
4027 {
4028         struct nvme_ns *ns, *next;
4029         LIST_HEAD(rm_list);
4030
4031         down_write(&ctrl->namespaces_rwsem);
4032         list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
4033                 if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags))
4034                         list_move_tail(&ns->list, &rm_list);
4035         }
4036         up_write(&ctrl->namespaces_rwsem);
4037
4038         list_for_each_entry_safe(ns, next, &rm_list, list)
4039                 nvme_ns_remove(ns);
4041 }
4042
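/*
 * Scan namespaces using the Identify Active Namespace ID list.  Each reported
 * NSID is validated or newly allocated, and NSIDs skipped between list
 * entries are removed as no longer active.  Controllers with limited CNS
 * support opt out here and fall back to a sequential scan.
 */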
4043 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
4044 {
4045         const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
4046         __le32 *ns_list;
4047         u32 prev = 0;
4048         int ret = 0, i;
4049
4050         if (nvme_ctrl_limited_cns(ctrl))
4051                 return -EOPNOTSUPP;
4052
4053         ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
4054         if (!ns_list)
4055                 return -ENOMEM;
4056
4057         for (;;) {
4058                 ret = nvme_identify_ns_list(ctrl, prev, ns_list);
4059                 if (ret)
4060                         goto free;
4061
4062                 for (i = 0; i < nr_entries; i++) {
4063                         u32 nsid = le32_to_cpu(ns_list[i]);
4064
4065                         if (!nsid)      /* end of the list? */
4066                                 goto out;
4067                         nvme_validate_or_alloc_ns(ctrl, nsid);
4068                         while (++prev < nsid)
4069                                 nvme_ns_remove_by_nsid(ctrl, prev);
4070                 }
4071         }
4072  out:
4073         nvme_remove_invalid_namespaces(ctrl, prev);
4074  free:
4075         kfree(ns_list);
4076         return ret;
4077 }
4078
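/*
 * Fallback scan for controllers that cannot report a namespace ID list:
 * walk every NSID from 1 up to Number of Namespaces (NN) and validate or
 * allocate each one.
 */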
4079 static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
4080 {
4081         struct nvme_id_ctrl *id;
4082         u32 nn, i;
4083
4084         if (nvme_identify_ctrl(ctrl, &id))
4085                 return;
4086         nn = le32_to_cpu(id->nn);
4087         kfree(id);
4088
4089         for (i = 1; i <= nn; i++)
4090                 nvme_validate_or_alloc_ns(ctrl, i);
4091
4092         nvme_remove_invalid_namespaces(ctrl, nn);
4093 }
4094
4095 static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
4096 {
4097         size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
4098         __le32 *log;
4099         int error;
4100
4101         log = kzalloc(log_size, GFP_KERNEL);
4102         if (!log)
4103                 return;
4104
4105         /*
4106          * We need to read the log to clear the AEN, but we don't want to rely
4107          * on it for the changed namespace information as userspace could have
4108          * raced with us in reading the log page, which could cause us to miss
4109          * updates.
4110          */
4111         error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
4112                         NVME_CSI_NVM, log, log_size, 0);
4113         if (error)
4114                 dev_warn(ctrl->device,
4115                         "reading changed ns log failed: %d\n", error);
4116
4117         kfree(log);
4118 }
4119
4120 static void nvme_scan_work(struct work_struct *work)
4121 {
4122         struct nvme_ctrl *ctrl =
4123                 container_of(work, struct nvme_ctrl, scan_work);
4124
4125         /* No tagset on a live ctrl means I/O queues could not be created */
4126         if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
4127                 return;
4128
4129         if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
4130                 dev_info(ctrl->device, "rescanning namespaces.\n");
4131                 nvme_clear_changed_ns_log(ctrl);
4132         }
4133
4134         mutex_lock(&ctrl->scan_lock);
4135         if (nvme_scan_ns_list(ctrl) != 0)
4136                 nvme_scan_ns_sequential(ctrl);
4137         mutex_unlock(&ctrl->scan_lock);
4138
4139         down_write(&ctrl->namespaces_rwsem);
4140         list_sort(NULL, &ctrl->namespaces, ns_cmp);
4141         up_write(&ctrl->namespaces_rwsem);
4142 }
4143
4144 /*
4145  * This function iterates the namespace list unlocked to allow recovery from
4146  * controller failure. It is up to the caller to ensure the namespace list is
4147  * not modified by scan work while this function is executing.
4148  */
4149 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
4150 {
4151         struct nvme_ns *ns, *next;
4152         LIST_HEAD(ns_list);
4153
4154         /*
4155          * Make sure to requeue I/O to all namespaces, as such I/O
4156          * might result from the scan itself and must complete
4157          * for the scan_work to make progress.
4158          */
4159         nvme_mpath_clear_ctrl_paths(ctrl);
4160
4161         /* prevent racing with ns scanning */
4162         flush_work(&ctrl->scan_work);
4163
4164         /*
4165          * The dead state indicates the controller was not gracefully
4166          * disconnected. In that case, we won't be able to flush any data while
4167          * removing the namespaces' disks; fail all the queues now to avoid
4168          * potentially having to clean up the failed sync later.
4169          */
4170         if (ctrl->state == NVME_CTRL_DEAD)
4171                 nvme_kill_queues(ctrl);
4172
4173         /* this is a no-op when called from the controller reset handler */
4174         nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
4175
4176         down_write(&ctrl->namespaces_rwsem);
4177         list_splice_init(&ctrl->namespaces, &ns_list);
4178         up_write(&ctrl->namespaces_rwsem);
4179
4180         list_for_each_entry_safe(ns, next, &ns_list, list)
4181                 nvme_ns_remove(ns);
4182 }
4183 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
4184
4185 static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env)
4186 {
4187         struct nvme_ctrl *ctrl =
4188                 container_of(dev, struct nvme_ctrl, ctrl_device);
4189         struct nvmf_ctrl_options *opts = ctrl->opts;
4190         int ret;
4191
4192         ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
4193         if (ret)
4194                 return ret;
4195
4196         if (opts) {
4197                 ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
4198                 if (ret)
4199                         return ret;
4200
4201                 ret = add_uevent_var(env, "NVME_TRSVCID=%s",
4202                                 opts->trsvcid ?: "none");
4203                 if (ret)
4204                         return ret;
4205
4206                 ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
4207                                 opts->host_traddr ?: "none");
4208         }
4209         return ret;
4210 }
4211
4212 static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
4213 {
4214         char *envp[2] = { NULL, NULL };
4215         u32 aen_result = ctrl->aen_result;
4216
4217         ctrl->aen_result = 0;
4218         if (!aen_result)
4219                 return;
4220
4221         envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
4222         if (!envp[0])
4223                 return;
4224         kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
4225         kfree(envp[0]);
4226 }
4227
4228 static void nvme_async_event_work(struct work_struct *work)
4229 {
4230         struct nvme_ctrl *ctrl =
4231                 container_of(work, struct nvme_ctrl, async_event_work);
4232
4233         nvme_aen_uevent(ctrl);
4234         ctrl->ops->submit_async_event(ctrl);
4235 }
4236
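/*
 * Check whether an enabled controller reports Processing Paused (CSTS.PP),
 * treating a failed or all-ones CSTS read (dead device) as not paused.
 */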
4237 static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
4238 {
4240         u32 csts;
4241
4242         if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
4243                 return false;
4244
4245         if (csts == ~0)
4246                 return false;
4247
4248         return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
4249 }
4250
4251 static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
4252 {
4253         struct nvme_fw_slot_info_log *log;
4254
4255         log = kmalloc(sizeof(*log), GFP_KERNEL);
4256         if (!log)
4257                 return;
4258
4259         if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
4260                         log, sizeof(*log), 0))
4261                 dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
4262         kfree(log);
4263 }
4264
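/*
 * Firmware activation: quiesce I/O and poll CSTS.PP until the controller has
 * finished processing the new image, bounded by MTFA (reported in 100ms
 * units) when available, or the admin command timeout otherwise.  Schedule a
 * reset if the controller does not become ready in time.
 */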
4265 static void nvme_fw_act_work(struct work_struct *work)
4266 {
4267         struct nvme_ctrl *ctrl = container_of(work,
4268                                 struct nvme_ctrl, fw_act_work);
4269         unsigned long fw_act_timeout;
4270
4271         if (ctrl->mtfa)
4272                 fw_act_timeout = jiffies +
4273                                 msecs_to_jiffies(ctrl->mtfa * 100);
4274         else
4275                 fw_act_timeout = jiffies +
4276                                 msecs_to_jiffies(admin_timeout * 1000);
4277
4278         nvme_stop_queues(ctrl);
4279         while (nvme_ctrl_pp_status(ctrl)) {
4280                 if (time_after(jiffies, fw_act_timeout)) {
4281                         dev_warn(ctrl->device,
4282                                 "Fw activation timeout, reset controller\n");
4283                         nvme_try_sched_reset(ctrl);
4284                         return;
4285                 }
4286                 msleep(100);
4287         }
4288
4289         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
4290                 return;
4291
4292         nvme_start_queues(ctrl);
4293         /* read FW slot information to clear the AER */
4294         nvme_get_fw_slot_info(ctrl);
4295 }
4296
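/*
 * Handle a Notice-type asynchronous event.  Bits 15:8 of the completion
 * result carry the notice type; most notices just kick the corresponding
 * background worker (namespace scan, firmware activation, ANA update).
 */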
4297 static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
4298 {
4299         u32 aer_notice_type = (result & 0xff00) >> 8;
4300
4301         trace_nvme_async_event(ctrl, aer_notice_type);
4302
4303         switch (aer_notice_type) {
4304         case NVME_AER_NOTICE_NS_CHANGED:
4305                 set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
4306                 nvme_queue_scan(ctrl);
4307                 break;
4308         case NVME_AER_NOTICE_FW_ACT_STARTING:
4309                 /*
4310                  * We are (ab)using the RESETTING state to prevent subsequent
4311                  * recovery actions from interfering with the controller's
4312                  * firmware activation.
4313                  */
4314                 if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
4315                         queue_work(nvme_wq, &ctrl->fw_act_work);
4316                 break;
4317 #ifdef CONFIG_NVME_MULTIPATH
4318         case NVME_AER_NOTICE_ANA:
4319                 if (!ctrl->ana_log_buf)
4320                         break;
4321                 queue_work(nvme_wq, &ctrl->ana_work);
4322                 break;
4323 #endif
4324         case NVME_AER_NOTICE_DISC_CHANGED:
4325                 ctrl->aen_result = result;
4326                 break;
4327         default:
4328                 dev_warn(ctrl->device, "async event result %08x\n", result);
4329         }
4330 }
4331
4332 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
4333                 volatile union nvme_result *res)
4334 {
4335         u32 result = le32_to_cpu(res->u32);
4336         u32 aer_type = result & 0x07;
4337
4338         if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
4339                 return;
4340
4341         switch (aer_type) {
4342         case NVME_AER_NOTICE:
4343                 nvme_handle_aen_notice(ctrl, result);
4344                 break;
4345         case NVME_AER_ERROR:
4346         case NVME_AER_SMART:
4347         case NVME_AER_CSS:
4348         case NVME_AER_VS:
4349                 trace_nvme_async_event(ctrl, aer_type);
4350                 ctrl->aen_result = result;
4351                 break;
4352         default:
4353                 break;
4354         }
4355         queue_work(nvme_wq, &ctrl->async_event_work);
4356 }
4357 EXPORT_SYMBOL_GPL(nvme_complete_async_event);
4358
4359 void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
4360 {
4361         nvme_mpath_stop(ctrl);
4362         nvme_stop_keep_alive(ctrl);
4363         flush_work(&ctrl->async_event_work);
4364         cancel_work_sync(&ctrl->fw_act_work);
4365 }
4366 EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
4367
4368 void nvme_start_ctrl(struct nvme_ctrl *ctrl)
4369 {
4370         nvme_start_keep_alive(ctrl);
4371
4372         nvme_enable_aen(ctrl);
4373
4374         if (ctrl->queue_count > 1) {
4375                 nvme_queue_scan(ctrl);
4376                 nvme_start_queues(ctrl);
4377         }
4378 }
4379 EXPORT_SYMBOL_GPL(nvme_start_ctrl);
4380
4381 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
4382 {
4383         nvme_fault_inject_fini(&ctrl->fault_inject);
4384         dev_pm_qos_hide_latency_tolerance(ctrl->device);
4385         cdev_device_del(&ctrl->cdev, ctrl->device);
4386         nvme_put_ctrl(ctrl);
4387 }
4388 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
4389
4390 static void nvme_free_ctrl(struct device *dev)
4391 {
4392         struct nvme_ctrl *ctrl =
4393                 container_of(dev, struct nvme_ctrl, ctrl_device);
4394         struct nvme_subsystem *subsys = ctrl->subsys;
4395
4396         if (!subsys || ctrl->instance != subsys->instance)
4397                 ida_simple_remove(&nvme_instance_ida, ctrl->instance);
4398
4399         xa_destroy(&ctrl->cels);
4400
4401         nvme_mpath_uninit(ctrl);
4402         __free_page(ctrl->discard_page);
4403
4404         if (subsys) {
4405                 mutex_lock(&nvme_subsystems_lock);
4406                 list_del(&ctrl->subsys_entry);
4407                 sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
4408                 mutex_unlock(&nvme_subsystems_lock);
4409         }
4410
4411         ctrl->ops->free_ctrl(ctrl);
4412
4413         if (subsys)
4414                 nvme_put_subsystem(subsys);
4415 }
4416
4417 /*
4418  * Initialize an NVMe controller structure.  This needs to be called during
4419  * earliest initialization so that we have the initialized structure around
4420  * during probing.
4421  */
4422 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
4423                 const struct nvme_ctrl_ops *ops, unsigned long quirks)
4424 {
4425         int ret;
4426
4427         ctrl->state = NVME_CTRL_NEW;
4428         spin_lock_init(&ctrl->lock);
4429         mutex_init(&ctrl->scan_lock);
4430         INIT_LIST_HEAD(&ctrl->namespaces);
4431         xa_init(&ctrl->cels);
4432         init_rwsem(&ctrl->namespaces_rwsem);
4433         ctrl->dev = dev;
4434         ctrl->ops = ops;
4435         ctrl->quirks = quirks;
4436         ctrl->numa_node = NUMA_NO_NODE;
4437         INIT_WORK(&ctrl->scan_work, nvme_scan_work);
4438         INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
4439         INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
4440         INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
4441         init_waitqueue_head(&ctrl->state_wq);
4442
4443         INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
4444         memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
4445         ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
4446
4447         BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
4448                         PAGE_SIZE);
4449         ctrl->discard_page = alloc_page(GFP_KERNEL);
4450         if (!ctrl->discard_page) {
4451                 ret = -ENOMEM;
4452                 goto out;
4453         }
4454
4455         ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
4456         if (ret < 0)
4457                 goto out;
4458         ctrl->instance = ret;
4459
4460         device_initialize(&ctrl->ctrl_device);
4461         ctrl->device = &ctrl->ctrl_device;
4462         ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance);
4463         ctrl->device->class = nvme_class;
4464         ctrl->device->parent = ctrl->dev;
4465         ctrl->device->groups = nvme_dev_attr_groups;
4466         ctrl->device->release = nvme_free_ctrl;
4467         dev_set_drvdata(ctrl->device, ctrl);
4468         ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
4469         if (ret)
4470                 goto out_release_instance;
4471
4472         nvme_get_ctrl(ctrl);
4473         cdev_init(&ctrl->cdev, &nvme_dev_fops);
4474         ctrl->cdev.owner = ops->module;
4475         ret = cdev_device_add(&ctrl->cdev, ctrl->device);
4476         if (ret)
4477                 goto out_free_name;
4478
4479         /*
4480          * Initialize latency tolerance controls.  The sysfs files won't
4481          * be visible to userspace unless the device actually supports APST.
4482          */
4483         ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
4484         dev_pm_qos_update_user_latency_tolerance(ctrl->device,
4485                 min(default_ps_max_latency_us, (unsigned long)S32_MAX));
4486
4487         nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
4488
4489         return 0;
4490 out_free_name:
4491         nvme_put_ctrl(ctrl);
4492         kfree_const(ctrl->device->kobj.name);
4493 out_release_instance:
4494         ida_simple_remove(&nvme_instance_ida, ctrl->instance);
4495 out:
4496         if (ctrl->discard_page)
4497                 __free_page(ctrl->discard_page);
4498         return ret;
4499 }
4500 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
4501
4502 /**
4503  * nvme_kill_queues - End all namespace queues
4504  * @ctrl: the dead controller that needs to end
4505  *
4506  * Call this function when the driver determines it is unable to bring the
4507  * controller into a state capable of servicing I/O.
4508  */
4509 void nvme_kill_queues(struct nvme_ctrl *ctrl)
4510 {
4511         struct nvme_ns *ns;
4512
4513         down_read(&ctrl->namespaces_rwsem);
4514
4515         /* Forcibly unquiesce queues to avoid blocking dispatch */
4516         if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
4517                 blk_mq_unquiesce_queue(ctrl->admin_q);
4518
4519         list_for_each_entry(ns, &ctrl->namespaces, list)
4520                 nvme_set_queue_dying(ns);
4521
4522         up_read(&ctrl->namespaces_rwsem);
4523 }
4524 EXPORT_SYMBOL_GPL(nvme_kill_queues);
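/*
 * Illustrative sketch (an assumption about typical callers, not defined
 * here): a transport calls nvme_kill_queues() once it decides the
 * controller is dead, so that queued and future I/O fails immediately
 * instead of hanging on a quiesced queue.  Roughly:
 *
 *	nvme_change_ctrl_state(&foo->ctrl, NVME_CTRL_DEAD);
 *	foo_disable_ctrl(foo);			(hypothetical transport teardown)
 *	nvme_kill_queues(&foo->ctrl);
 *	nvme_remove_namespaces(&foo->ctrl);
 */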
4525
4526 void nvme_unfreeze(struct nvme_ctrl *ctrl)
4527 {
4528         struct nvme_ns *ns;
4529
4530         down_read(&ctrl->namespaces_rwsem);
4531         list_for_each_entry(ns, &ctrl->namespaces, list)
4532                 blk_mq_unfreeze_queue(ns->queue);
4533         up_read(&ctrl->namespaces_rwsem);
4534 }
4535 EXPORT_SYMBOL_GPL(nvme_unfreeze);
4536
4537 int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
4538 {
4539         struct nvme_ns *ns;
4540
4541         down_read(&ctrl->namespaces_rwsem);
4542         list_for_each_entry(ns, &ctrl->namespaces, list) {
4543                 timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
4544                 if (timeout <= 0)
4545                         break;
4546         }
4547         up_read(&ctrl->namespaces_rwsem);
4548         return timeout;
4549 }
4550 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
4551
4552 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
4553 {
4554         struct nvme_ns *ns;
4555
4556         down_read(&ctrl->namespaces_rwsem);
4557         list_for_each_entry(ns, &ctrl->namespaces, list)
4558                 blk_mq_freeze_queue_wait(ns->queue);
4559         up_read(&ctrl->namespaces_rwsem);
4560 }
4561 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
4562
4563 void nvme_start_freeze(struct nvme_ctrl *ctrl)
4564 {
4565         struct nvme_ns *ns;
4566
4567         down_read(&ctrl->namespaces_rwsem);
4568         list_for_each_entry(ns, &ctrl->namespaces, list)
4569                 blk_freeze_queue_start(ns->queue);
4570         up_read(&ctrl->namespaces_rwsem);
4571 }
4572 EXPORT_SYMBOL_GPL(nvme_start_freeze);
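/*
 * Illustrative ordering sketch, based on how the in-tree transports use the
 * freeze helpers above (an assumption, not a contract defined in this file):
 * freezing is used around a reset so that queue limits and the tag set can
 * be updated while no request is in flight:
 *
 *	nvme_start_freeze(ctrl);			(stop new requests entering)
 *	nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT);
 *	nvme_stop_queues(ctrl);				(quiesce dispatch)
 *	...reset the controller and re-create the I/O queues...
 *	nvme_start_queues(ctrl);
 *	nvme_wait_freeze(ctrl);				(wait for outstanding requests)
 *	...update queue limits / tag set while fully frozen...
 *	nvme_unfreeze(ctrl);
 */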
4573
4574 void nvme_stop_queues(struct nvme_ctrl *ctrl)
4575 {
4576         struct nvme_ns *ns;
4577
4578         down_read(&ctrl->namespaces_rwsem);
4579         list_for_each_entry(ns, &ctrl->namespaces, list)
4580                 blk_mq_quiesce_queue(ns->queue);
4581         up_read(&ctrl->namespaces_rwsem);
4582 }
4583 EXPORT_SYMBOL_GPL(nvme_stop_queues);
4584
4585 void nvme_start_queues(struct nvme_ctrl *ctrl)
4586 {
4587         struct nvme_ns *ns;
4588
4589         down_read(&ctrl->namespaces_rwsem);
4590         list_for_each_entry(ns, &ctrl->namespaces, list)
4591                 blk_mq_unquiesce_queue(ns->queue);
4592         up_read(&ctrl->namespaces_rwsem);
4593 }
4594 EXPORT_SYMBOL_GPL(nvme_start_queues);
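/*
 * Clarifying note (editorial, not from the original source): quiescing with
 * nvme_stop_queues() only pauses dispatch to the driver; requests stay
 * queued and are re-issued once nvme_start_queues() runs the hardware
 * queues again.  Freezing, by contrast, drains the queues and blocks new
 * submitters.  A short outage such as a fabrics reconnect therefore only
 * needs the stop/start pair:
 *
 *	nvme_stop_queues(ctrl);		(before tearing down the transport)
 *	...reconnect...
 *	nvme_start_queues(ctrl);	(previously queued requests are re-run)
 */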
4595
4596
4597 void nvme_sync_queues(struct nvme_ctrl *ctrl)
4598 {
4599         struct nvme_ns *ns;
4600
4601         down_read(&ctrl->namespaces_rwsem);
4602         list_for_each_entry(ns, &ctrl->namespaces, list)
4603                 blk_sync_queue(ns->queue);
4604         up_read(&ctrl->namespaces_rwsem);
4605
4606         if (ctrl->admin_q)
4607                 blk_sync_queue(ctrl->admin_q);
4608 }
4609 EXPORT_SYMBOL_GPL(nvme_sync_queues);
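/*
 * Illustrative note (assumption about typical usage): blk_sync_queue()
 * cancels a queue's pending timeout timer and timeout work, so calling
 * nvme_sync_queues() early in a reset ensures no timeout handler is still
 * running against queues that are about to be torn down, e.g. roughly:
 *
 *	nvme_stop_queues(ctrl);
 *	nvme_sync_queues(ctrl);
 *	...free or re-create the hardware queues...
 */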
4610
4611 struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
4612 {
4613         if (file->f_op != &nvme_dev_fops)
4614                 return NULL;
4615         return file->private_data;
4616 }
4617 EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);
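/*
 * Illustrative sketch (assumption): since this symbol is exported in the
 * NVME_TARGET_PASSTHRU namespace, a consumer (in practice the NVMe target
 * passthru code) must import that namespace and can then map an open
 * controller character device back to its nvme_ctrl:
 *
 *	MODULE_IMPORT_NS(NVME_TARGET_PASSTHRU);
 *
 *	file = filp_open("/dev/nvme0", O_RDWR, 0);	(path is illustrative)
 *	if (!IS_ERR(file)) {
 *		struct nvme_ctrl *ctrl = nvme_ctrl_from_file(file);
 *
 *		if (!ctrl)
 *			...not an NVMe controller character device...
 *	}
 */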
4618
4619 /*
4620  * Check that we didn't inadvertently grow the command structure sizes:
4621  */
4622 static inline void _nvme_check_size(void)
4623 {
4624         BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
4625         BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
4626         BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
4627         BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
4628         BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
4629         BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
4630         BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
4631         BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
4632         BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
4633         BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
4634         BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
4635         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
4636         BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
4637         BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
4638         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
4639         BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
4640         BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
4641         BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
4642         BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
4643 }
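/*
 * Editorial note (not from the original source): BUILD_BUG_ON() turns each
 * check above into a compile-time failure, so accidentally growing one of
 * these wire-format structures breaks the build here instead of corrupting
 * commands on the wire.
 */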
4644
4645
4646 static int __init nvme_core_init(void)
4647 {
4648         int result = -ENOMEM;
4649
4650         _nvme_check_size();
4651
4652         nvme_wq = alloc_workqueue("nvme-wq",
4653                         WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
4654         if (!nvme_wq)
4655                 goto out;
4656
4657         nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
4658                         WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
4659         if (!nvme_reset_wq)
4660                 goto destroy_wq;
4661
4662         nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
4663                         WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
4664         if (!nvme_delete_wq)
4665                 goto destroy_reset_wq;
4666
4667         result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
4668         if (result < 0)
4669                 goto destroy_delete_wq;
4670
4671         nvme_class = class_create(THIS_MODULE, "nvme");
4672         if (IS_ERR(nvme_class)) {
4673                 result = PTR_ERR(nvme_class);
4674                 goto unregister_chrdev;
4675         }
4676         nvme_class->dev_uevent = nvme_class_uevent;
4677
4678         nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
4679         if (IS_ERR(nvme_subsys_class)) {
4680                 result = PTR_ERR(nvme_subsys_class);
4681                 goto destroy_class;
4682         }
4683         return 0;
4684
4685 destroy_class:
4686         class_destroy(nvme_class);
4687 unregister_chrdev:
4688         unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
4689 destroy_delete_wq:
4690         destroy_workqueue(nvme_delete_wq);
4691 destroy_reset_wq:
4692         destroy_workqueue(nvme_reset_wq);
4693 destroy_wq:
4694         destroy_workqueue(nvme_wq);
4695 out:
4696         return result;
4697 }
4698
4699 static void __exit nvme_core_exit(void)
4700 {
4701         class_destroy(nvme_subsys_class);
4702         class_destroy(nvme_class);
4703         unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
4704         destroy_workqueue(nvme_delete_wq);
4705         destroy_workqueue(nvme_reset_wq);
4706         destroy_workqueue(nvme_wq);
4707         ida_destroy(&nvme_instance_ida);
4708 }
4709
4710 MODULE_LICENSE("GPL");
4711 MODULE_VERSION("1.0");
4712 module_init(nvme_core_init);
4713 module_exit(nvme_core_exit);