drivers/nvme/host/core.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVM Express device driver
4  * Copyright (c) 2011-2014, Intel Corporation.
5  */
6
7 #include <linux/blkdev.h>
8 #include <linux/blk-mq.h>
9 #include <linux/blk-integrity.h>
10 #include <linux/compat.h>
11 #include <linux/delay.h>
12 #include <linux/errno.h>
13 #include <linux/hdreg.h>
14 #include <linux/kernel.h>
15 #include <linux/module.h>
16 #include <linux/backing-dev.h>
17 #include <linux/slab.h>
18 #include <linux/types.h>
19 #include <linux/pr.h>
20 #include <linux/ptrace.h>
21 #include <linux/nvme_ioctl.h>
22 #include <linux/pm_qos.h>
23 #include <asm/unaligned.h>
24
25 #include "nvme.h"
26 #include "fabrics.h"
27
28 #define CREATE_TRACE_POINTS
29 #include "trace.h"
30
31 #define NVME_MINORS             (1U << MINORBITS)
32
33 unsigned int admin_timeout = 60;
34 module_param(admin_timeout, uint, 0644);
35 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
36 EXPORT_SYMBOL_GPL(admin_timeout);
37
38 unsigned int nvme_io_timeout = 30;
39 module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
40 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
41 EXPORT_SYMBOL_GPL(nvme_io_timeout);
42
43 static unsigned char shutdown_timeout = 5;
44 module_param(shutdown_timeout, byte, 0644);
45 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
46
47 static u8 nvme_max_retries = 5;
48 module_param_named(max_retries, nvme_max_retries, byte, 0644);
49 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
50
51 static unsigned long default_ps_max_latency_us = 100000;
52 module_param(default_ps_max_latency_us, ulong, 0644);
53 MODULE_PARM_DESC(default_ps_max_latency_us,
54                  "max power saving latency for new devices; use PM QOS to change per device");
55
56 static bool force_apst;
57 module_param(force_apst, bool, 0644);
58 MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
59
60 static unsigned long apst_primary_timeout_ms = 100;
61 module_param(apst_primary_timeout_ms, ulong, 0644);
62 MODULE_PARM_DESC(apst_primary_timeout_ms,
63         "primary APST timeout in ms");
64
65 static unsigned long apst_secondary_timeout_ms = 2000;
66 module_param(apst_secondary_timeout_ms, ulong, 0644);
67 MODULE_PARM_DESC(apst_secondary_timeout_ms,
68         "secondary APST timeout in ms");
69
70 static unsigned long apst_primary_latency_tol_us = 15000;
71 module_param(apst_primary_latency_tol_us, ulong, 0644);
72 MODULE_PARM_DESC(apst_primary_latency_tol_us,
73         "primary APST latency tolerance in us");
74
75 static unsigned long apst_secondary_latency_tol_us = 100000;
76 module_param(apst_secondary_latency_tol_us, ulong, 0644);
77 MODULE_PARM_DESC(apst_secondary_latency_tol_us,
78         "secondary APST latency tolerance in us");
79
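/*
 * Illustrative usage (not part of the driver): because the parameters above
 * are declared with mode 0644, they can be set at module load time, e.g.
 * "modprobe nvme_core io_timeout=60", or adjusted at runtime via
 * /sys/module/nvme_core/parameters/<name>.  The values shown are examples
 * only.
 */
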
80 /*
81  * nvme_wq - hosts nvme related works that are not reset or delete
82  * nvme_reset_wq - hosts nvme reset works
83  * nvme_delete_wq - hosts nvme delete works
84  *
85  * nvme_wq will host works such as scan, aen handling, fw activation,
86  * keep-alive, periodic reconnects etc. nvme_reset_wq
87  * runs reset works which also flush works hosted on nvme_wq for
88  * serialization purposes. nvme_delete_wq hosts controller deletion
89  * works which flush reset works for serialization.
90  */
91 struct workqueue_struct *nvme_wq;
92 EXPORT_SYMBOL_GPL(nvme_wq);
93
94 struct workqueue_struct *nvme_reset_wq;
95 EXPORT_SYMBOL_GPL(nvme_reset_wq);
96
97 struct workqueue_struct *nvme_delete_wq;
98 EXPORT_SYMBOL_GPL(nvme_delete_wq);
99
100 static LIST_HEAD(nvme_subsystems);
101 static DEFINE_MUTEX(nvme_subsystems_lock);
102
103 static DEFINE_IDA(nvme_instance_ida);
104 static dev_t nvme_ctrl_base_chr_devt;
105 static struct class *nvme_class;
106 static struct class *nvme_subsys_class;
107
108 static DEFINE_IDA(nvme_ns_chr_minor_ida);
109 static dev_t nvme_ns_chr_devt;
110 static struct class *nvme_ns_chr_class;
111
112 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
113 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
114                                            unsigned nsid);
115 static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
116                                    struct nvme_command *cmd);
117
118 void nvme_queue_scan(struct nvme_ctrl *ctrl)
119 {
120         /*
121          * Only queue new scan work when admin and IO queues are both alive
122          */
123         if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
124                 queue_work(nvme_wq, &ctrl->scan_work);
125 }
126
127 /*
128  * Use this function to proceed with scheduling reset_work for a controller
129  * that had previously been set to the resetting state. This is intended for
130  * code paths that can't be interrupted by other reset attempts. A hot removal
131  * may prevent this from succeeding.
132  */
133 int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
134 {
135         if (ctrl->state != NVME_CTRL_RESETTING)
136                 return -EBUSY;
137         if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
138                 return -EBUSY;
139         return 0;
140 }
141 EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
142
143 static void nvme_failfast_work(struct work_struct *work)
144 {
145         struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
146                         struct nvme_ctrl, failfast_work);
147
148         if (ctrl->state != NVME_CTRL_CONNECTING)
149                 return;
150
151         set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
152         dev_info(ctrl->device, "failfast expired\n");
153         nvme_kick_requeue_lists(ctrl);
154 }
155
156 static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
157 {
158         if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1)
159                 return;
160
161         schedule_delayed_work(&ctrl->failfast_work,
162                               ctrl->opts->fast_io_fail_tmo * HZ);
163 }
164
165 static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
166 {
167         if (!ctrl->opts)
168                 return;
169
170         cancel_delayed_work_sync(&ctrl->failfast_work);
171         clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
172 }
173
174
175 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
176 {
177         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
178                 return -EBUSY;
179         if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
180                 return -EBUSY;
181         return 0;
182 }
183 EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
184
185 int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
186 {
187         int ret;
188
189         ret = nvme_reset_ctrl(ctrl);
190         if (!ret) {
191                 flush_work(&ctrl->reset_work);
192                 if (ctrl->state != NVME_CTRL_LIVE)
193                         ret = -ENETRESET;
194         }
195
196         return ret;
197 }
198
199 static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
200 {
201         dev_info(ctrl->device,
202                  "Removing ctrl: NQN \"%s\"\n", nvmf_ctrl_subsysnqn(ctrl));
203
204         flush_work(&ctrl->reset_work);
205         nvme_stop_ctrl(ctrl);
206         nvme_remove_namespaces(ctrl);
207         ctrl->ops->delete_ctrl(ctrl);
208         nvme_uninit_ctrl(ctrl);
209 }
210
211 static void nvme_delete_ctrl_work(struct work_struct *work)
212 {
213         struct nvme_ctrl *ctrl =
214                 container_of(work, struct nvme_ctrl, delete_work);
215
216         nvme_do_delete_ctrl(ctrl);
217 }
218
219 int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
220 {
221         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
222                 return -EBUSY;
223         if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
224                 return -EBUSY;
225         return 0;
226 }
227 EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
228
229 static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
230 {
231         /*
232          * Keep a reference until nvme_do_delete_ctrl() completes,
233          * since ->delete_ctrl can free the controller.
234          */
235         nvme_get_ctrl(ctrl);
236         if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
237                 nvme_do_delete_ctrl(ctrl);
238         nvme_put_ctrl(ctrl);
239 }
240
241 static blk_status_t nvme_error_status(u16 status)
242 {
243         switch (status & 0x7ff) {
244         case NVME_SC_SUCCESS:
245                 return BLK_STS_OK;
246         case NVME_SC_CAP_EXCEEDED:
247                 return BLK_STS_NOSPC;
248         case NVME_SC_LBA_RANGE:
249         case NVME_SC_CMD_INTERRUPTED:
250         case NVME_SC_NS_NOT_READY:
251                 return BLK_STS_TARGET;
252         case NVME_SC_BAD_ATTRIBUTES:
253         case NVME_SC_ONCS_NOT_SUPPORTED:
254         case NVME_SC_INVALID_OPCODE:
255         case NVME_SC_INVALID_FIELD:
256         case NVME_SC_INVALID_NS:
257                 return BLK_STS_NOTSUPP;
258         case NVME_SC_WRITE_FAULT:
259         case NVME_SC_READ_ERROR:
260         case NVME_SC_UNWRITTEN_BLOCK:
261         case NVME_SC_ACCESS_DENIED:
262         case NVME_SC_READ_ONLY:
263         case NVME_SC_COMPARE_FAILED:
264                 return BLK_STS_MEDIUM;
265         case NVME_SC_GUARD_CHECK:
266         case NVME_SC_APPTAG_CHECK:
267         case NVME_SC_REFTAG_CHECK:
268         case NVME_SC_INVALID_PI:
269                 return BLK_STS_PROTECTION;
270         case NVME_SC_RESERVATION_CONFLICT:
271                 return BLK_STS_NEXUS;
272         case NVME_SC_HOST_PATH_ERROR:
273                 return BLK_STS_TRANSPORT;
274         case NVME_SC_ZONE_TOO_MANY_ACTIVE:
275                 return BLK_STS_ZONE_ACTIVE_RESOURCE;
276         case NVME_SC_ZONE_TOO_MANY_OPEN:
277                 return BLK_STS_ZONE_OPEN_RESOURCE;
278         default:
279                 return BLK_STS_IOERR;
280         }
281 }
282
283 static void nvme_retry_req(struct request *req)
284 {
285         unsigned long delay = 0;
286         u16 crd;
287
288         /* The mask and shift result must be <= 3 */
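        /*
         * Illustrative example (assumed values): a completion status carrying
         * CRD = 2 on a controller that advertised crdt[1] = 5 (in units of
         * 100 ms) results in a 500 ms requeue delay below.
         */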
289         crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
290         if (crd)
291                 delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100;
292
293         nvme_req(req)->retries++;
294         blk_mq_requeue_request(req, false);
295         blk_mq_delay_kick_requeue_list(req->q, delay);
296 }
297
298 static void nvme_log_error(struct request *req)
299 {
300         struct nvme_ns *ns = req->q->queuedata;
301         struct nvme_request *nr = nvme_req(req);
302
303         if (ns) {
304                 pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %llu blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
305                        ns->disk ? ns->disk->disk_name : "?",
306                        nvme_get_opcode_str(nr->cmd->common.opcode),
307                        nr->cmd->common.opcode,
308                        (unsigned long long)nvme_sect_to_lba(ns, blk_rq_pos(req)),
309                        (unsigned long long)blk_rq_bytes(req) >> ns->lba_shift,
310                        nvme_get_error_status_str(nr->status),
311                        nr->status >> 8 & 7,     /* Status Code Type */
312                        nr->status & 0xff,       /* Status Code */
313                        nr->status & NVME_SC_MORE ? "MORE " : "",
314                        nr->status & NVME_SC_DNR  ? "DNR "  : "");
315                 return;
316         }
317
318         pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s\n",
319                            dev_name(nr->ctrl->device),
320                            nvme_get_admin_opcode_str(nr->cmd->common.opcode),
321                            nr->cmd->common.opcode,
322                            nvme_get_error_status_str(nr->status),
323                            nr->status >> 8 & 7, /* Status Code Type */
324                            nr->status & 0xff,   /* Status Code */
325                            nr->status & NVME_SC_MORE ? "MORE " : "",
326                            nr->status & NVME_SC_DNR  ? "DNR "  : "");
327 }
328
329 enum nvme_disposition {
330         COMPLETE,
331         RETRY,
332         FAILOVER,
333 };
334
335 static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
336 {
337         if (likely(nvme_req(req)->status == 0))
338                 return COMPLETE;
339
340         if (blk_noretry_request(req) ||
341             (nvme_req(req)->status & NVME_SC_DNR) ||
342             nvme_req(req)->retries >= nvme_max_retries)
343                 return COMPLETE;
344
345         if (req->cmd_flags & REQ_NVME_MPATH) {
346                 if (nvme_is_path_error(nvme_req(req)->status) ||
347                     blk_queue_dying(req->q))
348                         return FAILOVER;
349         } else {
350                 if (blk_queue_dying(req->q))
351                         return COMPLETE;
352         }
353
354         return RETRY;
355 }
356
357 static inline void nvme_end_req_zoned(struct request *req)
358 {
359         if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
360             req_op(req) == REQ_OP_ZONE_APPEND)
361                 req->__sector = nvme_lba_to_sect(req->q->queuedata,
362                         le64_to_cpu(nvme_req(req)->result.u64));
363 }
364
365 static inline void nvme_end_req(struct request *req)
366 {
367         blk_status_t status = nvme_error_status(nvme_req(req)->status);
368
369         if (unlikely(nvme_req(req)->status != NVME_SC_SUCCESS))
370                 nvme_log_error(req);
371         nvme_end_req_zoned(req);
372         nvme_trace_bio_complete(req);
373         blk_mq_end_request(req, status);
374 }
375
376 void nvme_complete_rq(struct request *req)
377 {
378         trace_nvme_complete_rq(req);
379         nvme_cleanup_cmd(req);
380
381         if (nvme_req(req)->ctrl->kas)
382                 nvme_req(req)->ctrl->comp_seen = true;
383
384         switch (nvme_decide_disposition(req)) {
385         case COMPLETE:
386                 nvme_end_req(req);
387                 return;
388         case RETRY:
389                 nvme_retry_req(req);
390                 return;
391         case FAILOVER:
392                 nvme_failover_req(req);
393                 return;
394         }
395 }
396 EXPORT_SYMBOL_GPL(nvme_complete_rq);
397
398 void nvme_complete_batch_req(struct request *req)
399 {
400         trace_nvme_complete_rq(req);
401         nvme_cleanup_cmd(req);
402         nvme_end_req_zoned(req);
403 }
404 EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
405
406 /*
407  * Called to unwind from ->queue_rq on a failed command submission so that the
408  * multipathing code gets called to potentially failover to another path.
409  * The caller needs to unwind all transport specific resource allocations and
410  * must propagate the return value.
411  */
412 blk_status_t nvme_host_path_error(struct request *req)
413 {
414         nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
415         blk_mq_set_request_complete(req);
416         nvme_complete_rq(req);
417         return BLK_STS_OK;
418 }
419 EXPORT_SYMBOL_GPL(nvme_host_path_error);
420
421 bool nvme_cancel_request(struct request *req, void *data, bool reserved)
422 {
423         dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
424                                 "Cancelling I/O %d", req->tag);
425
426         /* don't abort an already completed request */
427         if (blk_mq_request_completed(req))
428                 return true;
429
430         nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
431         nvme_req(req)->flags |= NVME_REQ_CANCELLED;
432         blk_mq_complete_request(req);
433         return true;
434 }
435 EXPORT_SYMBOL_GPL(nvme_cancel_request);
436
437 void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
438 {
439         if (ctrl->tagset) {
440                 blk_mq_tagset_busy_iter(ctrl->tagset,
441                                 nvme_cancel_request, ctrl);
442                 blk_mq_tagset_wait_completed_request(ctrl->tagset);
443         }
444 }
445 EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
446
447 void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
448 {
449         if (ctrl->admin_tagset) {
450                 blk_mq_tagset_busy_iter(ctrl->admin_tagset,
451                                 nvme_cancel_request, ctrl);
452                 blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
453         }
454 }
455 EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
456
457 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
458                 enum nvme_ctrl_state new_state)
459 {
460         enum nvme_ctrl_state old_state;
461         unsigned long flags;
462         bool changed = false;
463
464         spin_lock_irqsave(&ctrl->lock, flags);
465
466         old_state = ctrl->state;
467         switch (new_state) {
468         case NVME_CTRL_LIVE:
469                 switch (old_state) {
470                 case NVME_CTRL_NEW:
471                 case NVME_CTRL_RESETTING:
472                 case NVME_CTRL_CONNECTING:
473                         changed = true;
474                         fallthrough;
475                 default:
476                         break;
477                 }
478                 break;
479         case NVME_CTRL_RESETTING:
480                 switch (old_state) {
481                 case NVME_CTRL_NEW:
482                 case NVME_CTRL_LIVE:
483                         changed = true;
484                         fallthrough;
485                 default:
486                         break;
487                 }
488                 break;
489         case NVME_CTRL_CONNECTING:
490                 switch (old_state) {
491                 case NVME_CTRL_NEW:
492                 case NVME_CTRL_RESETTING:
493                         changed = true;
494                         fallthrough;
495                 default:
496                         break;
497                 }
498                 break;
499         case NVME_CTRL_DELETING:
500                 switch (old_state) {
501                 case NVME_CTRL_LIVE:
502                 case NVME_CTRL_RESETTING:
503                 case NVME_CTRL_CONNECTING:
504                         changed = true;
505                         fallthrough;
506                 default:
507                         break;
508                 }
509                 break;
510         case NVME_CTRL_DELETING_NOIO:
511                 switch (old_state) {
512                 case NVME_CTRL_DELETING:
513                 case NVME_CTRL_DEAD:
514                         changed = true;
515                         fallthrough;
516                 default:
517                         break;
518                 }
519                 break;
520         case NVME_CTRL_DEAD:
521                 switch (old_state) {
522                 case NVME_CTRL_DELETING:
523                         changed = true;
524                         fallthrough;
525                 default:
526                         break;
527                 }
528                 break;
529         default:
530                 break;
531         }
532
533         if (changed) {
534                 ctrl->state = new_state;
535                 wake_up_all(&ctrl->state_wq);
536         }
537
538         spin_unlock_irqrestore(&ctrl->lock, flags);
539         if (!changed)
540                 return false;
541
542         if (ctrl->state == NVME_CTRL_LIVE) {
543                 if (old_state == NVME_CTRL_CONNECTING)
544                         nvme_stop_failfast_work(ctrl);
545                 nvme_kick_requeue_lists(ctrl);
546         } else if (ctrl->state == NVME_CTRL_CONNECTING &&
547                 old_state == NVME_CTRL_RESETTING) {
548                 nvme_start_failfast_work(ctrl);
549         }
550         return changed;
551 }
552 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
553
554 /*
555  * Returns true for sink states that can't ever transition back to live.
556  */
557 static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
558 {
559         switch (ctrl->state) {
560         case NVME_CTRL_NEW:
561         case NVME_CTRL_LIVE:
562         case NVME_CTRL_RESETTING:
563         case NVME_CTRL_CONNECTING:
564                 return false;
565         case NVME_CTRL_DELETING:
566         case NVME_CTRL_DELETING_NOIO:
567         case NVME_CTRL_DEAD:
568                 return true;
569         default:
570                 WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
571                 return true;
572         }
573 }
574
575 /*
576  * Waits for the controller state to be resetting, or returns false if it is
577  * not possible to ever transition to that state.
578  */
579 bool nvme_wait_reset(struct nvme_ctrl *ctrl)
580 {
581         wait_event(ctrl->state_wq,
582                    nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
583                    nvme_state_terminal(ctrl));
584         return ctrl->state == NVME_CTRL_RESETTING;
585 }
586 EXPORT_SYMBOL_GPL(nvme_wait_reset);
587
588 static void nvme_free_ns_head(struct kref *ref)
589 {
590         struct nvme_ns_head *head =
591                 container_of(ref, struct nvme_ns_head, ref);
592
593         nvme_mpath_remove_disk(head);
594         ida_free(&head->subsys->ns_ida, head->instance);
595         cleanup_srcu_struct(&head->srcu);
596         nvme_put_subsystem(head->subsys);
597         kfree(head);
598 }
599
600 bool nvme_tryget_ns_head(struct nvme_ns_head *head)
601 {
602         return kref_get_unless_zero(&head->ref);
603 }
604
605 void nvme_put_ns_head(struct nvme_ns_head *head)
606 {
607         kref_put(&head->ref, nvme_free_ns_head);
608 }
609
610 static void nvme_free_ns(struct kref *kref)
611 {
612         struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
613
614         put_disk(ns->disk);
615         nvme_put_ns_head(ns->head);
616         nvme_put_ctrl(ns->ctrl);
617         kfree(ns);
618 }
619
620 static inline bool nvme_get_ns(struct nvme_ns *ns)
621 {
622         return kref_get_unless_zero(&ns->kref);
623 }
624
625 void nvme_put_ns(struct nvme_ns *ns)
626 {
627         kref_put(&ns->kref, nvme_free_ns);
628 }
629 EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
630
631 static inline void nvme_clear_nvme_request(struct request *req)
632 {
633         nvme_req(req)->status = 0;
634         nvme_req(req)->retries = 0;
635         nvme_req(req)->flags = 0;
636         req->rq_flags |= RQF_DONTPREP;
637 }
638
639 /* initialize a passthrough request */
640 void nvme_init_request(struct request *req, struct nvme_command *cmd)
641 {
642         if (req->q->queuedata)
643                 req->timeout = NVME_IO_TIMEOUT;
644         else /* no queuedata implies admin queue */
645                 req->timeout = NVME_ADMIN_TIMEOUT;
646
647         /* passthru commands should let the driver set the SGL flags */
648         cmd->common.flags &= ~NVME_CMD_SGL_ALL;
649
650         req->cmd_flags |= REQ_FAILFAST_DRIVER;
651         if (req->mq_hctx->type == HCTX_TYPE_POLL)
652                 req->cmd_flags |= REQ_POLLED;
653         nvme_clear_nvme_request(req);
654         memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd));
655 }
656 EXPORT_SYMBOL_GPL(nvme_init_request);
657
658 /*
659  * For something we're not in a state to send to the device, the default action
660  * is to busy it and retry it after the controller state is recovered.  However,
661  * if the controller is deleting, or if anything is marked for failfast or
662  * nvme multipath, it is immediately failed.
663  *
664  * Note: commands used to initialize the controller will be marked for failfast.
665  * Note: nvme cli/ioctl commands are marked for failfast.
666  */
667 blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
668                 struct request *rq)
669 {
670         if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
671             ctrl->state != NVME_CTRL_DELETING &&
672             ctrl->state != NVME_CTRL_DEAD &&
673             !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
674             !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
675                 return BLK_STS_RESOURCE;
676         return nvme_host_path_error(rq);
677 }
678 EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);
679
680 bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
681                 bool queue_live)
682 {
683         struct nvme_request *req = nvme_req(rq);
684
685         /*
686          * Currently we have a problem sending passthru commands
687          * on the admin_q if the controller is not LIVE, because we can't
688          * make sure that they go out after the admin connect,
689          * controller enable and/or other commands in the initialization
690          * sequence. Until the controller is LIVE, fail with
691          * BLK_STS_RESOURCE so that they will be rescheduled.
692          */
693         if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
694                 return false;
695
696         if (ctrl->ops->flags & NVME_F_FABRICS) {
697                 /*
698                  * Only allow commands on a live queue, except for the connect
699                  * command, which is required to set the queue live in the
700                  * appropriate states.
701                  */
702                 switch (ctrl->state) {
703                 case NVME_CTRL_CONNECTING:
704                         if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
705                             req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
706                                 return true;
707                         break;
708                 default:
709                         break;
710                 case NVME_CTRL_DEAD:
711                         return false;
712                 }
713         }
714
715         return queue_live;
716 }
717 EXPORT_SYMBOL_GPL(__nvme_check_ready);
718
719 static inline void nvme_setup_flush(struct nvme_ns *ns,
720                 struct nvme_command *cmnd)
721 {
722         memset(cmnd, 0, sizeof(*cmnd));
723         cmnd->common.opcode = nvme_cmd_flush;
724         cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
725 }
726
727 static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
728                 struct nvme_command *cmnd)
729 {
730         unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
731         struct nvme_dsm_range *range;
732         struct bio *bio;
733
734         /*
735          * Some devices do not consider the DSM 'Number of Ranges' field when
736          * determining how much data to DMA. Always allocate memory for maximum
737          * number of segments to prevent device reading beyond end of buffer.
738          */
739         static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;
740
741         range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
742         if (!range) {
743                 /*
744                  * If we fail to allocate our range, fall back to the controller
745                  * discard page. If that's also busy, it's safe to return
746                  * busy, as we know we can make progress once that's freed.
747                  */
748                 if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
749                         return BLK_STS_RESOURCE;
750
751                 range = page_address(ns->ctrl->discard_page);
752         }
753
754         __rq_for_each_bio(bio, req) {
755                 u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
756                 u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
757
758                 if (n < segments) {
759                         range[n].cattr = cpu_to_le32(0);
760                         range[n].nlb = cpu_to_le32(nlb);
761                         range[n].slba = cpu_to_le64(slba);
762                 }
763                 n++;
764         }
765
766         if (WARN_ON_ONCE(n != segments)) {
767                 if (virt_to_page(range) == ns->ctrl->discard_page)
768                         clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
769                 else
770                         kfree(range);
771                 return BLK_STS_IOERR;
772         }
773
774         memset(cmnd, 0, sizeof(*cmnd));
775         cmnd->dsm.opcode = nvme_cmd_dsm;
776         cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
777         cmnd->dsm.nr = cpu_to_le32(segments - 1);
778         cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
779
780         req->special_vec.bv_page = virt_to_page(range);
781         req->special_vec.bv_offset = offset_in_page(range);
782         req->special_vec.bv_len = alloc_size;
783         req->rq_flags |= RQF_SPECIAL_PAYLOAD;
784
785         return BLK_STS_OK;
786 }
787
788 static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
789                 struct request *req, struct nvme_command *cmnd)
790 {
791         memset(cmnd, 0, sizeof(*cmnd));
792
793         if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
794                 return nvme_setup_discard(ns, req, cmnd);
795
796         cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
797         cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
798         cmnd->write_zeroes.slba =
799                 cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
800         cmnd->write_zeroes.length =
801                 cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
802
803         if (nvme_ns_has_pi(ns)) {
804                 cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT);
805
806                 switch (ns->pi_type) {
807                 case NVME_NS_DPS_PI_TYPE1:
808                 case NVME_NS_DPS_PI_TYPE2:
809                         cmnd->write_zeroes.reftag =
810                                 cpu_to_le32(t10_pi_ref_tag(req));
811                         break;
812                 }
813         }
814
815         return BLK_STS_OK;
816 }
817
818 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
819                 struct request *req, struct nvme_command *cmnd,
820                 enum nvme_opcode op)
821 {
822         u16 control = 0;
823         u32 dsmgmt = 0;
824
825         if (req->cmd_flags & REQ_FUA)
826                 control |= NVME_RW_FUA;
827         if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
828                 control |= NVME_RW_LR;
829
830         if (req->cmd_flags & REQ_RAHEAD)
831                 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
832
833         cmnd->rw.opcode = op;
834         cmnd->rw.flags = 0;
835         cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
836         cmnd->rw.rsvd2 = 0;
837         cmnd->rw.metadata = 0;
838         cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
839         cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
840         cmnd->rw.reftag = 0;
841         cmnd->rw.apptag = 0;
842         cmnd->rw.appmask = 0;
843
844         if (ns->ms) {
845                 /*
846                  * If formatted with metadata, the block layer always provides a
847                  * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled.  Else
848                  * we enable the PRACT bit for protection information or set the
849                  * namespace capacity to zero to prevent any I/O.
850                  */
851                 if (!blk_integrity_rq(req)) {
852                         if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
853                                 return BLK_STS_NOTSUPP;
854                         control |= NVME_RW_PRINFO_PRACT;
855                 }
856
857                 switch (ns->pi_type) {
858                 case NVME_NS_DPS_PI_TYPE3:
859                         control |= NVME_RW_PRINFO_PRCHK_GUARD;
860                         break;
861                 case NVME_NS_DPS_PI_TYPE1:
862                 case NVME_NS_DPS_PI_TYPE2:
863                         control |= NVME_RW_PRINFO_PRCHK_GUARD |
864                                         NVME_RW_PRINFO_PRCHK_REF;
865                         if (op == nvme_cmd_zone_append)
866                                 control |= NVME_RW_APPEND_PIREMAP;
867                         cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
868                         break;
869                 }
870         }
871
872         cmnd->rw.control = cpu_to_le16(control);
873         cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
874         return 0;
875 }
876
877 void nvme_cleanup_cmd(struct request *req)
878 {
879         if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
880                 struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
881
882                 if (req->special_vec.bv_page == ctrl->discard_page)
883                         clear_bit_unlock(0, &ctrl->discard_page_busy);
884                 else
885                         kfree(bvec_virt(&req->special_vec));
886         }
887 }
888 EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
889
890 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
891 {
892         struct nvme_command *cmd = nvme_req(req)->cmd;
893         blk_status_t ret = BLK_STS_OK;
894
895         if (!(req->rq_flags & RQF_DONTPREP))
896                 nvme_clear_nvme_request(req);
897
898         switch (req_op(req)) {
899         case REQ_OP_DRV_IN:
900         case REQ_OP_DRV_OUT:
901                 /* these are setup prior to execution in nvme_init_request() */
902                 break;
903         case REQ_OP_FLUSH:
904                 nvme_setup_flush(ns, cmd);
905                 break;
906         case REQ_OP_ZONE_RESET_ALL:
907         case REQ_OP_ZONE_RESET:
908                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
909                 break;
910         case REQ_OP_ZONE_OPEN:
911                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
912                 break;
913         case REQ_OP_ZONE_CLOSE:
914                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
915                 break;
916         case REQ_OP_ZONE_FINISH:
917                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
918                 break;
919         case REQ_OP_WRITE_ZEROES:
920                 ret = nvme_setup_write_zeroes(ns, req, cmd);
921                 break;
922         case REQ_OP_DISCARD:
923                 ret = nvme_setup_discard(ns, req, cmd);
924                 break;
925         case REQ_OP_READ:
926                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
927                 break;
928         case REQ_OP_WRITE:
929                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
930                 break;
931         case REQ_OP_ZONE_APPEND:
932                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
933                 break;
934         default:
935                 WARN_ON_ONCE(1);
936                 return BLK_STS_IOERR;
937         }
938
939         cmd->common.command_id = nvme_cid(req);
940         trace_nvme_setup_cmd(req, cmd);
941         return ret;
942 }
943 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
944
945 /*
946  * Return values:
947  * 0:  success
948  * >0: nvme controller's cqe status response
949  * <0: kernel error in lieu of controller response
950  */
951 static int nvme_execute_rq(struct request *rq, bool at_head)
952 {
953         blk_status_t status;
954
955         status = blk_execute_rq(rq, at_head);
956         if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
957                 return -EINTR;
958         if (nvme_req(rq)->status)
959                 return nvme_req(rq)->status;
960         return blk_status_to_errno(status);
961 }
962
963 /*
964  * Returns 0 on success.  If the result is negative, it's a Linux error code;
965  * if the result is positive, it's an NVM Express status code
966  */
967 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
968                 union nvme_result *result, void *buffer, unsigned bufflen,
969                 unsigned timeout, int qid, int at_head,
970                 blk_mq_req_flags_t flags)
971 {
972         struct request *req;
973         int ret;
974
975         if (qid == NVME_QID_ANY)
976                 req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
977         else
978                 req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
979                                                 qid ? qid - 1 : 0);
980
981         if (IS_ERR(req))
982                 return PTR_ERR(req);
983         nvme_init_request(req, cmd);
984
985         if (timeout)
986                 req->timeout = timeout;
987
988         if (buffer && bufflen) {
989                 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
990                 if (ret)
991                         goto out;
992         }
993
994         ret = nvme_execute_rq(req, at_head);
995         if (result && ret >= 0)
996                 *result = nvme_req(req)->result;
997  out:
998         blk_mq_free_request(req);
999         return ret;
1000 }
1001 EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
1002
1003 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
1004                 void *buffer, unsigned bufflen)
1005 {
1006         return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
1007                         NVME_QID_ANY, 0, 0);
1008 }
1009 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
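
/*
 * Minimal caller sketch (illustrative only, compiled out): it shows how the
 * tri-state return convention documented above is typically consumed, and
 * mirrors nvme_identify_ctrl() further down in this file.  The helper name
 * and its mapping of positive NVMe status codes to -EIO are hypothetical.
 */
#if 0
static int example_identify_ctrl(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
        struct nvme_command c = { };
        int ret;

        c.identify.opcode = nvme_admin_identify;
        c.identify.cns = NVME_ID_CNS_CTRL;

        ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
        if (ret < 0)            /* kernel error, e.g. -EINTR or -ENOMEM */
                return ret;
        if (ret > 0)            /* NVMe status code returned by the controller */
                return -EIO;
        return 0;               /* success, *id is populated */
}
#endif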
1010
1011 static u32 nvme_known_admin_effects(u8 opcode)
1012 {
1013         switch (opcode) {
1014         case nvme_admin_format_nvm:
1015                 return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC |
1016                         NVME_CMD_EFFECTS_CSE_MASK;
1017         case nvme_admin_sanitize_nvm:
1018                 return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK;
1019         default:
1020                 break;
1021         }
1022         return 0;
1023 }
1024
1025 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
1026 {
1027         u32 effects = 0;
1028
1029         if (ns) {
1030                 if (ns->head->effects)
1031                         effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
1032                 if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
1033                         dev_warn_once(ctrl->device,
1034                                 "IO command:%02x has unhandled effects:%08x\n",
1035                                 opcode, effects);
1036                 return 0;
1037         }
1038
1039         if (ctrl->effects)
1040                 effects = le32_to_cpu(ctrl->effects->acs[opcode]);
1041         effects |= nvme_known_admin_effects(opcode);
1042
1043         return effects;
1044 }
1045 EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
1046
1047 static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1048                                u8 opcode)
1049 {
1050         u32 effects = nvme_command_effects(ctrl, ns, opcode);
1051
1052         /*
1053          * For simplicity, IO to all namespaces is quiesced even if the command
1054          * effects say only one namespace is affected.
1055          */
1056         if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1057                 mutex_lock(&ctrl->scan_lock);
1058                 mutex_lock(&ctrl->subsys->lock);
1059                 nvme_mpath_start_freeze(ctrl->subsys);
1060                 nvme_mpath_wait_freeze(ctrl->subsys);
1061                 nvme_start_freeze(ctrl);
1062                 nvme_wait_freeze(ctrl);
1063         }
1064         return effects;
1065 }
1066
1067 static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
1068                               struct nvme_command *cmd, int status)
1069 {
1070         if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1071                 nvme_unfreeze(ctrl);
1072                 nvme_mpath_unfreeze(ctrl->subsys);
1073                 mutex_unlock(&ctrl->subsys->lock);
1074                 nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
1075                 mutex_unlock(&ctrl->scan_lock);
1076         }
1077         if (effects & NVME_CMD_EFFECTS_CCC)
1078                 nvme_init_ctrl_finish(ctrl);
1079         if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
1080                 nvme_queue_scan(ctrl);
1081                 flush_work(&ctrl->scan_work);
1082         }
1083
1084         switch (cmd->common.opcode) {
1085         case nvme_admin_set_features:
1086                 switch (le32_to_cpu(cmd->common.cdw10) & 0xFF) {
1087                 case NVME_FEAT_KATO:
1088                         /*
1089                          * The keep-alive command interval on the host should be
1090                          * updated when KATO is modified by a Set Features
1091                          * command.
1092                          */
1093                         if (!status)
1094                                 nvme_update_keep_alive(ctrl, cmd);
1095                         break;
1096                 default:
1097                         break;
1098                 }
1099                 break;
1100         default:
1101                 break;
1102         }
1103 }
1104
1105 int nvme_execute_passthru_rq(struct request *rq)
1106 {
1107         struct nvme_command *cmd = nvme_req(rq)->cmd;
1108         struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
1109         struct nvme_ns *ns = rq->q->queuedata;
1110         u32 effects;
1111         int  ret;
1112
1113         effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
1114         ret = nvme_execute_rq(rq, false);
1115         if (effects) /* nothing to be done for zero cmd effects */
1116                 nvme_passthru_end(ctrl, effects, cmd, ret);
1117
1118         return ret;
1119 }
1120 EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
1121
1122 /*
1123  * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1:
1124  * 
1125  *   The host should send Keep Alive commands at half of the Keep Alive Timeout
1126  *   accounting for transport roundtrip times [..].
1127  */
1128 static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
1129 {
1130         queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ / 2);
1131 }
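
/*
 * Worked example (illustrative): with kato = 120 seconds the delayed work is
 * queued 120 * HZ / 2 jiffies out, i.e. the Keep Alive command is sent every
 * 60 seconds, matching the "half of the Keep Alive Timeout" recommendation
 * quoted above.
 */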
1132
1133 static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
1134 {
1135         struct nvme_ctrl *ctrl = rq->end_io_data;
1136         unsigned long flags;
1137         bool startka = false;
1138
1139         blk_mq_free_request(rq);
1140
1141         if (status) {
1142                 dev_err(ctrl->device,
1143                         "failed nvme_keep_alive_end_io error=%d\n",
1144                                 status);
1145                 return;
1146         }
1147
1148         ctrl->comp_seen = false;
1149         spin_lock_irqsave(&ctrl->lock, flags);
1150         if (ctrl->state == NVME_CTRL_LIVE ||
1151             ctrl->state == NVME_CTRL_CONNECTING)
1152                 startka = true;
1153         spin_unlock_irqrestore(&ctrl->lock, flags);
1154         if (startka)
1155                 nvme_queue_keep_alive_work(ctrl);
1156 }
1157
1158 static void nvme_keep_alive_work(struct work_struct *work)
1159 {
1160         struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
1161                         struct nvme_ctrl, ka_work);
1162         bool comp_seen = ctrl->comp_seen;
1163         struct request *rq;
1164
1165         if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
1166                 dev_dbg(ctrl->device,
1167                         "reschedule traffic based keep-alive timer\n");
1168                 ctrl->comp_seen = false;
1169                 nvme_queue_keep_alive_work(ctrl);
1170                 return;
1171         }
1172
1173         rq = blk_mq_alloc_request(ctrl->admin_q, nvme_req_op(&ctrl->ka_cmd),
1174                                   BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
1175         if (IS_ERR(rq)) {
1176                 /* allocation failure, reset the controller */
1177                 dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq));
1178                 nvme_reset_ctrl(ctrl);
1179                 return;
1180         }
1181         nvme_init_request(rq, &ctrl->ka_cmd);
1182
1183         rq->timeout = ctrl->kato * HZ;
1184         rq->end_io_data = ctrl;
1185         blk_execute_rq_nowait(rq, false, nvme_keep_alive_end_io);
1186 }
1187
1188 static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
1189 {
1190         if (unlikely(ctrl->kato == 0))
1191                 return;
1192
1193         nvme_queue_keep_alive_work(ctrl);
1194 }
1195
1196 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
1197 {
1198         if (unlikely(ctrl->kato == 0))
1199                 return;
1200
1201         cancel_delayed_work_sync(&ctrl->ka_work);
1202 }
1203 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
1204
1205 static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
1206                                    struct nvme_command *cmd)
1207 {
1208         unsigned int new_kato =
1209                 DIV_ROUND_UP(le32_to_cpu(cmd->common.cdw11), 1000);
1210
1211         dev_info(ctrl->device,
1212                  "keep alive interval updated from %u ms to %u ms\n",
1213                  ctrl->kato * 1000 / 2, new_kato * 1000 / 2);
1214
1215         nvme_stop_keep_alive(ctrl);
1216         ctrl->kato = new_kato;
1217         nvme_start_keep_alive(ctrl);
1218 }
1219
1220 /*
1221  * In NVMe 1.0 the CNS field was just a binary controller or namespace
1222  * flag, thus sending any new CNS opcodes has a big chance of not working.
1223  * Qemu unfortunately had that bug after reporting a 1.1 version compliance
1224  * (but not for any later version).
1225  */
1226 static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl)
1227 {
1228         if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)
1229                 return ctrl->vs < NVME_VS(1, 2, 0);
1230         return ctrl->vs < NVME_VS(1, 1, 0);
1231 }
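
/*
 * Illustrative example: NVME_VS(major, minor, tertiary) packs the version as
 * (major << 16) | (minor << 8) | tertiary, so a controller reporting 1.1.0
 * has ctrl->vs == 0x10100.  With the IDENTIFY_CNS quirk set that is below
 * NVME_VS(1, 2, 0) == 0x10200, so the controller is treated as limited to
 * the legacy CNS values.
 */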
1232
1233 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
1234 {
1235         struct nvme_command c = { };
1236         int error;
1237
1238         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1239         c.identify.opcode = nvme_admin_identify;
1240         c.identify.cns = NVME_ID_CNS_CTRL;
1241
1242         *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
1243         if (!*id)
1244                 return -ENOMEM;
1245
1246         error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
1247                         sizeof(struct nvme_id_ctrl));
1248         if (error)
1249                 kfree(*id);
1250         return error;
1251 }
1252
1253 static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
1254                 struct nvme_ns_id_desc *cur, bool *csi_seen)
1255 {
1256         const char *warn_str = "ctrl returned bogus length:";
1257         void *data = cur;
1258
1259         switch (cur->nidt) {
1260         case NVME_NIDT_EUI64:
1261                 if (cur->nidl != NVME_NIDT_EUI64_LEN) {
1262                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
1263                                  warn_str, cur->nidl);
1264                         return -1;
1265                 }
1266                 memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
1267                 return NVME_NIDT_EUI64_LEN;
1268         case NVME_NIDT_NGUID:
1269                 if (cur->nidl != NVME_NIDT_NGUID_LEN) {
1270                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
1271                                  warn_str, cur->nidl);
1272                         return -1;
1273                 }
1274                 memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
1275                 return NVME_NIDT_NGUID_LEN;
1276         case NVME_NIDT_UUID:
1277                 if (cur->nidl != NVME_NIDT_UUID_LEN) {
1278                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
1279                                  warn_str, cur->nidl);
1280                         return -1;
1281                 }
1282                 uuid_copy(&ids->uuid, data + sizeof(*cur));
1283                 return NVME_NIDT_UUID_LEN;
1284         case NVME_NIDT_CSI:
1285                 if (cur->nidl != NVME_NIDT_CSI_LEN) {
1286                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
1287                                  warn_str, cur->nidl);
1288                         return -1;
1289                 }
1290                 memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
1291                 *csi_seen = true;
1292                 return NVME_NIDT_CSI_LEN;
1293         default:
1294                 /* Skip unknown types */
1295                 return cur->nidl;
1296         }
1297 }
1298
1299 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
1300                 struct nvme_ns_ids *ids)
1301 {
1302         struct nvme_command c = { };
1303         bool csi_seen = false;
1304         int status, pos, len;
1305         void *data;
1306
1307         if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
1308                 return 0;
1309         if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
1310                 return 0;
1311
1312         c.identify.opcode = nvme_admin_identify;
1313         c.identify.nsid = cpu_to_le32(nsid);
1314         c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
1315
1316         data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
1317         if (!data)
1318                 return -ENOMEM;
1319
1320         status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
1321                                       NVME_IDENTIFY_DATA_SIZE);
1322         if (status) {
1323                 dev_warn(ctrl->device,
1324                         "Identify Descriptors failed (nsid=%u, status=0x%x)\n",
1325                         nsid, status);
1326                 goto free_data;
1327         }
1328
1329         for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
1330                 struct nvme_ns_id_desc *cur = data + pos;
1331
1332                 if (cur->nidl == 0)
1333                         break;
1334
1335                 len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
1336                 if (len < 0)
1337                         break;
1338
1339                 len += sizeof(*cur);
1340         }
1341
1342         if (nvme_multi_css(ctrl) && !csi_seen) {
1343                 dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
1344                          nsid);
1345                 status = -EINVAL;
1346         }
1347
1348 free_data:
1349         kfree(data);
1350         return status;
1351 }
1352
1353 static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
1354                         struct nvme_ns_ids *ids, struct nvme_id_ns **id)
1355 {
1356         struct nvme_command c = { };
1357         int error;
1358
1359         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1360         c.identify.opcode = nvme_admin_identify;
1361         c.identify.nsid = cpu_to_le32(nsid);
1362         c.identify.cns = NVME_ID_CNS_NS;
1363
1364         *id = kmalloc(sizeof(**id), GFP_KERNEL);
1365         if (!*id)
1366                 return -ENOMEM;
1367
1368         error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
1369         if (error) {
1370                 dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
1371                 goto out_free_id;
1372         }
1373
1374         error = NVME_SC_INVALID_NS | NVME_SC_DNR;
1375         if ((*id)->ncap == 0) /* namespace not allocated or attached */
1376                 goto out_free_id;
1377
1378         if (ctrl->vs >= NVME_VS(1, 1, 0) &&
1379             !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
1380                 memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64));
1381         if (ctrl->vs >= NVME_VS(1, 2, 0) &&
1382             !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
1383                 memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid));
1384
1385         return 0;
1386
1387 out_free_id:
1388         kfree(*id);
1389         return error;
1390 }
1391
1392 static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
1393                 unsigned int dword11, void *buffer, size_t buflen, u32 *result)
1394 {
1395         union nvme_result res = { 0 };
1396         struct nvme_command c = { };
1397         int ret;
1398
1399         c.features.opcode = op;
1400         c.features.fid = cpu_to_le32(fid);
1401         c.features.dword11 = cpu_to_le32(dword11);
1402
1403         ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
1404                         buffer, buflen, 0, NVME_QID_ANY, 0, 0);
1405         if (ret >= 0 && result)
1406                 *result = le32_to_cpu(res.u32);
1407         return ret;
1408 }
1409
1410 int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
1411                       unsigned int dword11, void *buffer, size_t buflen,
1412                       u32 *result)
1413 {
1414         return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
1415                              buflen, result);
1416 }
1417 EXPORT_SYMBOL_GPL(nvme_set_features);
1418
1419 int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
1420                       unsigned int dword11, void *buffer, size_t buflen,
1421                       u32 *result)
1422 {
1423         return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
1424                              buflen, result);
1425 }
1426 EXPORT_SYMBOL_GPL(nvme_get_features);
1427
1428 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
1429 {
1430         u32 q_count = (*count - 1) | ((*count - 1) << 16);
1431         u32 result;
1432         int status, nr_io_queues;
1433
1434         status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
1435                         &result);
1436         if (status < 0)
1437                 return status;
1438
1439         /*
1440          * Degraded controllers might return an error when setting the queue
1441          * count.  We still want to be able to bring them online and offer
1442          * access to the admin queue, as that might be the only way to fix them up.
1443          */
1444         if (status > 0) {
1445                 dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
1446                 *count = 0;
1447         } else {
1448                 nr_io_queues = min(result & 0xffff, result >> 16) + 1;
1449                 *count = min(*count, nr_io_queues);
1450         }
1451
1452         return 0;
1453 }
1454 EXPORT_SYMBOL_GPL(nvme_set_queue_count);
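
/*
 * Worked example (illustrative numbers): a caller requesting *count = 8 sends
 * q_count = 7 | (7 << 16) = 0x00070007 (zero-based SQ/CQ counts).  If the
 * controller answers with result = 0x00030007, i.e. 8 submission queues but
 * only 4 completion queues allocated, then nr_io_queues = min(7, 3) + 1 = 4
 * and *count is clamped to 4.
 */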
1455
1456 #define NVME_AEN_SUPPORTED \
1457         (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
1458          NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
1459
1460 static void nvme_enable_aen(struct nvme_ctrl *ctrl)
1461 {
1462         u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED;
1463         int status;
1464
1465         if (!supported_aens)
1466                 return;
1467
1468         status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens,
1469                         NULL, 0, &result);
1470         if (status)
1471                 dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
1472                          supported_aens);
1473
1474         queue_work(nvme_wq, &ctrl->async_event_work);
1475 }
1476
1477 static int nvme_ns_open(struct nvme_ns *ns)
1478 {
1479
1480         /* should never be called due to GENHD_FL_HIDDEN */
1481         if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head)))
1482                 goto fail;
1483         if (!nvme_get_ns(ns))
1484                 goto fail;
1485         if (!try_module_get(ns->ctrl->ops->module))
1486                 goto fail_put_ns;
1487
1488         return 0;
1489
1490 fail_put_ns:
1491         nvme_put_ns(ns);
1492 fail:
1493         return -ENXIO;
1494 }
1495
1496 static void nvme_ns_release(struct nvme_ns *ns)
1497 {
1498
1499         module_put(ns->ctrl->ops->module);
1500         nvme_put_ns(ns);
1501 }
1502
1503 static int nvme_open(struct block_device *bdev, fmode_t mode)
1504 {
1505         return nvme_ns_open(bdev->bd_disk->private_data);
1506 }
1507
1508 static void nvme_release(struct gendisk *disk, fmode_t mode)
1509 {
1510         nvme_ns_release(disk->private_data);
1511 }
1512
1513 int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1514 {
1515         /* some standard values */
1516         geo->heads = 1 << 6;
1517         geo->sectors = 1 << 5;
1518         geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
1519         return 0;
1520 }
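/*
 * The fake geometry above is 64 heads * 32 sectors/track = 2048 sectors per
 * cylinder, hence the ">> 11".  E.g. a 2097152-sector disk (1 GiB with
 * 512-byte sectors) reports 1024 cylinders.
 */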
1521
1522 #ifdef CONFIG_BLK_DEV_INTEGRITY
1523 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
1524                                 u32 max_integrity_segments)
1525 {
1526         struct blk_integrity integrity = { };
1527
1528         switch (pi_type) {
1529         case NVME_NS_DPS_PI_TYPE3:
1530                 integrity.profile = &t10_pi_type3_crc;
1531                 integrity.tag_size = sizeof(u16) + sizeof(u32);
1532                 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1533                 break;
1534         case NVME_NS_DPS_PI_TYPE1:
1535         case NVME_NS_DPS_PI_TYPE2:
1536                 integrity.profile = &t10_pi_type1_crc;
1537                 integrity.tag_size = sizeof(u16);
1538                 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1539                 break;
1540         default:
1541                 integrity.profile = NULL;
1542                 break;
1543         }
1544         integrity.tuple_size = ms;
1545         blk_integrity_register(disk, &integrity);
1546         blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
1547 }
1548 #else
1549 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
1550                                 u32 max_integrity_segments)
1551 {
1552 }
1553 #endif /* CONFIG_BLK_DEV_INTEGRITY */
1554
1555 static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
1556 {
1557         struct nvme_ctrl *ctrl = ns->ctrl;
1558         struct request_queue *queue = disk->queue;
1559         u32 size = queue_logical_block_size(queue);
1560
1561         if (ctrl->max_discard_sectors == 0) {
1562                 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue);
1563                 return;
1564         }
1565
1566         BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
1567                         NVME_DSM_MAX_RANGES);
1568
1569         queue->limits.discard_alignment = 0;
1570         queue->limits.discard_granularity = size;
1571
1572         /* If discard is already enabled, don't reset queue limits */
1573         if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue))
1574                 return;
1575
1576         blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
1577         blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
1578
1579         if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
1580                 blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
1581 }
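/*
 * Sizing note (assuming the usual 16-byte struct nvme_dsm_range and
 * NVME_DSM_MAX_RANGES == 256): the BUILD_BUG_ON above checks that one page
 * can hold a full DSM range list, i.e. 256 * 16 = 4096 bytes on 4K pages.
 */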
1582
1583 static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
1584 {
1585         return uuid_equal(&a->uuid, &b->uuid) &&
1586                 memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
1587                 memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
1588                 a->csi == b->csi;
1589 }
1590
1591 static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
1592 {
1593         struct nvme_ctrl *ctrl = ns->ctrl;
1594
1595         /*
1596          * The PI implementation requires the metadata size to be equal to the
1597          * t10 pi tuple size.
1598          */
1599         ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
1600         if (ns->ms == sizeof(struct t10_pi_tuple))
1601                 ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
1602         else
1603                 ns->pi_type = 0;
1604
1605         ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
1606         if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
1607                 return;
1608
1609         if (ctrl->ops->flags & NVME_F_FABRICS) {
1610                 /*
1611                  * The NVMe over Fabrics specification only supports metadata as
1612                  * part of the extended data LBA.  We rely on HCA/HBA support to
1613                  * remap the separate metadata buffer from the block layer.
1614                  */
1615                 if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
1616                         return;
1617
1618                 ns->features |= NVME_NS_EXT_LBAS;
1619
1620                 /*
1621                  * The current fabrics transport drivers support namespace
1622                  * metadata formats only if nvme_ns_has_pi() returns true.
1623                  * Suppress support for all other formats so the namespace will
1624                  * have a 0 capacity and not be usable through the block stack.
1625                  *
1626                  * Note, this check will need to be modified if any drivers
1627                  * gain the ability to use other metadata formats.
1628                  */
1629                 if (ctrl->max_integrity_segments && nvme_ns_has_pi(ns))
1630                         ns->features |= NVME_NS_METADATA_SUPPORTED;
1631         } else {
1632                 /*
1633                  * For PCIe controllers, we can't easily remap the separate
1634                  * metadata buffer from the block layer and thus require a
1635                  * separate metadata buffer for block layer metadata/PI support.
1636                  * We allow extended LBAs for the passthrough interface, though.
1637                  */
1638                 if (id->flbas & NVME_NS_FLBAS_META_EXT)
1639                         ns->features |= NVME_NS_EXT_LBAS;
1640                 else
1641                         ns->features |= NVME_NS_METADATA_SUPPORTED;
1642         }
1643 }
1644
1645 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
1646                 struct request_queue *q)
1647 {
1648         bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;
1649
1650         if (ctrl->max_hw_sectors) {
1651                 u32 max_segments =
1652                         (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
1653
1654                 max_segments = min_not_zero(max_segments, ctrl->max_segments);
1655                 blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
1656                 blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
1657         }
1658         blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
1659         blk_queue_dma_alignment(q, 7);
1660         blk_queue_write_cache(q, vwc, vwc);
1661 }
1662
1663 static void nvme_update_disk_info(struct gendisk *disk,
1664                 struct nvme_ns *ns, struct nvme_id_ns *id)
1665 {
1666         sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
1667         unsigned short bs = 1 << ns->lba_shift;
1668         u32 atomic_bs, phys_bs, io_opt = 0;
1669
1670         /*
1671          * The block layer can't support LBA sizes larger than the page size
1672          * yet, so catch this early and don't allow block I/O.
1673          */
1674         if (ns->lba_shift > PAGE_SHIFT) {
1675                 capacity = 0;
1676                 bs = (1 << 9);
1677         }
1678
1679         blk_integrity_unregister(disk);
1680
1681         atomic_bs = phys_bs = bs;
1682         if (id->nabo == 0) {
1683                 /*
1684                  * Bit 1 indicates whether NAWUPF is defined for this namespace
1685                  * and whether it should be used instead of AWUPF. If NAWUPF ==
1686                  * 0 then AWUPF must be used instead.
1687                  */
1688                 if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
1689                         atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
1690                 else
1691                         atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
1692         }
1693
1694         if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
1695                 /* NPWG = Namespace Preferred Write Granularity */
1696                 phys_bs = bs * (1 + le16_to_cpu(id->npwg));
1697                 /* NOWS = Namespace Optimal Write Size */
1698                 io_opt = bs * (1 + le16_to_cpu(id->nows));
1699         }
1700
1701         blk_queue_logical_block_size(disk->queue, bs);
1702         /*
1703          * Linux filesystems assume writing a single physical block is
1704          * an atomic operation. Hence limit the physical block size to the
1705          * value of the Atomic Write Unit Power Fail parameter.
1706          */
1707         blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
1708         blk_queue_io_min(disk->queue, phys_bs);
1709         blk_queue_io_opt(disk->queue, io_opt);
1710
1711         /*
1712          * Register a metadata profile for PI, or the plain non-integrity NVMe
1713          * metadata masquerading as Type 0 if supported, otherwise reject block
1714          * I/O to namespaces with metadata except when the namespace supports
1715          * PI, as it can strip/insert in that case.
1716          */
1717         if (ns->ms) {
1718                 if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
1719                     (ns->features & NVME_NS_METADATA_SUPPORTED))
1720                         nvme_init_integrity(disk, ns->ms, ns->pi_type,
1721                                             ns->ctrl->max_integrity_segments);
1722                 else if (!nvme_ns_has_pi(ns))
1723                         capacity = 0;
1724         }
1725
1726         set_capacity_and_notify(disk, capacity);
1727
1728         nvme_config_discard(disk, ns);
1729         blk_queue_max_write_zeroes_sectors(disk->queue,
1730                                            ns->ctrl->max_zeroes_sectors);
1731
1732         set_disk_ro(disk, (id->nsattr & NVME_NS_ATTR_RO) ||
1733                 test_bit(NVME_NS_FORCE_RO, &ns->flags));
1734 }
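/*
 * Example (illustrative numbers): with 512-byte LBAs (bs = 512), NAWUPF = 7
 * gives atomic_bs = 8 * 512 = 4096; NPWG = 7 and NOWS = 31 give
 * phys_bs = 4096 and io_opt = 16384.  The exported physical block size is
 * then min(phys_bs, atomic_bs) = 4096.
 */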
1735
1736 static inline bool nvme_first_scan(struct gendisk *disk)
1737 {
1738         /* nvme_alloc_ns() scans the disk prior to adding it */
1739         return !disk_live(disk);
1740 }
1741
1742 static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
1743 {
1744         struct nvme_ctrl *ctrl = ns->ctrl;
1745         u32 iob;
1746
1747         if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
1748             is_power_of_2(ctrl->max_hw_sectors))
1749                 iob = ctrl->max_hw_sectors;
1750         else
1751                 iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
1752
1753         if (!iob)
1754                 return;
1755
1756         if (!is_power_of_2(iob)) {
1757                 if (nvme_first_scan(ns->disk))
1758                         pr_warn("%s: ignoring unaligned IO boundary:%u\n",
1759                                 ns->disk->disk_name, iob);
1760                 return;
1761         }
1762
1763         if (blk_queue_is_zoned(ns->disk->queue)) {
1764                 if (nvme_first_scan(ns->disk))
1765                         pr_warn("%s: ignoring zoned namespace IO boundary\n",
1766                                 ns->disk->disk_name);
1767                 return;
1768         }
1769
1770         blk_queue_chunk_sectors(ns->queue, iob);
1771 }
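/*
 * Example (illustrative): NOIOB = 256 on a 512-byte-LBA namespace yields
 * iob = 256 sectors (128 KiB).  Being a power of two on a non-zoned queue,
 * it is applied as the chunk size, so bios are split at 128 KiB boundaries.
 */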
1772
1773 static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
1774 {
1775         unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
1776         int ret;
1777
1778         blk_mq_freeze_queue(ns->disk->queue);
1779         ns->lba_shift = id->lbaf[lbaf].ds;
1780         nvme_set_queue_limits(ns->ctrl, ns->queue);
1781
1782         nvme_configure_metadata(ns, id);
1783         nvme_set_chunk_sectors(ns, id);
1784         nvme_update_disk_info(ns->disk, ns, id);
1785
1786         if (ns->head->ids.csi == NVME_CSI_ZNS) {
1787                 ret = nvme_update_zone_info(ns, lbaf);
1788                 if (ret)
1789                         goto out_unfreeze;
1790         }
1791
1792         set_bit(NVME_NS_READY, &ns->flags);
1793         blk_mq_unfreeze_queue(ns->disk->queue);
1794
1795         if (blk_queue_is_zoned(ns->queue)) {
1796                 ret = nvme_revalidate_zones(ns);
1797                 if (ret && !nvme_first_scan(ns->disk))
1798                         return ret;
1799         }
1800
1801         if (nvme_ns_head_multipath(ns->head)) {
1802                 blk_mq_freeze_queue(ns->head->disk->queue);
1803                 nvme_update_disk_info(ns->head->disk, ns, id);
1804                 nvme_mpath_revalidate_paths(ns);
1805                 blk_stack_limits(&ns->head->disk->queue->limits,
1806                                  &ns->queue->limits, 0);
1807                 disk_update_readahead(ns->head->disk);
1808                 blk_mq_unfreeze_queue(ns->head->disk->queue);
1809         }
1810         return 0;
1811
1812 out_unfreeze:
1813         /*
1814          * If probing fails due to an unsupported feature, hide the block device,
1815          * but still allow other access.
1816          */
1817         if (ret == -ENODEV) {
1818                 ns->disk->flags |= GENHD_FL_HIDDEN;
1819                 set_bit(NVME_NS_READY, &ns->flags);
1820                 ret = 0;
1821         }
1822         blk_mq_unfreeze_queue(ns->disk->queue);
1823         return ret;
1824 }
1825
1826 static char nvme_pr_type(enum pr_type type)
1827 {
1828         switch (type) {
1829         case PR_WRITE_EXCLUSIVE:
1830                 return 1;
1831         case PR_EXCLUSIVE_ACCESS:
1832                 return 2;
1833         case PR_WRITE_EXCLUSIVE_REG_ONLY:
1834                 return 3;
1835         case PR_EXCLUSIVE_ACCESS_REG_ONLY:
1836                 return 4;
1837         case PR_WRITE_EXCLUSIVE_ALL_REGS:
1838                 return 5;
1839         case PR_EXCLUSIVE_ACCESS_ALL_REGS:
1840                 return 6;
1841         default:
1842                 return 0;
1843         }
1844 }
1845
1846 static int nvme_send_ns_head_pr_command(struct block_device *bdev,
1847                 struct nvme_command *c, u8 data[16])
1848 {
1849         struct nvme_ns_head *head = bdev->bd_disk->private_data;
1850         int srcu_idx = srcu_read_lock(&head->srcu);
1851         struct nvme_ns *ns = nvme_find_path(head);
1852         int ret = -EWOULDBLOCK;
1853
1854         if (ns) {
1855                 c->common.nsid = cpu_to_le32(ns->head->ns_id);
1856                 ret = nvme_submit_sync_cmd(ns->queue, c, data, 16);
1857         }
1858         srcu_read_unlock(&head->srcu, srcu_idx);
1859         return ret;
1860 }
1861
1862 static int nvme_send_ns_pr_command(struct nvme_ns *ns, struct nvme_command *c,
1863                 u8 data[16])
1864 {
1865         c->common.nsid = cpu_to_le32(ns->head->ns_id);
1866         return nvme_submit_sync_cmd(ns->queue, c, data, 16);
1867 }
1868
1869 static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
1870                                 u64 key, u64 sa_key, u8 op)
1871 {
1872         struct nvme_command c = { };
1873         u8 data[16] = { 0, };
1874
1875         put_unaligned_le64(key, &data[0]);
1876         put_unaligned_le64(sa_key, &data[8]);
1877
1878         c.common.opcode = op;
1879         c.common.cdw10 = cpu_to_le32(cdw10);
1880
1881         if (IS_ENABLED(CONFIG_NVME_MULTIPATH) &&
1882             bdev->bd_disk->fops == &nvme_ns_head_ops)
1883                 return nvme_send_ns_head_pr_command(bdev, &c, data);
1884         return nvme_send_ns_pr_command(bdev->bd_disk->private_data, &c, data);
1885 }
1886
1887 static int nvme_pr_register(struct block_device *bdev, u64 old,
1888                 u64 new, unsigned flags)
1889 {
1890         u32 cdw10;
1891
1892         if (flags & ~PR_FL_IGNORE_KEY)
1893                 return -EOPNOTSUPP;
1894
1895         cdw10 = old ? 2 : 0;
1896         cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
1897         cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
1898         return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
1899 }
1900
1901 static int nvme_pr_reserve(struct block_device *bdev, u64 key,
1902                 enum pr_type type, unsigned flags)
1903 {
1904         u32 cdw10;
1905
1906         if (flags & ~PR_FL_IGNORE_KEY)
1907                 return -EOPNOTSUPP;
1908
1909         cdw10 = nvme_pr_type(type) << 8;
1910         cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
1911         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
1912 }
1913
1914 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
1915                 enum pr_type type, bool abort)
1916 {
1917         u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
1918
1919         return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
1920 }
1921
1922 static int nvme_pr_clear(struct block_device *bdev, u64 key)
1923 {
1924         u32 cdw10 = 1 | (key ? 1 << 3 : 0);
1925
1926         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
1927 }
1928
1929 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
1930 {
1931         u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
1932
1933         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
1934 }
1935
1936 const struct pr_ops nvme_pr_ops = {
1937         .pr_register    = nvme_pr_register,
1938         .pr_reserve     = nvme_pr_reserve,
1939         .pr_release     = nvme_pr_release,
1940         .pr_preempt     = nvme_pr_preempt,
1941         .pr_clear       = nvme_pr_clear,
1942 };
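/*
 * cdw10 layouts used above (paraphrasing the NVMe base spec): Reservation
 * Register uses RREGA in bits 2:0 (0 register, 1 unregister, 2 replace),
 * IEKEY in bit 3 and CPTPL in bits 31:30; Reservation Acquire uses RACQA in
 * bits 2:0 (0 acquire, 1 preempt, 2 preempt-and-abort) and RTYPE in bits
 * 15:8; Reservation Release uses RRELA in bits 2:0 (0 release, 1 clear) and
 * RTYPE in bits 15:8.  E.g. nvme_pr_preempt(..., PR_WRITE_EXCLUSIVE, true)
 * sends cdw10 = (1 << 8) | 2.
 */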
1943
1944 #ifdef CONFIG_BLK_SED_OPAL
1945 int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
1946                 bool send)
1947 {
1948         struct nvme_ctrl *ctrl = data;
1949         struct nvme_command cmd = { };
1950
1951         if (send)
1952                 cmd.common.opcode = nvme_admin_security_send;
1953         else
1954                 cmd.common.opcode = nvme_admin_security_recv;
1955         cmd.common.nsid = 0;
1956         cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
1957         cmd.common.cdw11 = cpu_to_le32(len);
1958
1959         return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 0,
1960                         NVME_QID_ANY, 1, 0);
1961 }
1962 EXPORT_SYMBOL_GPL(nvme_sec_submit);
1963 #endif /* CONFIG_BLK_SED_OPAL */
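/*
 * cdw10 example for the Security Send/Receive path above (illustrative):
 * secp = 0x01 (TCG) and spsp = 0x0001 encode to 0x01000100, while cdw11
 * carries the transfer length in bytes.
 */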
1964
1965 #ifdef CONFIG_BLK_DEV_ZONED
1966 static int nvme_report_zones(struct gendisk *disk, sector_t sector,
1967                 unsigned int nr_zones, report_zones_cb cb, void *data)
1968 {
1969         return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb,
1970                         data);
1971 }
1972 #else
1973 #define nvme_report_zones       NULL
1974 #endif /* CONFIG_BLK_DEV_ZONED */
1975
1976 static const struct block_device_operations nvme_bdev_ops = {
1977         .owner          = THIS_MODULE,
1978         .ioctl          = nvme_ioctl,
1979         .open           = nvme_open,
1980         .release        = nvme_release,
1981         .getgeo         = nvme_getgeo,
1982         .report_zones   = nvme_report_zones,
1983         .pr_ops         = &nvme_pr_ops,
1984 };
1985
1986 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
1987 {
1988         unsigned long timeout =
1989                 ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
1990         u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
1991         int ret;
1992
1993         while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
1994                 if (csts == ~0)
1995                         return -ENODEV;
1996                 if ((csts & NVME_CSTS_RDY) == bit)
1997                         break;
1998
1999                 usleep_range(1000, 2000);
2000                 if (fatal_signal_pending(current))
2001                         return -EINTR;
2002                 if (time_after(jiffies, timeout)) {
2003                         dev_err(ctrl->device,
2004                                 "Device not ready; aborting %s, CSTS=0x%x\n",
2005                                 enabled ? "initialisation" : "reset", csts);
2006                         return -ENODEV;
2007                 }
2008         }
2009
2010         return ret;
2011 }
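/*
 * CAP.TO is in 500 ms units, so the deadline above is (CAP.TO + 1) * 500 ms.
 * E.g. CAP.TO = 30 allows roughly 15.5 seconds for RDY to (de)assert.
 */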
2012
2013 /*
2014  * If the device has been passed off to us in an enabled state, just clear
2015  * the enabled bit.  The spec says we should set the 'shutdown notification
2016  * bits', but doing so may cause the device to complete commands to the
2017  * admin queue ... and we don't know what memory that might be pointing at!
2018  */
2019 int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
2020 {
2021         int ret;
2022
2023         ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
2024         ctrl->ctrl_config &= ~NVME_CC_ENABLE;
2025
2026         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2027         if (ret)
2028                 return ret;
2029
2030         if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
2031                 msleep(NVME_QUIRK_DELAY_AMOUNT);
2032
2033         return nvme_wait_ready(ctrl, ctrl->cap, false);
2034 }
2035 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
2036
2037 int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
2038 {
2039         unsigned dev_page_min;
2040         int ret;
2041
2042         ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
2043         if (ret) {
2044                 dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
2045                 return ret;
2046         }
2047         dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
2048
2049         if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
2050                 dev_err(ctrl->device,
2051                         "Minimum device page size %u too large for host (%u)\n",
2052                         1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
2053                 return -ENODEV;
2054         }
2055
2056         if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
2057                 ctrl->ctrl_config = NVME_CC_CSS_CSI;
2058         else
2059                 ctrl->ctrl_config = NVME_CC_CSS_NVM;
2060         ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
2061         ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
2062         ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
2063         ctrl->ctrl_config |= NVME_CC_ENABLE;
2064
2065         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2066         if (ret)
2067                 return ret;
2068         return nvme_wait_ready(ctrl, ctrl->cap, true);
2069 }
2070 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
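/*
 * Rough CC example for 4 KiB controller pages (per the register layout in
 * the spec): CSS selects the NVM or multi-CSI command set,
 * MPS = NVME_CTRL_PAGE_SHIFT - 12 = 0, AMS = round robin, SHN = none,
 * IOSQES = 6 (64-byte SQEs), IOCQES = 4 (16-byte CQEs), EN = 1.
 */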
2071
2072 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
2073 {
2074         unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ);
2075         u32 csts;
2076         int ret;
2077
2078         ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
2079         ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
2080
2081         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2082         if (ret)
2083                 return ret;
2084
2085         while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
2086                 if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
2087                         break;
2088
2089                 msleep(100);
2090                 if (fatal_signal_pending(current))
2091                         return -EINTR;
2092                 if (time_after(jiffies, timeout)) {
2093                         dev_err(ctrl->device,
2094                                 "Device shutdown incomplete; abort shutdown\n");
2095                         return -ENODEV;
2096                 }
2097         }
2098
2099         return ret;
2100 }
2101 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
2102
2103 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
2104 {
2105         __le64 ts;
2106         int ret;
2107
2108         if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
2109                 return 0;
2110
2111         ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
2112         ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
2113                         NULL);
2114         if (ret)
2115                 dev_warn_once(ctrl->device,
2116                         "could not set timestamp (%d)\n", ret);
2117         return ret;
2118 }
2119
2120 static int nvme_configure_acre(struct nvme_ctrl *ctrl)
2121 {
2122         struct nvme_feat_host_behavior *host;
2123         int ret;
2124
2125         /* Don't bother enabling the feature if retry delay is not reported */
2126         if (!ctrl->crdt[0])
2127                 return 0;
2128
2129         host = kzalloc(sizeof(*host), GFP_KERNEL);
2130         if (!host)
2131                 return 0;
2132
2133         host->acre = NVME_ENABLE_ACRE;
2134         ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
2135                                 host, sizeof(*host), NULL);
2136         kfree(host);
2137         return ret;
2138 }
2139
2140 /*
2141  * The function checks whether the given total (exlat + enlat) latency of
2142  * a power state allows the latter to be used as an APST transition target.
2143  * It does so by comparing the latency to the primary and secondary latency
2144  * tolerances defined by module params. If there's a match, the corresponding
2145  * timeout value is returned and the matching tolerance index (1 or 2) is
2146  * reported.
2147  */
2148 static bool nvme_apst_get_transition_time(u64 total_latency,
2149                 u64 *transition_time, unsigned *last_index)
2150 {
2151         if (total_latency <= apst_primary_latency_tol_us) {
2152                 if (*last_index == 1)
2153                         return false;
2154                 *last_index = 1;
2155                 *transition_time = apst_primary_timeout_ms;
2156                 return true;
2157         }
2158         if (apst_secondary_timeout_ms &&
2159                 total_latency <= apst_secondary_latency_tol_us) {
2160                 if (*last_index <= 2)
2161                         return false;
2162                 *last_index = 2;
2163                 *transition_time = apst_secondary_timeout_ms;
2164                 return true;
2165         }
2166         return false;
2167 }
2168
2169 /*
2170  * APST (Autonomous Power State Transition) lets us program a table of power
2171  * state transitions that the controller will perform automatically.
2172  *
2173  * Depending on module params, one of the two supported techniques will be used:
2174  *
2175  * - If the parameters provide explicit timeouts and tolerances, they will be
2176  *   used to build a table with up to 2 non-operational states to transition to.
2177  *   The default parameter values were selected based on the values used by
2178  *   Microsoft's and Intel's NVMe drivers. Yet, since we don't implement dynamic
2179  *   regeneration of the APST table in the event of switching between external
2180  *   and battery power, the timeouts and tolerances reflect a compromise
2181  *   between values used by Microsoft for AC and battery scenarios.
2182  * - If not, we'll configure the table with a simple heuristic: we are willing
2183  *   to spend at most 2% of the time transitioning between power states.
2184  *   Therefore, when running in any given state, we will enter the next
2185  *   lower-power non-operational state after waiting 50 * (enlat + exlat)
2186  *   microseconds, as long as that state's exit latency is under the requested
2187  *   maximum latency.
2188  *
2189  * We will not autonomously enter any non-operational state for which the total
2190  * latency exceeds ps_max_latency_us.
2191  *
2192  * Users can set ps_max_latency_us to zero to turn off APST.
2193  */
2194 static int nvme_configure_apst(struct nvme_ctrl *ctrl)
2195 {
2196         struct nvme_feat_auto_pst *table;
2197         unsigned apste = 0;
2198         u64 max_lat_us = 0;
2199         __le64 target = 0;
2200         int max_ps = -1;
2201         int state;
2202         int ret;
2203         unsigned last_lt_index = UINT_MAX;
2204
2205         /*
2206          * If APST isn't supported or if we haven't been initialized yet,
2207          * then don't do anything.
2208          */
2209         if (!ctrl->apsta)
2210                 return 0;
2211
2212         if (ctrl->npss > 31) {
2213                 dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
2214                 return 0;
2215         }
2216
2217         table = kzalloc(sizeof(*table), GFP_KERNEL);
2218         if (!table)
2219                 return 0;
2220
2221         if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
2222                 /* Turn off APST. */
2223                 dev_dbg(ctrl->device, "APST disabled\n");
2224                 goto done;
2225         }
2226
2227         /*
2228          * Walk through all states from lowest- to highest-power.
2229          * According to the spec, lower-numbered states use more power.  NPSS,
2230          * despite the name, is the index of the lowest-power state, not the
2231          * number of states.
2232          */
2233         for (state = (int)ctrl->npss; state >= 0; state--) {
2234                 u64 total_latency_us, exit_latency_us, transition_ms;
2235
2236                 if (target)
2237                         table->entries[state] = target;
2238
2239                 /*
2240                  * Don't allow transitions to the deepest state if it's quirked
2241                  * off.
2242                  */
2243                 if (state == ctrl->npss &&
2244                     (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
2245                         continue;
2246
2247                 /*
2248                  * Is this state a useful non-operational state for higher-power
2249                  * states to autonomously transition to?
2250                  */
2251                 if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE))
2252                         continue;
2253
2254                 exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
2255                 if (exit_latency_us > ctrl->ps_max_latency_us)
2256                         continue;
2257
2258                 total_latency_us = exit_latency_us +
2259                         le32_to_cpu(ctrl->psd[state].entry_lat);
2260
2261                 /*
2262                  * This state is good. It can be used as the APST idle target
2263                  * for higher power states.
2264                  */
2265                 if (apst_primary_timeout_ms && apst_primary_latency_tol_us) {
2266                         if (!nvme_apst_get_transition_time(total_latency_us,
2267                                         &transition_ms, &last_lt_index))
2268                                 continue;
2269                 } else {
2270                         transition_ms = total_latency_us + 19;
2271                         do_div(transition_ms, 20);
2272                         if (transition_ms > (1 << 24) - 1)
2273                                 transition_ms = (1 << 24) - 1;
2274                 }
2275
2276                 target = cpu_to_le64((state << 3) | (transition_ms << 8));
2277                 if (max_ps == -1)
2278                         max_ps = state;
2279                 if (total_latency_us > max_lat_us)
2280                         max_lat_us = total_latency_us;
2281         }
2282
2283         if (max_ps == -1)
2284                 dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
2285         else
2286                 dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
2287                         max_ps, max_lat_us, (int)sizeof(*table), table);
2288         apste = 1;
2289
2290 done:
2291         ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
2292                                 table, sizeof(*table), NULL);
2293         if (ret)
2294                 dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
2295         kfree(table);
2296         return ret;
2297 }
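/*
 * APST table entries above pack the idle transition power state in bits 7:3
 * and the idle time prior to transition (in ms) in bits 31:8.  Worked
 * example under the 2% heuristic (hypothetical latencies): a non-operational
 * PS4 with enlat + exlat = 10000 us gives transition_ms = 500, so every
 * shallower state gets the entry (4 << 3) | (500 << 8).
 */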
2298
2299 static void nvme_set_latency_tolerance(struct device *dev, s32 val)
2300 {
2301         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2302         u64 latency;
2303
2304         switch (val) {
2305         case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
2306         case PM_QOS_LATENCY_ANY:
2307                 latency = U64_MAX;
2308                 break;
2309
2310         default:
2311                 latency = val;
2312         }
2313
2314         if (ctrl->ps_max_latency_us != latency) {
2315                 ctrl->ps_max_latency_us = latency;
2316                 if (ctrl->state == NVME_CTRL_LIVE)
2317                         nvme_configure_apst(ctrl);
2318         }
2319 }
2320
2321 struct nvme_core_quirk_entry {
2322         /*
2323          * NVMe model and firmware strings are padded with spaces.  For
2324          * simplicity, strings in the quirk table are padded with NULLs
2325          * instead.
2326          */
2327         u16 vid;
2328         const char *mn;
2329         const char *fr;
2330         unsigned long quirks;
2331 };
2332
2333 static const struct nvme_core_quirk_entry core_quirks[] = {
2334         {
2335                 /*
2336                  * This Toshiba device seems to die using any APST states.  See:
2337                  * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
2338                  */
2339                 .vid = 0x1179,
2340                 .mn = "THNSF5256GPUK TOSHIBA",
2341                 .quirks = NVME_QUIRK_NO_APST,
2342         },
2343         {
2344                 /*
2345                  * This LiteON CL1-3D*-Q11 firmware version has a race
2346                  * condition associated with actions related to suspend to idle.
2347                  * LiteON has resolved the problem in future firmware.
2348                  */
2349                 .vid = 0x14a4,
2350                 .fr = "22301111",
2351                 .quirks = NVME_QUIRK_SIMPLE_SUSPEND,
2352         },
2353         {
2354                 /*
2355                  * This Kioxia CD6-V Series / HPE PE8030 device times out and
2356                  * aborts I/O during any load, but more easily reproducible
2357                  * with discards (fstrim).
2358                  *
2359                  * The device is left in a state where it is also not possible
2360                  * to use "nvme set-feature" to disable APST, but booting with
2361                  * nvme_core.default_ps_max_latency=0 works.
2362                  */
2363                 .vid = 0x1e0f,
2364                 .mn = "KCD6XVUL6T40",
2365                 .quirks = NVME_QUIRK_NO_APST,
2366         }
2367 };
2368
2369 /* match is null-terminated but idstr is space-padded. */
2370 static bool string_matches(const char *idstr, const char *match, size_t len)
2371 {
2372         size_t matchlen;
2373
2374         if (!match)
2375                 return true;
2376
2377         matchlen = strlen(match);
2378         WARN_ON_ONCE(matchlen > len);
2379
2380         if (memcmp(idstr, match, matchlen))
2381                 return false;
2382
2383         for (; matchlen < len; matchlen++)
2384                 if (idstr[matchlen] != ' ')
2385                         return false;
2386
2387         return true;
2388 }
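/*
 * Example: string_matches(id->mn, "THNSF5256GPUK TOSHIBA", sizeof(id->mn))
 * is true when id->mn holds that model string followed only by space
 * padding; any differing byte or non-space padding returns false, and a
 * NULL match string matches everything.
 */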
2389
2390 static bool quirk_matches(const struct nvme_id_ctrl *id,
2391                           const struct nvme_core_quirk_entry *q)
2392 {
2393         return q->vid == le16_to_cpu(id->vid) &&
2394                 string_matches(id->mn, q->mn, sizeof(id->mn)) &&
2395                 string_matches(id->fr, q->fr, sizeof(id->fr));
2396 }
2397
2398 static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
2399                 struct nvme_id_ctrl *id)
2400 {
2401         size_t nqnlen;
2402         int off;
2403
2404         if (!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
2405                 nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
2406                 if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
2407                         strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
2408                         return;
2409                 }
2410
2411                 if (ctrl->vs >= NVME_VS(1, 2, 1))
2412                         dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
2413         }
2414
2415         /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
2416         off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
2417                         "nqn.2014.08.org.nvmexpress:%04x%04x",
2418                         le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
2419         memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
2420         off += sizeof(id->sn);
2421         memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
2422         off += sizeof(id->mn);
2423         memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
2424 }
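/*
 * Fake-NQN example (illustrative ids): vid = 0x8086 and ssvid = 0x8086
 * produce "nqn.2014.08.org.nvmexpress:80868086" followed by the raw 20-byte
 * serial number and 40-byte model number fields (space padded), with the
 * remainder of subnqn zeroed.
 */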
2425
2426 static void nvme_release_subsystem(struct device *dev)
2427 {
2428         struct nvme_subsystem *subsys =
2429                 container_of(dev, struct nvme_subsystem, dev);
2430
2431         if (subsys->instance >= 0)
2432                 ida_free(&nvme_instance_ida, subsys->instance);
2433         kfree(subsys);
2434 }
2435
2436 static void nvme_destroy_subsystem(struct kref *ref)
2437 {
2438         struct nvme_subsystem *subsys =
2439                         container_of(ref, struct nvme_subsystem, ref);
2440
2441         mutex_lock(&nvme_subsystems_lock);
2442         list_del(&subsys->entry);
2443         mutex_unlock(&nvme_subsystems_lock);
2444
2445         ida_destroy(&subsys->ns_ida);
2446         device_del(&subsys->dev);
2447         put_device(&subsys->dev);
2448 }
2449
2450 static void nvme_put_subsystem(struct nvme_subsystem *subsys)
2451 {
2452         kref_put(&subsys->ref, nvme_destroy_subsystem);
2453 }
2454
2455 static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
2456 {
2457         struct nvme_subsystem *subsys;
2458
2459         lockdep_assert_held(&nvme_subsystems_lock);
2460
2461         /*
2462          * Fail matches for discovery subsystems. This results in each
2463          * discovery controller being bound to a unique subsystem.
2464          * This avoids issues with validating controller values
2465          * that can only be true when there is a single unique subsystem.
2466          * There may be multiple and completely independent entities
2467          * that provide discovery controllers.
2468          */
2469         if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
2470                 return NULL;
2471
2472         list_for_each_entry(subsys, &nvme_subsystems, entry) {
2473                 if (strcmp(subsys->subnqn, subsysnqn))
2474                         continue;
2475                 if (!kref_get_unless_zero(&subsys->ref))
2476                         continue;
2477                 return subsys;
2478         }
2479
2480         return NULL;
2481 }
2482
2483 #define SUBSYS_ATTR_RO(_name, _mode, _show)                     \
2484         struct device_attribute subsys_attr_##_name = \
2485                 __ATTR(_name, _mode, _show, NULL)
2486
2487 static ssize_t nvme_subsys_show_nqn(struct device *dev,
2488                                     struct device_attribute *attr,
2489                                     char *buf)
2490 {
2491         struct nvme_subsystem *subsys =
2492                 container_of(dev, struct nvme_subsystem, dev);
2493
2494         return sysfs_emit(buf, "%s\n", subsys->subnqn);
2495 }
2496 static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
2497
2498 static ssize_t nvme_subsys_show_type(struct device *dev,
2499                                     struct device_attribute *attr,
2500                                     char *buf)
2501 {
2502         struct nvme_subsystem *subsys =
2503                 container_of(dev, struct nvme_subsystem, dev);
2504
2505         switch (subsys->subtype) {
2506         case NVME_NQN_DISC:
2507                 return sysfs_emit(buf, "discovery\n");
2508         case NVME_NQN_NVME:
2509                 return sysfs_emit(buf, "nvm\n");
2510         default:
2511                 return sysfs_emit(buf, "reserved\n");
2512         }
2513 }
2514 static SUBSYS_ATTR_RO(subsystype, S_IRUGO, nvme_subsys_show_type);
2515
2516 #define nvme_subsys_show_str_function(field)                            \
2517 static ssize_t subsys_##field##_show(struct device *dev,                \
2518                             struct device_attribute *attr, char *buf)   \
2519 {                                                                       \
2520         struct nvme_subsystem *subsys =                                 \
2521                 container_of(dev, struct nvme_subsystem, dev);          \
2522         return sysfs_emit(buf, "%.*s\n",                                \
2523                            (int)sizeof(subsys->field), subsys->field);  \
2524 }                                                                       \
2525 static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show);
2526
2527 nvme_subsys_show_str_function(model);
2528 nvme_subsys_show_str_function(serial);
2529 nvme_subsys_show_str_function(firmware_rev);
2530
2531 static struct attribute *nvme_subsys_attrs[] = {
2532         &subsys_attr_model.attr,
2533         &subsys_attr_serial.attr,
2534         &subsys_attr_firmware_rev.attr,
2535         &subsys_attr_subsysnqn.attr,
2536         &subsys_attr_subsystype.attr,
2537 #ifdef CONFIG_NVME_MULTIPATH
2538         &subsys_attr_iopolicy.attr,
2539 #endif
2540         NULL,
2541 };
2542
2543 static const struct attribute_group nvme_subsys_attrs_group = {
2544         .attrs = nvme_subsys_attrs,
2545 };
2546
2547 static const struct attribute_group *nvme_subsys_attrs_groups[] = {
2548         &nvme_subsys_attrs_group,
2549         NULL,
2550 };
2551
2552 static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
2553 {
2554         return ctrl->opts && ctrl->opts->discovery_nqn;
2555 }
2556
2557 static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
2558                 struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2559 {
2560         struct nvme_ctrl *tmp;
2561
2562         lockdep_assert_held(&nvme_subsystems_lock);
2563
2564         list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
2565                 if (nvme_state_terminal(tmp))
2566                         continue;
2567
2568                 if (tmp->cntlid == ctrl->cntlid) {
2569                         dev_err(ctrl->device,
2570                                 "Duplicate cntlid %u with %s, subsys %s, rejecting\n",
2571                                 ctrl->cntlid, dev_name(tmp->device),
2572                                 subsys->subnqn);
2573                         return false;
2574                 }
2575
2576                 if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
2577                     nvme_discovery_ctrl(ctrl))
2578                         continue;
2579
2580                 dev_err(ctrl->device,
2581                         "Subsystem does not support multiple controllers\n");
2582                 return false;
2583         }
2584
2585         return true;
2586 }
2587
2588 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2589 {
2590         struct nvme_subsystem *subsys, *found;
2591         int ret;
2592
2593         subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
2594         if (!subsys)
2595                 return -ENOMEM;
2596
2597         subsys->instance = -1;
2598         mutex_init(&subsys->lock);
2599         kref_init(&subsys->ref);
2600         INIT_LIST_HEAD(&subsys->ctrls);
2601         INIT_LIST_HEAD(&subsys->nsheads);
2602         nvme_init_subnqn(subsys, ctrl, id);
2603         memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
2604         memcpy(subsys->model, id->mn, sizeof(subsys->model));
2605         memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
2606         subsys->vendor_id = le16_to_cpu(id->vid);
2607         subsys->cmic = id->cmic;
2608
2609         /* Versions prior to 1.4 don't necessarily report a valid type */
2610         if (id->cntrltype == NVME_CTRL_DISC ||
2611             !strcmp(subsys->subnqn, NVME_DISC_SUBSYS_NAME))
2612                 subsys->subtype = NVME_NQN_DISC;
2613         else
2614                 subsys->subtype = NVME_NQN_NVME;
2615
2616         if (nvme_discovery_ctrl(ctrl) && subsys->subtype != NVME_NQN_DISC) {
2617                 dev_err(ctrl->device,
2618                         "Subsystem %s is not a discovery controller",
2619                         subsys->subnqn);
2620                 kfree(subsys);
2621                 return -EINVAL;
2622         }
2623         subsys->awupf = le16_to_cpu(id->awupf);
2624         nvme_mpath_default_iopolicy(subsys);
2625
2626         subsys->dev.class = nvme_subsys_class;
2627         subsys->dev.release = nvme_release_subsystem;
2628         subsys->dev.groups = nvme_subsys_attrs_groups;
2629         dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
2630         device_initialize(&subsys->dev);
2631
2632         mutex_lock(&nvme_subsystems_lock);
2633         found = __nvme_find_get_subsystem(subsys->subnqn);
2634         if (found) {
2635                 put_device(&subsys->dev);
2636                 subsys = found;
2637
2638                 if (!nvme_validate_cntlid(subsys, ctrl, id)) {
2639                         ret = -EINVAL;
2640                         goto out_put_subsystem;
2641                 }
2642         } else {
2643                 ret = device_add(&subsys->dev);
2644                 if (ret) {
2645                         dev_err(ctrl->device,
2646                                 "failed to register subsystem device.\n");
2647                         put_device(&subsys->dev);
2648                         goto out_unlock;
2649                 }
2650                 ida_init(&subsys->ns_ida);
2651                 list_add_tail(&subsys->entry, &nvme_subsystems);
2652         }
2653
2654         ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
2655                                 dev_name(ctrl->device));
2656         if (ret) {
2657                 dev_err(ctrl->device,
2658                         "failed to create sysfs link from subsystem.\n");
2659                 goto out_put_subsystem;
2660         }
2661
2662         if (!found)
2663                 subsys->instance = ctrl->instance;
2664         ctrl->subsys = subsys;
2665         list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
2666         mutex_unlock(&nvme_subsystems_lock);
2667         return 0;
2668
2669 out_put_subsystem:
2670         nvme_put_subsystem(subsys);
2671 out_unlock:
2672         mutex_unlock(&nvme_subsystems_lock);
2673         return ret;
2674 }
2675
2676 int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
2677                 void *log, size_t size, u64 offset)
2678 {
2679         struct nvme_command c = { };
2680         u32 dwlen = nvme_bytes_to_numd(size);
2681
2682         c.get_log_page.opcode = nvme_admin_get_log_page;
2683         c.get_log_page.nsid = cpu_to_le32(nsid);
2684         c.get_log_page.lid = log_page;
2685         c.get_log_page.lsp = lsp;
2686         c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
2687         c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
2688         c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
2689         c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
2690         c.get_log_page.csi = csi;
2691
2692         return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
2693 }
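/*
 * Example: a 4096-byte log page gives a 0's based dword count of
 * 4096 / 4 - 1 = 1023, so numdl = 0x03ff and numdu = 0; the 64-bit byte
 * offset is split into lpol/lpou the same way.
 */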
2694
2695 static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
2696                                 struct nvme_effects_log **log)
2697 {
2698         struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi);
2699         int ret;
2700
2701         if (cel)
2702                 goto out;
2703
2704         cel = kzalloc(sizeof(*cel), GFP_KERNEL);
2705         if (!cel)
2706                 return -ENOMEM;
2707
2708         ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
2709                         cel, sizeof(*cel), 0);
2710         if (ret) {
2711                 kfree(cel);
2712                 return ret;
2713         }
2714
2715         xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
2716 out:
2717         *log = cel;
2718         return 0;
2719 }
2720
2721 static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units)
2722 {
2723         u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12, val;
2724
2725         if (check_shl_overflow(1U, units + page_shift - 9, &val))
2726                 return UINT_MAX;
2727         return val;
2728 }
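/*
 * Example: with CAP.MPSMIN = 0 the unit is a 4 KiB controller page
 * (page_shift = 12), so e.g. id->wzsl = 5 converts to
 * 1 << (5 + 12 - 9) = 256 sectors (128 KiB).
 */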
2729
2730 static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
2731 {
2732         struct nvme_command c = { };
2733         struct nvme_id_ctrl_nvm *id;
2734         int ret;
2735
2736         if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
2737                 ctrl->max_discard_sectors = UINT_MAX;
2738                 ctrl->max_discard_segments = NVME_DSM_MAX_RANGES;
2739         } else {
2740                 ctrl->max_discard_sectors = 0;
2741                 ctrl->max_discard_segments = 0;
2742         }
2743
2744         /*
2745          * Even though the NVMe spec explicitly states that MDTS is not applicable
2746          * to write-zeroes, we are cautious and limit the size to the
2747          * controller's max_hw_sectors value, which is based on the MDTS field
2748          * and possibly other limiting factors.
2749          */
2750         if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
2751             !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
2752                 ctrl->max_zeroes_sectors = ctrl->max_hw_sectors;
2753         else
2754                 ctrl->max_zeroes_sectors = 0;
2755
2756         if (nvme_ctrl_limited_cns(ctrl))
2757                 return 0;
2758
2759         id = kzalloc(sizeof(*id), GFP_KERNEL);
2760         if (!id)
2761                 return 0;
2762
2763         c.identify.opcode = nvme_admin_identify;
2764         c.identify.cns = NVME_ID_CNS_CS_CTRL;
2765         c.identify.csi = NVME_CSI_NVM;
2766
2767         ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
2768         if (ret)
2769                 goto free_data;
2770
2771         if (id->dmrl)
2772                 ctrl->max_discard_segments = id->dmrl;
2773         if (id->dmrsl)
2774                 ctrl->max_discard_sectors = le32_to_cpu(id->dmrsl);
2775         if (id->wzsl)
2776                 ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
2777
2778 free_data:
2779         kfree(id);
2780         return ret;
2781 }
2782
2783 static int nvme_init_identify(struct nvme_ctrl *ctrl)
2784 {
2785         struct nvme_id_ctrl *id;
2786         u32 max_hw_sectors;
2787         bool prev_apst_enabled;
2788         int ret;
2789
2790         ret = nvme_identify_ctrl(ctrl, &id);
2791         if (ret) {
2792                 dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
2793                 return -EIO;
2794         }
2795
2796         if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
2797                 ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
2798                 if (ret < 0)
2799                         goto out_free;
2800         }
2801
2802         if (!(ctrl->ops->flags & NVME_F_FABRICS))
2803                 ctrl->cntlid = le16_to_cpu(id->cntlid);
2804
2805         if (!ctrl->identified) {
2806                 unsigned int i;
2807
2808                 ret = nvme_init_subsystem(ctrl, id);
2809                 if (ret)
2810                         goto out_free;
2811
2812                 /*
2813                  * Check for quirks.  Quirks can depend on the firmware version,
2814                  * so, in principle, the set of quirks present can change
2815                  * across a reset.  As a possible future enhancement, we
2816                  * could re-scan for quirks every time we reinitialize
2817                  * the device, but we'd have to make sure that the driver
2818                  * behaves intelligently if the quirks change.
2819                  */
2820                 for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
2821                         if (quirk_matches(id, &core_quirks[i]))
2822                                 ctrl->quirks |= core_quirks[i].quirks;
2823                 }
2824         }
2825
2826         if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
2827                 dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
2828                 ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
2829         }
2830
2831         ctrl->crdt[0] = le16_to_cpu(id->crdt1);
2832         ctrl->crdt[1] = le16_to_cpu(id->crdt2);
2833         ctrl->crdt[2] = le16_to_cpu(id->crdt3);
2834
2835         ctrl->oacs = le16_to_cpu(id->oacs);
2836         ctrl->oncs = le16_to_cpu(id->oncs);
2837         ctrl->mtfa = le16_to_cpu(id->mtfa);
2838         ctrl->oaes = le32_to_cpu(id->oaes);
2839         ctrl->wctemp = le16_to_cpu(id->wctemp);
2840         ctrl->cctemp = le16_to_cpu(id->cctemp);
2841
2842         atomic_set(&ctrl->abort_limit, id->acl + 1);
2843         ctrl->vwc = id->vwc;
2844         if (id->mdts)
2845                 max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts);
2846         else
2847                 max_hw_sectors = UINT_MAX;
2848         ctrl->max_hw_sectors =
2849                 min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
2850
2851         nvme_set_queue_limits(ctrl, ctrl->admin_q);
2852         ctrl->sgls = le32_to_cpu(id->sgls);
2853         ctrl->kas = le16_to_cpu(id->kas);
2854         ctrl->max_namespaces = le32_to_cpu(id->mnan);
2855         ctrl->ctratt = le32_to_cpu(id->ctratt);
2856
2857         ctrl->cntrltype = id->cntrltype;
2858         ctrl->dctype = id->dctype;
2859
2860         if (id->rtd3e) {
2861                 /* us -> s */
2862                 u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
2863
2864                 ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
2865                                                  shutdown_timeout, 60);
2866
2867                 if (ctrl->shutdown_timeout != shutdown_timeout)
2868                         dev_info(ctrl->device,
2869                                  "Shutdown timeout set to %u seconds\n",
2870                                  ctrl->shutdown_timeout);
2871         } else
2872                 ctrl->shutdown_timeout = shutdown_timeout;
2873
2874         ctrl->npss = id->npss;
2875         ctrl->apsta = id->apsta;
2876         prev_apst_enabled = ctrl->apst_enabled;
2877         if (ctrl->quirks & NVME_QUIRK_NO_APST) {
2878                 if (force_apst && id->apsta) {
2879                         dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
2880                         ctrl->apst_enabled = true;
2881                 } else {
2882                         ctrl->apst_enabled = false;
2883                 }
2884         } else {
2885                 ctrl->apst_enabled = id->apsta;
2886         }
2887         memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
2888
2889         if (ctrl->ops->flags & NVME_F_FABRICS) {
2890                 ctrl->icdoff = le16_to_cpu(id->icdoff);
2891                 ctrl->ioccsz = le32_to_cpu(id->ioccsz);
2892                 ctrl->iorcsz = le32_to_cpu(id->iorcsz);
2893                 ctrl->maxcmd = le16_to_cpu(id->maxcmd);
2894
2895                 /*
2896                  * In fabrics we need to verify the cntlid matches the
2897                  * admin connect
2898                  */
2899                 if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
2900                         dev_err(ctrl->device,
2901                                 "Mismatching cntlid: Connect %u vs Identify "
2902                                 "%u, rejecting\n",
2903                                 ctrl->cntlid, le16_to_cpu(id->cntlid));
2904                         ret = -EINVAL;
2905                         goto out_free;
2906                 }
2907
2908                 if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
2909                         dev_err(ctrl->device,
2910                                 "keep-alive support is mandatory for fabrics\n");
2911                         ret = -EINVAL;
2912                         goto out_free;
2913                 }
2914         } else {
2915                 ctrl->hmpre = le32_to_cpu(id->hmpre);
2916                 ctrl->hmmin = le32_to_cpu(id->hmmin);
2917                 ctrl->hmminds = le32_to_cpu(id->hmminds);
2918                 ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
2919         }
2920
2921         ret = nvme_mpath_init_identify(ctrl, id);
2922         if (ret < 0)
2923                 goto out_free;
2924
2925         if (ctrl->apst_enabled && !prev_apst_enabled)
2926                 dev_pm_qos_expose_latency_tolerance(ctrl->device);
2927         else if (!ctrl->apst_enabled && prev_apst_enabled)
2928                 dev_pm_qos_hide_latency_tolerance(ctrl->device);
2929
2930 out_free:
2931         kfree(id);
2932         return ret;
2933 }
2934
2935 /*
2936  * Initialize the cached copies of the Identify data and various controller
2937  * registers in our nvme_ctrl structure.  This should be called as soon as
2938  * the admin queue is fully up and running.
2939  */
2940 int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl)
2941 {
2942         int ret;
2943
2944         ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
2945         if (ret) {
2946                 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
2947                 return ret;
2948         }
2949
2950         ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
2951
2952         if (ctrl->vs >= NVME_VS(1, 1, 0))
2953                 ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
2954
2955         ret = nvme_init_identify(ctrl);
2956         if (ret)
2957                 return ret;
2958
2959         ret = nvme_init_non_mdts_limits(ctrl);
2960         if (ret < 0)
2961                 return ret;
2962
2963         ret = nvme_configure_apst(ctrl);
2964         if (ret < 0)
2965                 return ret;
2966
2967         ret = nvme_configure_timestamp(ctrl);
2968         if (ret < 0)
2969                 return ret;
2970
2971         ret = nvme_configure_acre(ctrl);
2972         if (ret < 0)
2973                 return ret;
2974
2975         if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
2976                 ret = nvme_hwmon_init(ctrl);
2977                 if (ret < 0)
2978                         return ret;
2979         }
2980
2981         ctrl->identified = true;
2982
2983         return 0;
2984 }
2985 EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish);
2986
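/*
 * Open handler for the controller character device.  Passthrough access is
 * only allowed while the controller is LIVE; take a reference on the
 * controller and its transport module for the lifetime of the open file.
 */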
2987 static int nvme_dev_open(struct inode *inode, struct file *file)
2988 {
2989         struct nvme_ctrl *ctrl =
2990                 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
2991
2992         switch (ctrl->state) {
2993         case NVME_CTRL_LIVE:
2994                 break;
2995         default:
2996                 return -EWOULDBLOCK;
2997         }
2998
2999         nvme_get_ctrl(ctrl);
3000         if (!try_module_get(ctrl->ops->module)) {
3001                 nvme_put_ctrl(ctrl);
3002                 return -EINVAL;
3003         }
3004
3005         file->private_data = ctrl;
3006         return 0;
3007 }
3008
3009 static int nvme_dev_release(struct inode *inode, struct file *file)
3010 {
3011         struct nvme_ctrl *ctrl =
3012                 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3013
3014         module_put(ctrl->ops->module);
3015         nvme_put_ctrl(ctrl);
3016         return 0;
3017 }
3018
3019 static const struct file_operations nvme_dev_fops = {
3020         .owner          = THIS_MODULE,
3021         .open           = nvme_dev_open,
3022         .release        = nvme_dev_release,
3023         .unlocked_ioctl = nvme_dev_ioctl,
3024         .compat_ioctl   = compat_ptr_ioctl,
3025 };
3026
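/*
 * Writing any value to the reset_controller attribute triggers a synchronous
 * controller reset, e.g.:
 *
 *	echo 1 > /sys/class/nvme/nvme0/reset_controller
 */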
3027 static ssize_t nvme_sysfs_reset(struct device *dev,
3028                                 struct device_attribute *attr, const char *buf,
3029                                 size_t count)
3030 {
3031         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3032         int ret;
3033
3034         ret = nvme_reset_ctrl_sync(ctrl);
3035         if (ret < 0)
3036                 return ret;
3037         return count;
3038 }
3039 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
3040
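/*
 * Writing any value to the rescan_controller attribute queues a namespace
 * scan, e.g.:
 *
 *	echo 1 > /sys/class/nvme/nvme0/rescan_controller
 */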
3041 static ssize_t nvme_sysfs_rescan(struct device *dev,
3042                                 struct device_attribute *attr, const char *buf,
3043                                 size_t count)
3044 {
3045         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3046
3047         nvme_queue_scan(ctrl);
3048         return count;
3049 }
3050 static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
3051
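/*
 * Map a block device's struct device back to its namespace head.  Regular
 * per-controller disks go through the nvme_ns, while multipath disks store
 * the head directly in the gendisk private data.
 */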
3052 static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev)
3053 {
3054         struct gendisk *disk = dev_to_disk(dev);
3055
3056         if (disk->fops == &nvme_bdev_ops)
3057                 return nvme_get_ns_from_dev(dev)->head;
3058         else
3059                 return disk->private_data;
3060 }
3061
3062 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
3063                 char *buf)
3064 {
3065         struct nvme_ns_head *head = dev_to_ns_head(dev);
3066         struct nvme_ns_ids *ids = &head->ids;
3067         struct nvme_subsystem *subsys = head->subsys;
3068         int serial_len = sizeof(subsys->serial);
3069         int model_len = sizeof(subsys->model);
3070
3071         if (!uuid_is_null(&ids->uuid))
3072                 return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid);
3073
3074         if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
3075                 return sysfs_emit(buf, "eui.%16phN\n", ids->nguid);
3076
3077         if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
3078                 return sysfs_emit(buf, "eui.%8phN\n", ids->eui64);
3079
3080         while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||
3081                                   subsys->serial[serial_len - 1] == '\0'))
3082                 serial_len--;
3083         while (model_len > 0 && (subsys->model[model_len - 1] == ' ' ||
3084                                  subsys->model[model_len - 1] == '\0'))
3085                 model_len--;
3086
3087         return sysfs_emit(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
3088                 serial_len, subsys->serial, model_len, subsys->model,
3089                 head->ns_id);
3090 }
3091 static DEVICE_ATTR_RO(wwid);
3092
3093 static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
3094                 char *buf)
3095 {
3096         return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
3097 }
3098 static DEVICE_ATTR_RO(nguid);
3099
3100 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
3101                 char *buf)
3102 {
3103         struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
3104
3105         /* For backward compatibility, expose the NGUID to userspace if
3106          * we have no UUID set.
3107          */
3108         if (uuid_is_null(&ids->uuid)) {
3109                 printk_ratelimited(KERN_WARNING
3110                                    "No UUID available, providing old NGUID\n");
3111                 return sysfs_emit(buf, "%pU\n", ids->nguid);
3112         }
3113         return sysfs_emit(buf, "%pU\n", &ids->uuid);
3114 }
3115 static DEVICE_ATTR_RO(uuid);
3116
3117 static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
3118                 char *buf)
3119 {
3120         return sysfs_emit(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
3121 }
3122 static DEVICE_ATTR_RO(eui);
3123
3124 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
3125                 char *buf)
3126 {
3127         return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
3128 }
3129 static DEVICE_ATTR_RO(nsid);
3130
3131 static struct attribute *nvme_ns_id_attrs[] = {
3132         &dev_attr_wwid.attr,
3133         &dev_attr_uuid.attr,
3134         &dev_attr_nguid.attr,
3135         &dev_attr_eui.attr,
3136         &dev_attr_nsid.attr,
3137 #ifdef CONFIG_NVME_MULTIPATH
3138         &dev_attr_ana_grpid.attr,
3139         &dev_attr_ana_state.attr,
3140 #endif
3141         NULL,
3142 };
3143
3144 static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
3145                 struct attribute *a, int n)
3146 {
3147         struct device *dev = container_of(kobj, struct device, kobj);
3148         struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
3149
3150         if (a == &dev_attr_uuid.attr) {
3151                 if (uuid_is_null(&ids->uuid) &&
3152                     !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
3153                         return 0;
3154         }
3155         if (a == &dev_attr_nguid.attr) {
3156                 if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
3157                         return 0;
3158         }
3159         if (a == &dev_attr_eui.attr) {
3160                 if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
3161                         return 0;
3162         }
3163 #ifdef CONFIG_NVME_MULTIPATH
3164         if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) {
3165                 if (dev_to_disk(dev)->fops != &nvme_bdev_ops) /* per-path attr */
3166                         return 0;
3167                 if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
3168                         return 0;
3169         }
3170 #endif
3171         return a->mode;
3172 }
3173
3174 static const struct attribute_group nvme_ns_id_attr_group = {
3175         .attrs          = nvme_ns_id_attrs,
3176         .is_visible     = nvme_ns_id_attrs_are_visible,
3177 };
3178
3179 const struct attribute_group *nvme_ns_id_attr_groups[] = {
3180         &nvme_ns_id_attr_group,
3181         NULL,
3182 };
3183
3184 #define nvme_show_str_function(field)                                           \
3185 static ssize_t  field##_show(struct device *dev,                                \
3186                             struct device_attribute *attr, char *buf)           \
3187 {                                                                               \
3188         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);                          \
3189         return sysfs_emit(buf, "%.*s\n",                                        \
3190                 (int)sizeof(ctrl->subsys->field), ctrl->subsys->field);         \
3191 }                                                                               \
3192 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
3193
3194 nvme_show_str_function(model);
3195 nvme_show_str_function(serial);
3196 nvme_show_str_function(firmware_rev);
3197
3198 #define nvme_show_int_function(field)                                           \
3199 static ssize_t  field##_show(struct device *dev,                                \
3200                             struct device_attribute *attr, char *buf)           \
3201 {                                                                               \
3202         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);                          \
3203         return sysfs_emit(buf, "%d\n", ctrl->field);                            \
3204 }                                                                               \
3205 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
3206
3207 nvme_show_int_function(cntlid);
3208 nvme_show_int_function(numa_node);
3209 nvme_show_int_function(queue_count);
3210 nvme_show_int_function(sqsize);
3211 nvme_show_int_function(kato);
3212
3213 static ssize_t nvme_sysfs_delete(struct device *dev,
3214                                 struct device_attribute *attr, const char *buf,
3215                                 size_t count)
3216 {
3217         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3218
3219         if (device_remove_file_self(dev, attr))
3220                 nvme_delete_ctrl_sync(ctrl);
3221         return count;
3222 }
3223 static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
3224
3225 static ssize_t nvme_sysfs_show_transport(struct device *dev,
3226                                          struct device_attribute *attr,
3227                                          char *buf)
3228 {
3229         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3230
3231         return sysfs_emit(buf, "%s\n", ctrl->ops->name);
3232 }
3233 static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
3234
3235 static ssize_t nvme_sysfs_show_state(struct device *dev,
3236                                      struct device_attribute *attr,
3237                                      char *buf)
3238 {
3239         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3240         static const char *const state_name[] = {
3241                 [NVME_CTRL_NEW]         = "new",
3242                 [NVME_CTRL_LIVE]        = "live",
3243                 [NVME_CTRL_RESETTING]   = "resetting",
3244                 [NVME_CTRL_CONNECTING]  = "connecting",
3245                 [NVME_CTRL_DELETING]    = "deleting",
3246                 [NVME_CTRL_DELETING_NOIO] = "deleting (no IO)",
3247                 [NVME_CTRL_DEAD]        = "dead",
3248         };
3249
3250         if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
3251             state_name[ctrl->state])
3252                 return sysfs_emit(buf, "%s\n", state_name[ctrl->state]);
3253
3254         return sysfs_emit(buf, "unknown state\n");
3255 }
3256
3257 static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
3258
3259 static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
3260                                          struct device_attribute *attr,
3261                                          char *buf)
3262 {
3263         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3264
3265         return sysfs_emit(buf, "%s\n", ctrl->subsys->subnqn);
3266 }
3267 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
3268
3269 static ssize_t nvme_sysfs_show_hostnqn(struct device *dev,
3270                                         struct device_attribute *attr,
3271                                         char *buf)
3272 {
3273         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3274
3275         return sysfs_emit(buf, "%s\n", ctrl->opts->host->nqn);
3276 }
3277 static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL);
3278
3279 static ssize_t nvme_sysfs_show_hostid(struct device *dev,
3280                                         struct device_attribute *attr,
3281                                         char *buf)
3282 {
3283         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3284
3285         return sysfs_emit(buf, "%pU\n", &ctrl->opts->host->id);
3286 }
3287 static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL);
3288
3289 static ssize_t nvme_sysfs_show_address(struct device *dev,
3290                                          struct device_attribute *attr,
3291                                          char *buf)
3292 {
3293         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3294
3295         return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
3296 }
3297 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
3298
3299 static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,
3300                 struct device_attribute *attr, char *buf)
3301 {
3302         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3303         struct nvmf_ctrl_options *opts = ctrl->opts;
3304
3305         if (ctrl->opts->max_reconnects == -1)
3306                 return sysfs_emit(buf, "off\n");
3307         return sysfs_emit(buf, "%d\n",
3308                           opts->max_reconnects * opts->reconnect_delay);
3309 }
3310
3311 static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,
3312                 struct device_attribute *attr, const char *buf, size_t count)
3313 {
3314         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3315         struct nvmf_ctrl_options *opts = ctrl->opts;
3316         int ctrl_loss_tmo, err;
3317
3318         err = kstrtoint(buf, 10, &ctrl_loss_tmo);
3319         if (err)
3320                 return -EINVAL;
3321
3322         if (ctrl_loss_tmo < 0)
3323                 opts->max_reconnects = -1;
3324         else
3325                 opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
3326                                                 opts->reconnect_delay);
3327         return count;
3328 }
3329 static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR,
3330         nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store);
3331
3332 static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,
3333                 struct device_attribute *attr, char *buf)
3334 {
3335         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3336
3337         if (ctrl->opts->reconnect_delay == -1)
3338                 return sysfs_emit(buf, "off\n");
3339         return sysfs_emit(buf, "%d\n", ctrl->opts->reconnect_delay);
3340 }
3341
3342 static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,
3343                 struct device_attribute *attr, const char *buf, size_t count)
3344 {
3345         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3346         unsigned int v;
3347         int err;
3348
3349         err = kstrtou32(buf, 10, &v);
3350         if (err)
3351                 return err;
3352
3353         ctrl->opts->reconnect_delay = v;
3354         return count;
3355 }
3356 static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,
3357         nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store);
3358
3359 static ssize_t nvme_ctrl_fast_io_fail_tmo_show(struct device *dev,
3360                 struct device_attribute *attr, char *buf)
3361 {
3362         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3363
3364         if (ctrl->opts->fast_io_fail_tmo == -1)
3365                 return sysfs_emit(buf, "off\n");
3366         return sysfs_emit(buf, "%d\n", ctrl->opts->fast_io_fail_tmo);
3367 }
3368
3369 static ssize_t nvme_ctrl_fast_io_fail_tmo_store(struct device *dev,
3370                 struct device_attribute *attr, const char *buf, size_t count)
3371 {
3372         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3373         struct nvmf_ctrl_options *opts = ctrl->opts;
3374         int fast_io_fail_tmo, err;
3375
3376         err = kstrtoint(buf, 10, &fast_io_fail_tmo);
3377         if (err)
3378                 return -EINVAL;
3379
3380         if (fast_io_fail_tmo < 0)
3381                 opts->fast_io_fail_tmo = -1;
3382         else
3383                 opts->fast_io_fail_tmo = fast_io_fail_tmo;
3384         return count;
3385 }
3386 static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR,
3387         nvme_ctrl_fast_io_fail_tmo_show, nvme_ctrl_fast_io_fail_tmo_store);
3388
3389 static ssize_t cntrltype_show(struct device *dev,
3390                               struct device_attribute *attr, char *buf)
3391 {
3392         static const char * const type[] = {
3393                 [NVME_CTRL_IO] = "io\n",
3394                 [NVME_CTRL_DISC] = "discovery\n",
3395                 [NVME_CTRL_ADMIN] = "admin\n",
3396         };
3397         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3398
3399         if (ctrl->cntrltype > NVME_CTRL_ADMIN || !type[ctrl->cntrltype])
3400                 return sysfs_emit(buf, "reserved\n");
3401
3402         return sysfs_emit(buf, type[ctrl->cntrltype]);
3403 }
3404 static DEVICE_ATTR_RO(cntrltype);
3405
3406 static ssize_t dctype_show(struct device *dev,
3407                            struct device_attribute *attr, char *buf)
3408 {
3409         static const char * const type[] = {
3410                 [NVME_DCTYPE_NOT_REPORTED] = "none\n",
3411                 [NVME_DCTYPE_DDC] = "ddc\n",
3412                 [NVME_DCTYPE_CDC] = "cdc\n",
3413         };
3414         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3415
3416         if (ctrl->dctype > NVME_DCTYPE_CDC || !type[ctrl->dctype])
3417                 return sysfs_emit(buf, "reserved\n");
3418
3419         return sysfs_emit(buf, type[ctrl->dctype]);
3420 }
3421 static DEVICE_ATTR_RO(dctype);
3422
3423 static struct attribute *nvme_dev_attrs[] = {
3424         &dev_attr_reset_controller.attr,
3425         &dev_attr_rescan_controller.attr,
3426         &dev_attr_model.attr,
3427         &dev_attr_serial.attr,
3428         &dev_attr_firmware_rev.attr,
3429         &dev_attr_cntlid.attr,
3430         &dev_attr_delete_controller.attr,
3431         &dev_attr_transport.attr,
3432         &dev_attr_subsysnqn.attr,
3433         &dev_attr_address.attr,
3434         &dev_attr_state.attr,
3435         &dev_attr_numa_node.attr,
3436         &dev_attr_queue_count.attr,
3437         &dev_attr_sqsize.attr,
3438         &dev_attr_hostnqn.attr,
3439         &dev_attr_hostid.attr,
3440         &dev_attr_ctrl_loss_tmo.attr,
3441         &dev_attr_reconnect_delay.attr,
3442         &dev_attr_fast_io_fail_tmo.attr,
3443         &dev_attr_kato.attr,
3444         &dev_attr_cntrltype.attr,
3445         &dev_attr_dctype.attr,
3446         NULL
3447 };
3448
3449 static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
3450                 struct attribute *a, int n)
3451 {
3452         struct device *dev = container_of(kobj, struct device, kobj);
3453         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3454
3455         if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
3456                 return 0;
3457         if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
3458                 return 0;
3459         if (a == &dev_attr_hostnqn.attr && !ctrl->opts)
3460                 return 0;
3461         if (a == &dev_attr_hostid.attr && !ctrl->opts)
3462                 return 0;
3463         if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts)
3464                 return 0;
3465         if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts)
3466                 return 0;
3467         if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts)
3468                 return 0;
3469
3470         return a->mode;
3471 }
3472
3473 static const struct attribute_group nvme_dev_attrs_group = {
3474         .attrs          = nvme_dev_attrs,
3475         .is_visible     = nvme_dev_attrs_are_visible,
3476 };
3477
3478 static const struct attribute_group *nvme_dev_attr_groups[] = {
3479         &nvme_dev_attrs_group,
3480         NULL,
3481 };
3482
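/*
 * Find an existing namespace head with the given NSID in the subsystem and
 * return it with an elevated reference, or NULL if there is none.  Must be
 * called with subsys->lock held.
 */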
3483 static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys,
3484                 unsigned nsid)
3485 {
3486         struct nvme_ns_head *h;
3487
3488         lockdep_assert_held(&subsys->lock);
3489
3490         list_for_each_entry(h, &subsys->nsheads, entry) {
3491                 if (h->ns_id != nsid)
3492                         continue;
3493                 if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
3494                         return h;
3495         }
3496
3497         return NULL;
3498 }
3499
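/*
 * Check that none of the given namespace identifiers (UUID, NGUID, EUI-64)
 * collide with those of an existing namespace head in this subsystem.  Must
 * be called with subsys->lock held.
 */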
3500 static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys,
3501                 struct nvme_ns_ids *ids)
3502 {
3503         bool has_uuid = !uuid_is_null(&ids->uuid);
3504         bool has_nguid = memchr_inv(ids->nguid, 0, sizeof(ids->nguid));
3505         bool has_eui64 = memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
3506         struct nvme_ns_head *h;
3507
3508         lockdep_assert_held(&subsys->lock);
3509
3510         list_for_each_entry(h, &subsys->nsheads, entry) {
3511                 if (has_uuid && uuid_equal(&ids->uuid, &h->ids.uuid))
3512                         return -EINVAL;
3513                 if (has_nguid &&
3514                     memcmp(&ids->nguid, &h->ids.nguid, sizeof(ids->nguid)) == 0)
3515                         return -EINVAL;
3516                 if (has_eui64 &&
3517                     memcmp(&ids->eui64, &h->ids.eui64, sizeof(ids->eui64)) == 0)
3518                         return -EINVAL;
3519         }
3520
3521         return 0;
3522 }
3523
3524 static void nvme_cdev_rel(struct device *dev)
3525 {
3526         ida_free(&nvme_ns_chr_minor_ida, MINOR(dev->devt));
3527 }
3528
3529 void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device)
3530 {
3531         cdev_device_del(cdev, cdev_device);
3532         put_device(cdev_device);
3533 }
3534
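/*
 * Allocate a minor number and register a character device together with its
 * struct device.  On failure the device reference is dropped, which frees
 * the minor again through the nvme_cdev_rel() release callback.  See
 * nvme_add_ns_cdev() below for the per-namespace /dev/ngXnY nodes.
 */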
3535 int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
3536                 const struct file_operations *fops, struct module *owner)
3537 {
3538         int minor, ret;
3539
3540         minor = ida_alloc(&nvme_ns_chr_minor_ida, GFP_KERNEL);
3541         if (minor < 0)
3542                 return minor;
3543         cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor);
3544         cdev_device->class = nvme_ns_chr_class;
3545         cdev_device->release = nvme_cdev_rel;
3546         device_initialize(cdev_device);
3547         cdev_init(cdev, fops);
3548         cdev->owner = owner;
3549         ret = cdev_device_add(cdev, cdev_device);
3550         if (ret)
3551                 put_device(cdev_device);
3552
3553         return ret;
3554 }
3555
3556 static int nvme_ns_chr_open(struct inode *inode, struct file *file)
3557 {
3558         return nvme_ns_open(container_of(inode->i_cdev, struct nvme_ns, cdev));
3559 }
3560
3561 static int nvme_ns_chr_release(struct inode *inode, struct file *file)
3562 {
3563         nvme_ns_release(container_of(inode->i_cdev, struct nvme_ns, cdev));
3564         return 0;
3565 }
3566
3567 static const struct file_operations nvme_ns_chr_fops = {
3568         .owner          = THIS_MODULE,
3569         .open           = nvme_ns_chr_open,
3570         .release        = nvme_ns_chr_release,
3571         .unlocked_ioctl = nvme_ns_chr_ioctl,
3572         .compat_ioctl   = compat_ptr_ioctl,
3573 };
3574
3575 static int nvme_add_ns_cdev(struct nvme_ns *ns)
3576 {
3577         int ret;
3578
3579         ns->cdev_device.parent = ns->ctrl->device;
3580         ret = dev_set_name(&ns->cdev_device, "ng%dn%d",
3581                            ns->ctrl->instance, ns->head->instance);
3582         if (ret)
3583                 return ret;
3584
3585         return nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops,
3586                              ns->ctrl->ops->module);
3587 }
3588
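/*
 * Allocate and initialize a new namespace head: reserve an instance number,
 * set up SRCU and the cached identifiers, fetch the command effects log for
 * the namespace's command set and allocate the multipath disk, then link
 * the head into the subsystem's nsheads list.
 */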
3589 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
3590                 unsigned nsid, struct nvme_ns_ids *ids)
3591 {
3592         struct nvme_ns_head *head;
3593         size_t size = sizeof(*head);
3594         int ret = -ENOMEM;
3595
3596 #ifdef CONFIG_NVME_MULTIPATH
3597         size += num_possible_nodes() * sizeof(struct nvme_ns *);
3598 #endif
3599
3600         head = kzalloc(size, GFP_KERNEL);
3601         if (!head)
3602                 goto out;
3603         ret = ida_alloc_min(&ctrl->subsys->ns_ida, 1, GFP_KERNEL);
3604         if (ret < 0)
3605                 goto out_free_head;
3606         head->instance = ret;
3607         INIT_LIST_HEAD(&head->list);
3608         ret = init_srcu_struct(&head->srcu);
3609         if (ret)
3610                 goto out_ida_remove;
3611         head->subsys = ctrl->subsys;
3612         head->ns_id = nsid;
3613         head->ids = *ids;
3614         kref_init(&head->ref);
3615
3616         if (head->ids.csi) {
3617                 ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
3618                 if (ret)
3619                         goto out_cleanup_srcu;
3620         } else
3621                 head->effects = ctrl->effects;
3622
3623         ret = nvme_mpath_alloc_disk(ctrl, head);
3624         if (ret)
3625                 goto out_cleanup_srcu;
3626
3627         list_add_tail(&head->entry, &ctrl->subsys->nsheads);
3628
3629         kref_get(&ctrl->subsys->ref);
3630
3631         return head;
3632 out_cleanup_srcu:
3633         cleanup_srcu_struct(&head->srcu);
3634 out_ida_remove:
3635         ida_free(&ctrl->subsys->ns_ida, head->instance);
3636 out_free_head:
3637         kfree(head);
3638 out:
3639         if (ret > 0)
3640                 ret = blk_status_to_errno(nvme_error_status(ret));
3641         return ERR_PTR(ret);
3642 }
3643
3644 static int nvme_global_check_duplicate_ids(struct nvme_subsystem *this,
3645                 struct nvme_ns_ids *ids)
3646 {
3647         struct nvme_subsystem *s;
3648         int ret = 0;
3649
3650         /*
3651          * Note that this check is racy as we try to avoid holding the global
3652          * lock over the whole ns_head creation.  But it is only intended as
3653          * a sanity check anyway.
3654          */
3655         mutex_lock(&nvme_subsystems_lock);
3656         list_for_each_entry(s, &nvme_subsystems, entry) {
3657                 if (s == this)
3658                         continue;
3659                 mutex_lock(&s->lock);
3660                 ret = nvme_subsys_check_duplicate_ids(s, ids);
3661                 mutex_unlock(&s->lock);
3662                 if (ret)
3663                         break;
3664         }
3665         mutex_unlock(&nvme_subsystems_lock);
3666
3667         return ret;
3668 }
3669
3670 static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
3671                 struct nvme_ns_ids *ids, bool is_shared)
3672 {
3673         struct nvme_ctrl *ctrl = ns->ctrl;
3674         struct nvme_ns_head *head = NULL;
3675         int ret;
3676
3677         ret = nvme_global_check_duplicate_ids(ctrl->subsys, ids);
3678         if (ret) {
3679                 dev_err(ctrl->device,
3680                         "globally duplicate IDs for nsid %d\n", nsid);
3681                 return ret;
3682         }
3683
3684         mutex_lock(&ctrl->subsys->lock);
3685         head = nvme_find_ns_head(ctrl->subsys, nsid);
3686         if (!head) {
3687                 ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, ids);
3688                 if (ret) {
3689                         dev_err(ctrl->device,
3690                                 "duplicate IDs in subsystem for nsid %d\n",
3691                                 nsid);
3692                         goto out_unlock;
3693                 }
3694                 head = nvme_alloc_ns_head(ctrl, nsid, ids);
3695                 if (IS_ERR(head)) {
3696                         ret = PTR_ERR(head);
3697                         goto out_unlock;
3698                 }
3699                 head->shared = is_shared;
3700         } else {
3701                 ret = -EINVAL;
3702                 if (!is_shared || !head->shared) {
3703                         dev_err(ctrl->device,
3704                                 "Duplicate unshared namespace %d\n", nsid);
3705                         goto out_put_ns_head;
3706                 }
3707                 if (!nvme_ns_ids_equal(&head->ids, ids)) {
3708                         dev_err(ctrl->device,
3709                                 "IDs don't match for shared namespace %d\n",
3710                                         nsid);
3711                         goto out_put_ns_head;
3712                 }
3713
3714                 if (!multipath && !list_empty(&head->list)) {
3715                         dev_warn(ctrl->device,
3716                                 "Found shared namespace %d, but multipathing not supported.\n",
3717                                 nsid);
3718                         dev_warn_once(ctrl->device,
3719                                 "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n");
3720                 }
3721         }
3722
3723         list_add_tail_rcu(&ns->siblings, &head->list);
3724         ns->head = head;
3725         mutex_unlock(&ctrl->subsys->lock);
3726         return 0;
3727
3728 out_put_ns_head:
3729         nvme_put_ns_head(head);
3730 out_unlock:
3731         mutex_unlock(&ctrl->subsys->lock);
3732         return ret;
3733 }
3734
3735 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3736 {
3737         struct nvme_ns *ns, *ret = NULL;
3738
3739         down_read(&ctrl->namespaces_rwsem);
3740         list_for_each_entry(ns, &ctrl->namespaces, list) {
3741                 if (ns->head->ns_id == nsid) {
3742                         if (!nvme_get_ns(ns))
3743                                 continue;
3744                         ret = ns;
3745                         break;
3746                 }
3747                 if (ns->head->ns_id > nsid)
3748                         break;
3749         }
3750         up_read(&ctrl->namespaces_rwsem);
3751         return ret;
3752 }
3753 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
3754
3755 /*
3756  * Add the namespace to the controller list while keeping the list ordered.
3757  */
3758 static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
3759 {
3760         struct nvme_ns *tmp;
3761
3762         list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
3763                 if (tmp->head->ns_id < ns->head->ns_id) {
3764                         list_add(&ns->list, &tmp->list);
3765                         return;
3766                 }
3767         }
3768         list_add(&ns->list, &ns->ctrl->namespaces);
3769 }
3770
3771 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
3772                 struct nvme_ns_ids *ids)
3773 {
3774         struct nvme_ns *ns;
3775         struct gendisk *disk;
3776         struct nvme_id_ns *id;
3777         int node = ctrl->numa_node;
3778
3779         if (nvme_identify_ns(ctrl, nsid, ids, &id))
3780                 return;
3781
3782         ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
3783         if (!ns)
3784                 goto out_free_id;
3785
3786         disk = blk_mq_alloc_disk(ctrl->tagset, ns);
3787         if (IS_ERR(disk))
3788                 goto out_free_ns;
3789         disk->fops = &nvme_bdev_ops;
3790         disk->private_data = ns;
3791
3792         ns->disk = disk;
3793         ns->queue = disk->queue;
3794
3795         if (ctrl->opts && ctrl->opts->data_digest)
3796                 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
3797
3798         blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
3799         if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
3800                 blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
3801
3802         ns->ctrl = ctrl;
3803         kref_init(&ns->kref);
3804
3805         if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED))
3806                 goto out_cleanup_disk;
3807
3808         /*
3809          * If multipathing is enabled, the device name for all disks and not
3810          * just those that represent shared namespaces needs to be based on the
3811          * subsystem instance.  Using the controller instance for private
3812          * namespaces could lead to naming collisions between shared and private
3813          * namespaces if they don't use a common numbering scheme.
3814          *
3815          * If multipathing is not enabled, disk names must use the controller
3816          * instance as shared namespaces will show up as multiple block
3817          * devices.
3818          */
3819         if (ns->head->disk) {
3820                 sprintf(disk->disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
3821                         ctrl->instance, ns->head->instance);
3822                 disk->flags |= GENHD_FL_HIDDEN;
3823         } else if (multipath) {
3824                 sprintf(disk->disk_name, "nvme%dn%d", ctrl->subsys->instance,
3825                         ns->head->instance);
3826         } else {
3827                 sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance,
3828                         ns->head->instance);
3829         }
3830
3831         if (nvme_update_ns_info(ns, id))
3832                 goto out_unlink_ns;
3833
3834         down_write(&ctrl->namespaces_rwsem);
3835         nvme_ns_add_to_ctrl_list(ns);
3836         up_write(&ctrl->namespaces_rwsem);
3837         nvme_get_ctrl(ctrl);
3838
3839         if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups))
3840                 goto out_cleanup_ns_from_list;
3841
3842         if (!nvme_ns_head_multipath(ns->head))
3843                 nvme_add_ns_cdev(ns);
3844
3845         nvme_mpath_add_disk(ns, id);
3846         nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
3847         kfree(id);
3848
3849         return;
3850
3851  out_cleanup_ns_from_list:
3852         nvme_put_ctrl(ctrl);
3853         down_write(&ctrl->namespaces_rwsem);
3854         list_del_init(&ns->list);
3855         up_write(&ctrl->namespaces_rwsem);
3856  out_unlink_ns:
3857         mutex_lock(&ctrl->subsys->lock);
3858         list_del_rcu(&ns->siblings);
3859         if (list_empty(&ns->head->list))
3860                 list_del_init(&ns->head->entry);
3861         mutex_unlock(&ctrl->subsys->lock);
3862         nvme_put_ns_head(ns->head);
3863  out_cleanup_disk:
3864         blk_cleanup_disk(disk);
3865  out_free_ns:
3866         kfree(ns);
3867  out_free_id:
3868         kfree(id);
3869 }
3870
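/*
 * Tear down a namespace: unlink it from its head and the controller's
 * namespace list, wait for in-flight submissions to drain, and remove the
 * block and character devices before dropping the namespace reference.
 */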
3871 static void nvme_ns_remove(struct nvme_ns *ns)
3872 {
3873         bool last_path = false;
3874
3875         if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
3876                 return;
3877
3878         clear_bit(NVME_NS_READY, &ns->flags);
3879         set_capacity(ns->disk, 0);
3880         nvme_fault_inject_fini(&ns->fault_inject);
3881
3882         mutex_lock(&ns->ctrl->subsys->lock);
3883         list_del_rcu(&ns->siblings);
3884         if (list_empty(&ns->head->list)) {
3885                 list_del_init(&ns->head->entry);
3886                 last_path = true;
3887         }
3888         mutex_unlock(&ns->ctrl->subsys->lock);
3889
3890         /* guarantee the namespace is no longer visible in head->list */
3891         synchronize_rcu();
3892
3893         /* wait for concurrent submissions */
3894         if (nvme_mpath_clear_current_path(ns))
3895                 synchronize_srcu(&ns->head->srcu);
3896
3897         if (!nvme_ns_head_multipath(ns->head))
3898                 nvme_cdev_del(&ns->cdev, &ns->cdev_device);
3899         del_gendisk(ns->disk);
3900         blk_cleanup_queue(ns->queue);
3901
3902         down_write(&ns->ctrl->namespaces_rwsem);
3903         list_del_init(&ns->list);
3904         up_write(&ns->ctrl->namespaces_rwsem);
3905
3906         if (last_path)
3907                 nvme_mpath_shutdown_disk(ns->head);
3908         nvme_put_ns(ns);
3909 }
3910
3911 static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
3912 {
3913         struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);
3914
3915         if (ns) {
3916                 nvme_ns_remove(ns);
3917                 nvme_put_ns(ns);
3918         }
3919 }
3920
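/*
 * Re-read the Identify Namespace data for an existing namespace and update
 * the cached information.  The namespace is removed if its identifiers
 * changed or the controller returned a fatal (DNR) error.
 */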
3921 static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
3922 {
3923         struct nvme_id_ns *id;
3924         int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
3925
3926         if (test_bit(NVME_NS_DEAD, &ns->flags))
3927                 goto out;
3928
3929         ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id);
3930         if (ret)
3931                 goto out;
3932
3933         ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
3934         if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
3935                 dev_err(ns->ctrl->device,
3936                         "identifiers changed for nsid %d\n", ns->head->ns_id);
3937                 goto out_free_id;
3938         }
3939
3940         ret = nvme_update_ns_info(ns, id);
3941
3942 out_free_id:
3943         kfree(id);
3944 out:
3945         /*
3946          * Only remove the namespace if we got a fatal error back from the
3947          * device, otherwise ignore the error and just move on.
3948          *
3949          * TODO: we should probably schedule a delayed retry here.
3950          */
3951         if (ret > 0 && (ret & NVME_SC_DNR))
3952                 nvme_ns_remove(ns);
3953 }
3954
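/*
 * Handle a single NSID found during scanning: revalidate the namespace if
 * it already exists, otherwise allocate a new one for the NVM or Zoned
 * command sets.
 */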
3955 static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3956 {
3957         struct nvme_ns_ids ids = { };
3958         struct nvme_ns *ns;
3959
3960         if (nvme_identify_ns_descs(ctrl, nsid, &ids))
3961                 return;
3962
3963         ns = nvme_find_get_ns(ctrl, nsid);
3964         if (ns) {
3965                 nvme_validate_ns(ns, &ids);
3966                 nvme_put_ns(ns);
3967                 return;
3968         }
3969
3970         switch (ids.csi) {
3971         case NVME_CSI_NVM:
3972                 nvme_alloc_ns(ctrl, nsid, &ids);
3973                 break;
3974         case NVME_CSI_ZNS:
3975                 if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
3976                         dev_warn(ctrl->device,
3977                                 "nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
3978                                 nsid);
3979                         break;
3980                 }
3981                 if (!nvme_multi_css(ctrl)) {
3982                         dev_warn(ctrl->device,
3983                                 "command set not reported for nsid: %d\n",
3984                                 nsid);
3985                         break;
3986                 }
3987                 nvme_alloc_ns(ctrl, nsid, &ids);
3988                 break;
3989         default:
3990                 dev_warn(ctrl->device, "unknown csi %u for nsid %u\n",
3991                         ids.csi, nsid);
3992                 break;
3993         }
3994 }
3995
3996 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
3997                                         unsigned nsid)
3998 {
3999         struct nvme_ns *ns, *next;
4000         LIST_HEAD(rm_list);
4001
4002         down_write(&ctrl->namespaces_rwsem);
4003         list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
4004                 if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags))
4005                         list_move_tail(&ns->list, &rm_list);
4006         }
4007         up_write(&ctrl->namespaces_rwsem);
4008
4009         list_for_each_entry_safe(ns, next, &rm_list, list)
4010                 nvme_ns_remove(ns);
4011
4012 }
4013
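/*
 * Scan namespaces using the Identify Active Namespace ID list, walking the
 * list in chunks of 1024 entries and removing any namespaces that fall into
 * the gaps between reported NSIDs.  Controllers that cannot report the list
 * (nvme_ctrl_limited_cns()) return -EOPNOTSUPP so that the caller falls back
 * to a sequential scan.
 */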
4014 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
4015 {
4016         const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
4017         __le32 *ns_list;
4018         u32 prev = 0;
4019         int ret = 0, i;
4020
4021         if (nvme_ctrl_limited_cns(ctrl))
4022                 return -EOPNOTSUPP;
4023
4024         ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
4025         if (!ns_list)
4026                 return -ENOMEM;
4027
4028         for (;;) {
4029                 struct nvme_command cmd = {
4030                         .identify.opcode        = nvme_admin_identify,
4031                         .identify.cns           = NVME_ID_CNS_NS_ACTIVE_LIST,
4032                         .identify.nsid          = cpu_to_le32(prev),
4033                 };
4034
4035                 ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
4036                                             NVME_IDENTIFY_DATA_SIZE);
4037                 if (ret) {
4038                         dev_warn(ctrl->device,
4039                                 "Identify NS List failed (status=0x%x)\n", ret);
4040                         goto free;
4041                 }
4042
4043                 for (i = 0; i < nr_entries; i++) {
4044                         u32 nsid = le32_to_cpu(ns_list[i]);
4045
4046                         if (!nsid)      /* end of the list? */
4047                                 goto out;
4048                         nvme_validate_or_alloc_ns(ctrl, nsid);
4049                         while (++prev < nsid)
4050                                 nvme_ns_remove_by_nsid(ctrl, prev);
4051                 }
4052         }
4053  out:
4054         nvme_remove_invalid_namespaces(ctrl, prev);
4055  free:
4056         kfree(ns_list);
4057         return ret;
4058 }
4059
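/*
 * Fallback scan for controllers that cannot report an active namespace ID
 * list: probe every NSID from 1 up to the number of namespaces reported in
 * Identify Controller.
 */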
4060 static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
4061 {
4062         struct nvme_id_ctrl *id;
4063         u32 nn, i;
4064
4065         if (nvme_identify_ctrl(ctrl, &id))
4066                 return;
4067         nn = le32_to_cpu(id->nn);
4068         kfree(id);
4069
4070         for (i = 1; i <= nn; i++)
4071                 nvme_validate_or_alloc_ns(ctrl, i);
4072
4073         nvme_remove_invalid_namespaces(ctrl, nn);
4074 }
4075
4076 static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
4077 {
4078         size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
4079         __le32 *log;
4080         int error;
4081
4082         log = kzalloc(log_size, GFP_KERNEL);
4083         if (!log)
4084                 return;
4085
4086         /*
4087          * We need to read the log to clear the AEN, but we don't want to rely
4088          * on it for the changed namespace information as userspace could have
4089          * raced with us in reading the log page, which could cause us to miss
4090          * updates.
4091          */
4092         error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
4093                         NVME_CSI_NVM, log, log_size, 0);
4094         if (error)
4095                 dev_warn(ctrl->device,
4096                         "reading changed ns log failed: %d\n", error);
4097
4098         kfree(log);
4099 }
4100
4101 static void nvme_scan_work(struct work_struct *work)
4102 {
4103         struct nvme_ctrl *ctrl =
4104                 container_of(work, struct nvme_ctrl, scan_work);
4105
4106         /* No tagset on a live ctrl means IO queues could not be created */
4107         if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
4108                 return;
4109
4110         if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
4111                 dev_info(ctrl->device, "rescanning namespaces.\n");
4112                 nvme_clear_changed_ns_log(ctrl);
4113         }
4114
4115         mutex_lock(&ctrl->scan_lock);
4116         if (nvme_scan_ns_list(ctrl) != 0)
4117                 nvme_scan_ns_sequential(ctrl);
4118         mutex_unlock(&ctrl->scan_lock);
4119 }
4120
4121 /*
4122  * This function iterates the namespace list unlocked to allow recovery from
4123  * controller failure. It is up to the caller to ensure the namespace list is
4124  * not modified by scan work while this function is executing.
4125  */
4126 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
4127 {
4128         struct nvme_ns *ns, *next;
4129         LIST_HEAD(ns_list);
4130
4131         /*
4132          * Make sure to requeue I/O to all namespaces: these requests may
4133          * originate from the scan itself and must complete for scan_work
4134          * to make progress.
4135          */
4136         nvme_mpath_clear_ctrl_paths(ctrl);
4137
4138         /* prevent racing with ns scanning */
4139         flush_work(&ctrl->scan_work);
4140
4141         /*
4142          * The dead state indicates the controller was not gracefully
4143          * disconnected. In that case, we won't be able to flush any data while
4144          * removing the namespaces' disks; fail all the queues now to avoid
4145          * potentially having to clean up the failed sync later.
4146          */
4147         if (ctrl->state == NVME_CTRL_DEAD)
4148                 nvme_kill_queues(ctrl);
4149
4150         /* this is a no-op when called from the controller reset handler */
4151         nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
4152
4153         down_write(&ctrl->namespaces_rwsem);
4154         list_splice_init(&ctrl->namespaces, &ns_list);
4155         up_write(&ctrl->namespaces_rwsem);
4156
4157         list_for_each_entry_safe(ns, next, &ns_list, list)
4158                 nvme_ns_remove(ns);
4159 }
4160 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
4161
4162 static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env)
4163 {
4164         struct nvme_ctrl *ctrl =
4165                 container_of(dev, struct nvme_ctrl, ctrl_device);
4166         struct nvmf_ctrl_options *opts = ctrl->opts;
4167         int ret;
4168
4169         ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
4170         if (ret)
4171                 return ret;
4172
4173         if (opts) {
4174                 ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
4175                 if (ret)
4176                         return ret;
4177
4178                 ret = add_uevent_var(env, "NVME_TRSVCID=%s",
4179                                 opts->trsvcid ?: "none");
4180                 if (ret)
4181                         return ret;
4182
4183                 ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
4184                                 opts->host_traddr ?: "none");
4185                 if (ret)
4186                         return ret;
4187
4188                 ret = add_uevent_var(env, "NVME_HOST_IFACE=%s",
4189                                 opts->host_iface ?: "none");
4190         }
4191         return ret;
4192 }
4193
4194 static void nvme_change_uevent(struct nvme_ctrl *ctrl, char *envdata)
4195 {
4196         char *envp[2] = { envdata, NULL };
4197
4198         kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
4199 }
4200
4201 static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
4202 {
4203         char *envp[2] = { NULL, NULL };
4204         u32 aen_result = ctrl->aen_result;
4205
4206         ctrl->aen_result = 0;
4207         if (!aen_result)
4208                 return;
4209
4210         envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
4211         if (!envp[0])
4212                 return;
4213         kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
4214         kfree(envp[0]);
4215 }
4216
4217 static void nvme_async_event_work(struct work_struct *work)
4218 {
4219         struct nvme_ctrl *ctrl =
4220                 container_of(work, struct nvme_ctrl, async_event_work);
4221
4222         nvme_aen_uevent(ctrl);
4223
4224         /*
4225          * The transport drivers must guarantee AER submission here is safe by
4226          * flushing ctrl async_event_work after changing the controller state
4227          * from LIVE and before freeing the admin queue.
4228          */
4229         if (ctrl->state == NVME_CTRL_LIVE)
4230                 ctrl->ops->submit_async_event(ctrl);
4231 }
4232
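/*
 * Check whether the controller is enabled and reports the Processing Paused
 * (PP) bit in CSTS, which indicates that a firmware activation is still in
 * progress.
 */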
4233 static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
4234 {
4235
4236         u32 csts;
4237
4238         if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
4239                 return false;
4240
4241         if (csts == ~0)
4242                 return false;
4243
4244         return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
4245 }
4246
4247 static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
4248 {
4249         struct nvme_fw_slot_info_log *log;
4250
4251         log = kmalloc(sizeof(*log), GFP_KERNEL);
4252         if (!log)
4253                 return;
4254
4255         if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
4256                         log, sizeof(*log), 0))
4257                 dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
4258         kfree(log);
4259 }
4260
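/*
 * Firmware activation handler: quiesce the I/O queues and wait for the
 * controller to clear the Processing Paused state, bounded by MTFA (or the
 * admin timeout if none is reported), then restart the queues and read the
 * firmware slot log to clear the AEN.  If activation does not complete in
 * time, a controller reset is scheduled instead.
 */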
4261 static void nvme_fw_act_work(struct work_struct *work)
4262 {
4263         struct nvme_ctrl *ctrl = container_of(work,
4264                                 struct nvme_ctrl, fw_act_work);
4265         unsigned long fw_act_timeout;
4266
4267         if (ctrl->mtfa)
4268                 fw_act_timeout = jiffies +
4269                                 msecs_to_jiffies(ctrl->mtfa * 100);
4270         else
4271                 fw_act_timeout = jiffies +
4272                                 msecs_to_jiffies(admin_timeout * 1000);
4273
4274         nvme_stop_queues(ctrl);
4275         while (nvme_ctrl_pp_status(ctrl)) {
4276                 if (time_after(jiffies, fw_act_timeout)) {
4277                         dev_warn(ctrl->device,
4278                                 "Fw activation timeout, reset controller\n");
4279                         nvme_try_sched_reset(ctrl);
4280                         return;
4281                 }
4282                 msleep(100);
4283         }
4284
4285         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
4286                 return;
4287
4288         nvme_start_queues(ctrl);
4289         /* read FW slot information to clear the AER */
4290         nvme_get_fw_slot_info(ctrl);
4291 }
4292
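/*
 * Dispatch an Asynchronous Event notice: trigger a namespace rescan,
 * firmware activation handling, ANA log processing, or record a discovery
 * log change so it is reported to userspace via a uevent, depending on the
 * notice type.
 */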
4293 static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
4294 {
4295         u32 aer_notice_type = (result & 0xff00) >> 8;
4296
4297         trace_nvme_async_event(ctrl, aer_notice_type);
4298
4299         switch (aer_notice_type) {
4300         case NVME_AER_NOTICE_NS_CHANGED:
4301                 set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
4302                 nvme_queue_scan(ctrl);
4303                 break;
4304         case NVME_AER_NOTICE_FW_ACT_STARTING:
4305                 /*
4306                  * We are (ab)using the RESETTING state to prevent subsequent
4307                  * recovery actions from interfering with the controller's
4308                  * firmware activation.
4309                  */
4310                 if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
4311                         queue_work(nvme_wq, &ctrl->fw_act_work);
4312                 break;
4313 #ifdef CONFIG_NVME_MULTIPATH
4314         case NVME_AER_NOTICE_ANA:
4315                 if (!ctrl->ana_log_buf)
4316                         break;
4317                 queue_work(nvme_wq, &ctrl->ana_work);
4318                 break;
4319 #endif
4320         case NVME_AER_NOTICE_DISC_CHANGED:
4321                 ctrl->aen_result = result;
4322                 break;
4323         default:
4324                 dev_warn(ctrl->device, "async event result %08x\n", result);
4325         }
4326 }
4327
4328 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
4329                 volatile union nvme_result *res)
4330 {
4331         u32 result = le32_to_cpu(res->u32);
4332         u32 aer_type = result & 0x07;
4333
4334         if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
4335                 return;
4336
4337         switch (aer_type) {
4338         case NVME_AER_NOTICE:
4339                 nvme_handle_aen_notice(ctrl, result);
4340                 break;
4341         case NVME_AER_ERROR:
4342         case NVME_AER_SMART:
4343         case NVME_AER_CSS:
4344         case NVME_AER_VS:
4345                 trace_nvme_async_event(ctrl, aer_type);
4346                 ctrl->aen_result = result;
4347                 break;
4348         default:
4349                 break;
4350         }
4351         queue_work(nvme_wq, &ctrl->async_event_work);
4352 }
4353 EXPORT_SYMBOL_GPL(nvme_complete_async_event);
4354
4355 void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
4356 {
4357         nvme_mpath_stop(ctrl);
4358         nvme_stop_keep_alive(ctrl);
4359         nvme_stop_failfast_work(ctrl);
4360         flush_work(&ctrl->async_event_work);
4361         cancel_work_sync(&ctrl->fw_act_work);
4362 }
4363 EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
4364
4365 void nvme_start_ctrl(struct nvme_ctrl *ctrl)
4366 {
4367         nvme_start_keep_alive(ctrl);
4368
4369         nvme_enable_aen(ctrl);
4370
4371         if (ctrl->queue_count > 1) {
4372                 nvme_queue_scan(ctrl);
4373                 nvme_start_queues(ctrl);
4374         }
4375
4376         nvme_change_uevent(ctrl, "NVME_EVENT=connected");
4377 }
4378 EXPORT_SYMBOL_GPL(nvme_start_ctrl);
4379
4380 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
4381 {
4382         nvme_hwmon_exit(ctrl);
4383         nvme_fault_inject_fini(&ctrl->fault_inject);
4384         dev_pm_qos_hide_latency_tolerance(ctrl->device);
4385         cdev_device_del(&ctrl->cdev, ctrl->device);
4386         nvme_put_ctrl(ctrl);
4387 }
4388 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
4389
4390 static void nvme_free_cels(struct nvme_ctrl *ctrl)
4391 {
4392         struct nvme_effects_log *cel;
4393         unsigned long i;
4394
4395         xa_for_each(&ctrl->cels, i, cel) {
4396                 xa_erase(&ctrl->cels, i);
4397                 kfree(cel);
4398         }
4399
4400         xa_destroy(&ctrl->cels);
4401 }
4402
4403 static void nvme_free_ctrl(struct device *dev)
4404 {
4405         struct nvme_ctrl *ctrl =
4406                 container_of(dev, struct nvme_ctrl, ctrl_device);
4407         struct nvme_subsystem *subsys = ctrl->subsys;
4408
4409         if (!subsys || ctrl->instance != subsys->instance)
4410                 ida_free(&nvme_instance_ida, ctrl->instance);
4411
4412         nvme_free_cels(ctrl);
4413         nvme_mpath_uninit(ctrl);
4414         __free_page(ctrl->discard_page);
4415
4416         if (subsys) {
4417                 mutex_lock(&nvme_subsystems_lock);
4418                 list_del(&ctrl->subsys_entry);
4419                 sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
4420                 mutex_unlock(&nvme_subsystems_lock);
4421         }
4422
4423         ctrl->ops->free_ctrl(ctrl);
4424
4425         if (subsys)
4426                 nvme_put_subsystem(subsys);
4427 }
4428
4429 /*
4430  * Initialize an NVMe controller structure.  This needs to be called during
4431  * the earliest initialization so that we have the initialized structure
4432  * around during probing.
4433  */
4434 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
4435                 const struct nvme_ctrl_ops *ops, unsigned long quirks)
4436 {
4437         int ret;
4438
4439         ctrl->state = NVME_CTRL_NEW;
4440         clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
4441         spin_lock_init(&ctrl->lock);
4442         mutex_init(&ctrl->scan_lock);
4443         INIT_LIST_HEAD(&ctrl->namespaces);
4444         xa_init(&ctrl->cels);
4445         init_rwsem(&ctrl->namespaces_rwsem);
4446         ctrl->dev = dev;
4447         ctrl->ops = ops;
4448         ctrl->quirks = quirks;
4449         ctrl->numa_node = NUMA_NO_NODE;
4450         INIT_WORK(&ctrl->scan_work, nvme_scan_work);
4451         INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
4452         INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
4453         INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
4454         init_waitqueue_head(&ctrl->state_wq);
4455
4456         INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
4457         INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
4458         memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
4459         ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
4460
4461         BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
4462                         PAGE_SIZE);
4463         ctrl->discard_page = alloc_page(GFP_KERNEL);
4464         if (!ctrl->discard_page) {
4465                 ret = -ENOMEM;
4466                 goto out;
4467         }
4468
4469         ret = ida_alloc(&nvme_instance_ida, GFP_KERNEL);
4470         if (ret < 0)
4471                 goto out;
4472         ctrl->instance = ret;
4473
4474         device_initialize(&ctrl->ctrl_device);
4475         ctrl->device = &ctrl->ctrl_device;
4476         ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
4477                         ctrl->instance);
4478         ctrl->device->class = nvme_class;
4479         ctrl->device->parent = ctrl->dev;
4480         ctrl->device->groups = nvme_dev_attr_groups;
4481         ctrl->device->release = nvme_free_ctrl;
4482         dev_set_drvdata(ctrl->device, ctrl);
4483         ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
4484         if (ret)
4485                 goto out_release_instance;
4486
4487         nvme_get_ctrl(ctrl);
4488         cdev_init(&ctrl->cdev, &nvme_dev_fops);
4489         ctrl->cdev.owner = ops->module;
4490         ret = cdev_device_add(&ctrl->cdev, ctrl->device);
4491         if (ret)
4492                 goto out_free_name;
4493
4494         /*
4495          * Initialize latency tolerance controls.  The sysfs files won't
4496          * be visible to userspace unless the device actually supports APST.
4497          */
4498         ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
4499         dev_pm_qos_update_user_latency_tolerance(ctrl->device,
4500                 min(default_ps_max_latency_us, (unsigned long)S32_MAX));
4501
4502         nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
4503         nvme_mpath_init_ctrl(ctrl);
4504
4505         return 0;
4506 out_free_name:
4507         nvme_put_ctrl(ctrl);
4508         kfree_const(ctrl->device->kobj.name);
4509 out_release_instance:
4510         ida_free(&nvme_instance_ida, ctrl->instance);
4511 out:
4512         if (ctrl->discard_page)
4513                 __free_page(ctrl->discard_page);
4514         return ret;
4515 }
4516 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
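
/*
 * Usage sketch: a transport embeds struct nvme_ctrl in its private controller
 * structure and calls nvme_init_ctrl() very early in probe, before any other
 * nvme_* helper is used; controller removal later goes through
 * nvme_uninit_ctrl().  The structure and ops table named example_* are
 * hypothetical:
 *
 *	struct example_ctrl {
 *		struct nvme_ctrl ctrl;
 *		// transport specific state ...
 *	};
 *
 *	static int example_probe(struct device *dev)
 *	{
 *		struct example_ctrl *ectrl;
 *		int ret;
 *
 *		ectrl = kzalloc(sizeof(*ectrl), GFP_KERNEL);
 *		if (!ectrl)
 *			return -ENOMEM;
 *
 *		ret = nvme_init_ctrl(&ectrl->ctrl, dev, &example_ctrl_ops, 0);
 *		if (ret)
 *			kfree(ectrl);
 *		return ret;
 *	}
 */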
4517
4518 static void nvme_start_ns_queue(struct nvme_ns *ns)
4519 {
4520         if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags))
4521                 blk_mq_unquiesce_queue(ns->queue);
4522 }
4523
4524 static void nvme_stop_ns_queue(struct nvme_ns *ns)
4525 {
4526         if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
4527                 blk_mq_quiesce_queue(ns->queue);
4528         else
4529                 blk_mq_wait_quiesce_done(ns->queue);
4530 }
4531
4532 /*
4533  * Prepare a queue for teardown.
4534  *
4535  * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
4536  * the capacity to 0 after that to avoid blocking dispatchers that may be
4537  * holding bd_mutex.  This also stops buffered writers from dirtying pages that
4538  * can't be synced.
4539  */
4540 static void nvme_set_queue_dying(struct nvme_ns *ns)
4541 {
4542         if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
4543                 return;
4544
4545         blk_mark_disk_dead(ns->disk);
4546         nvme_start_ns_queue(ns);
4547
4548         set_capacity_and_notify(ns->disk, 0);
4549 }
4550
4551 /**
4552  * nvme_kill_queues - Ends all namespace queues
4553  * @ctrl: the dead controller whose namespace queues need to be ended
4554  *
4555  * Call this function when the driver determines it is unable to get the
4556  * controller into a state capable of servicing I/O.
4557  */
4558 void nvme_kill_queues(struct nvme_ctrl *ctrl)
4559 {
4560         struct nvme_ns *ns;
4561
4562         down_read(&ctrl->namespaces_rwsem);
4563
4564         /* Forcibly unquiesce queues to avoid blocking dispatch */
4565         if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
4566                 nvme_start_admin_queue(ctrl);
4567
4568         list_for_each_entry(ns, &ctrl->namespaces, list)
4569                 nvme_set_queue_dying(ns);
4570
4571         up_read(&ctrl->namespaces_rwsem);
4572 }
4573 EXPORT_SYMBOL_GPL(nvme_kill_queues);
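
/*
 * Usage sketch: once a transport concludes the controller is dead and cannot
 * be recovered, the usual pattern is to end all queues and then remove the
 * namespaces so that pending and future I/O fails quickly instead of hanging.
 * The wrapper name is hypothetical; error handling is elided:
 *
 *	static void example_remove_dead_ctrl(struct nvme_ctrl *ctrl)
 *	{
 *		nvme_kill_queues(ctrl);
 *		nvme_remove_namespaces(ctrl);
 *	}
 */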
4574
4575 void nvme_unfreeze(struct nvme_ctrl *ctrl)
4576 {
4577         struct nvme_ns *ns;
4578
4579         down_read(&ctrl->namespaces_rwsem);
4580         list_for_each_entry(ns, &ctrl->namespaces, list)
4581                 blk_mq_unfreeze_queue(ns->queue);
4582         up_read(&ctrl->namespaces_rwsem);
4583 }
4584 EXPORT_SYMBOL_GPL(nvme_unfreeze);
4585
4586 int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
4587 {
4588         struct nvme_ns *ns;
4589
4590         down_read(&ctrl->namespaces_rwsem);
4591         list_for_each_entry(ns, &ctrl->namespaces, list) {
4592                 timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
4593                 if (timeout <= 0)
4594                         break;
4595         }
4596         up_read(&ctrl->namespaces_rwsem);
4597         return timeout;
4598 }
4599 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
4600
4601 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
4602 {
4603         struct nvme_ns *ns;
4604
4605         down_read(&ctrl->namespaces_rwsem);
4606         list_for_each_entry(ns, &ctrl->namespaces, list)
4607                 blk_mq_freeze_queue_wait(ns->queue);
4608         up_read(&ctrl->namespaces_rwsem);
4609 }
4610 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
4611
4612 void nvme_start_freeze(struct nvme_ctrl *ctrl)
4613 {
4614         struct nvme_ns *ns;
4615
4616         down_read(&ctrl->namespaces_rwsem);
4617         list_for_each_entry(ns, &ctrl->namespaces, list)
4618                 blk_freeze_queue_start(ns->queue);
4619         up_read(&ctrl->namespaces_rwsem);
4620 }
4621 EXPORT_SYMBOL_GPL(nvme_start_freeze);
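
/*
 * Usage sketch: the freeze helpers above are normally used as a pair around a
 * transport reset so that in-flight requests drain before the queue or tag
 * set geometry changes.  The example_* helpers are hypothetical:
 *
 *	static void example_reset_io_queues(struct nvme_ctrl *ctrl)
 *	{
 *		nvme_start_freeze(ctrl);	// block new requests from entering
 *		example_teardown_io_queues(ctrl);
 *		example_setup_io_queues(ctrl);
 *		nvme_wait_freeze(ctrl);		// or nvme_wait_freeze_timeout()
 *		// update tag set / queue limits here if needed
 *		nvme_unfreeze(ctrl);		// resume normal I/O
 *	}
 */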
4622
4623 void nvme_stop_queues(struct nvme_ctrl *ctrl)
4624 {
4625         struct nvme_ns *ns;
4626
4627         down_read(&ctrl->namespaces_rwsem);
4628         list_for_each_entry(ns, &ctrl->namespaces, list)
4629                 nvme_stop_ns_queue(ns);
4630         up_read(&ctrl->namespaces_rwsem);
4631 }
4632 EXPORT_SYMBOL_GPL(nvme_stop_queues);
4633
4634 void nvme_start_queues(struct nvme_ctrl *ctrl)
4635 {
4636         struct nvme_ns *ns;
4637
4638         down_read(&ctrl->namespaces_rwsem);
4639         list_for_each_entry(ns, &ctrl->namespaces, list)
4640                 nvme_start_ns_queue(ns);
4641         up_read(&ctrl->namespaces_rwsem);
4642 }
4643 EXPORT_SYMBOL_GPL(nvme_start_queues);
4644
4645 void nvme_stop_admin_queue(struct nvme_ctrl *ctrl)
4646 {
4647         if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
4648                 blk_mq_quiesce_queue(ctrl->admin_q);
4649         else
4650                 blk_mq_wait_quiesce_done(ctrl->admin_q);
4651 }
4652 EXPORT_SYMBOL_GPL(nvme_stop_admin_queue);
4653
4654 void nvme_start_admin_queue(struct nvme_ctrl *ctrl)
4655 {
4656         if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
4657                 blk_mq_unquiesce_queue(ctrl->admin_q);
4658 }
4659 EXPORT_SYMBOL_GPL(nvme_start_admin_queue);
4660
4661 void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
4662 {
4663         struct nvme_ns *ns;
4664
4665         down_read(&ctrl->namespaces_rwsem);
4666         list_for_each_entry(ns, &ctrl->namespaces, list)
4667                 blk_sync_queue(ns->queue);
4668         up_read(&ctrl->namespaces_rwsem);
4669 }
4670 EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
4671
4672 void nvme_sync_queues(struct nvme_ctrl *ctrl)
4673 {
4674         nvme_sync_io_queues(ctrl);
4675         if (ctrl->admin_q)
4676                 blk_sync_queue(ctrl->admin_q);
4677 }
4678 EXPORT_SYMBOL_GPL(nvme_sync_queues);
4679
4680 struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
4681 {
4682         if (file->f_op != &nvme_dev_fops)
4683                 return NULL;
4684         return file->private_data;
4685 }
4686 EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);
4687
4688 /*
4689  * Check we didn't inadvertently grow the command structure sizes:
4690  */
4691 static inline void _nvme_check_size(void)
4692 {
4693         BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
4694         BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
4695         BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
4696         BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
4697         BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
4698         BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
4699         BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
4700         BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
4701         BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
4702         BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
4703         BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
4704         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
4705         BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
4706         BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
4707         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
4708         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE);
4709         BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
4710         BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
4711         BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
4712         BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
4713 }
4714
4716 static int __init nvme_core_init(void)
4717 {
4718         int result = -ENOMEM;
4719
4720         _nvme_check_size();
4721
4722         nvme_wq = alloc_workqueue("nvme-wq",
4723                         WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
4724         if (!nvme_wq)
4725                 goto out;
4726
4727         nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
4728                         WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
4729         if (!nvme_reset_wq)
4730                 goto destroy_wq;
4731
4732         nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
4733                         WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
4734         if (!nvme_delete_wq)
4735                 goto destroy_reset_wq;
4736
4737         result = alloc_chrdev_region(&nvme_ctrl_base_chr_devt, 0,
4738                         NVME_MINORS, "nvme");
4739         if (result < 0)
4740                 goto destroy_delete_wq;
4741
4742         nvme_class = class_create(THIS_MODULE, "nvme");
4743         if (IS_ERR(nvme_class)) {
4744                 result = PTR_ERR(nvme_class);
4745                 goto unregister_chrdev;
4746         }
4747         nvme_class->dev_uevent = nvme_class_uevent;
4748
4749         nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
4750         if (IS_ERR(nvme_subsys_class)) {
4751                 result = PTR_ERR(nvme_subsys_class);
4752                 goto destroy_class;
4753         }
4754
4755         result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS,
4756                                      "nvme-generic");
4757         if (result < 0)
4758                 goto destroy_subsys_class;
4759
4760         nvme_ns_chr_class = class_create(THIS_MODULE, "nvme-generic");
4761         if (IS_ERR(nvme_ns_chr_class)) {
4762                 result = PTR_ERR(nvme_ns_chr_class);
4763                 goto unregister_generic_ns;
4764         }
4765
4766         return 0;
4767
4768 unregister_generic_ns:
4769         unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
4770 destroy_subsys_class:
4771         class_destroy(nvme_subsys_class);
4772 destroy_class:
4773         class_destroy(nvme_class);
4774 unregister_chrdev:
4775         unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
4776 destroy_delete_wq:
4777         destroy_workqueue(nvme_delete_wq);
4778 destroy_reset_wq:
4779         destroy_workqueue(nvme_reset_wq);
4780 destroy_wq:
4781         destroy_workqueue(nvme_wq);
4782 out:
4783         return result;
4784 }
4785
4786 static void __exit nvme_core_exit(void)
4787 {
4788         class_destroy(nvme_ns_chr_class);
4789         class_destroy(nvme_subsys_class);
4790         class_destroy(nvme_class);
4791         unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
4792         unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
4793         destroy_workqueue(nvme_delete_wq);
4794         destroy_workqueue(nvme_reset_wq);
4795         destroy_workqueue(nvme_wq);
4796         ida_destroy(&nvme_ns_chr_minor_ida);
4797         ida_destroy(&nvme_instance_ida);
4798 }
4799
4800 MODULE_LICENSE("GPL");
4801 MODULE_VERSION("1.0");
4802 module_init(nvme_core_init);
4803 module_exit(nvme_core_exit);