1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVM Express device driver
4  * Copyright (c) 2011-2014, Intel Corporation.
5  */
6
7 #include <linux/blkdev.h>
8 #include <linux/blk-mq.h>
9 #include <linux/blk-integrity.h>
10 #include <linux/compat.h>
11 #include <linux/delay.h>
12 #include <linux/errno.h>
13 #include <linux/hdreg.h>
14 #include <linux/kernel.h>
15 #include <linux/module.h>
16 #include <linux/backing-dev.h>
17 #include <linux/slab.h>
18 #include <linux/types.h>
19 #include <linux/pr.h>
20 #include <linux/ptrace.h>
21 #include <linux/nvme_ioctl.h>
22 #include <linux/pm_qos.h>
23 #include <asm/unaligned.h>
24
25 #include "nvme.h"
26 #include "fabrics.h"
27
28 #define CREATE_TRACE_POINTS
29 #include "trace.h"
30
31 #define NVME_MINORS             (1U << MINORBITS)
32
33 unsigned int admin_timeout = 60;
34 module_param(admin_timeout, uint, 0644);
35 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
36 EXPORT_SYMBOL_GPL(admin_timeout);
37
38 unsigned int nvme_io_timeout = 30;
39 module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
40 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
41 EXPORT_SYMBOL_GPL(nvme_io_timeout);
42
43 static unsigned char shutdown_timeout = 5;
44 module_param(shutdown_timeout, byte, 0644);
45 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
46
47 static u8 nvme_max_retries = 5;
48 module_param_named(max_retries, nvme_max_retries, byte, 0644);
49 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
50
51 static unsigned long default_ps_max_latency_us = 100000;
52 module_param(default_ps_max_latency_us, ulong, 0644);
53 MODULE_PARM_DESC(default_ps_max_latency_us,
54                  "max power saving latency for new devices; use PM QOS to change per device");
55
56 static bool force_apst;
57 module_param(force_apst, bool, 0644);
58 MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
59
60 static unsigned long apst_primary_timeout_ms = 100;
61 module_param(apst_primary_timeout_ms, ulong, 0644);
62 MODULE_PARM_DESC(apst_primary_timeout_ms,
63         "primary APST timeout in ms");
64
65 static unsigned long apst_secondary_timeout_ms = 2000;
66 module_param(apst_secondary_timeout_ms, ulong, 0644);
67 MODULE_PARM_DESC(apst_secondary_timeout_ms,
68         "secondary APST timeout in ms");
69
70 static unsigned long apst_primary_latency_tol_us = 15000;
71 module_param(apst_primary_latency_tol_us, ulong, 0644);
72 MODULE_PARM_DESC(apst_primary_latency_tol_us,
73         "primary APST latency tolerance in us");
74
75 static unsigned long apst_secondary_latency_tol_us = 100000;
76 module_param(apst_secondary_latency_tol_us, ulong, 0644);
77 MODULE_PARM_DESC(apst_secondary_latency_tol_us,
78         "secondary APST latency tolerance in us");
79
80 static bool streams;
81 module_param(streams, bool, 0644);
82 MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
83
84 /*
85  * nvme_wq - hosts nvme related works that are not reset or delete
86  * nvme_reset_wq - hosts nvme reset works
87  * nvme_delete_wq - hosts nvme delete works
88  *
89  * nvme_wq hosts works such as scan, aen handling, fw activation,
90  * keep-alive and periodic reconnects. nvme_reset_wq runs reset works,
91  * which also flush works hosted on nvme_wq for serialization purposes.
92  * nvme_delete_wq hosts controller deletion works, which flush reset
93  * works for serialization.
94  */
95 struct workqueue_struct *nvme_wq;
96 EXPORT_SYMBOL_GPL(nvme_wq);
97
98 struct workqueue_struct *nvme_reset_wq;
99 EXPORT_SYMBOL_GPL(nvme_reset_wq);
100
101 struct workqueue_struct *nvme_delete_wq;
102 EXPORT_SYMBOL_GPL(nvme_delete_wq);
103
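/*
 * Editorial sketch (not part of the upstream file): the three workqueues
 * above are created at module init with alloc_workqueue().  The queue
 * names and flags used here are assumptions based on common practice --
 * reset and delete work may be needed to make forward progress under
 * memory pressure, hence WQ_MEM_RECLAIM.
 */
static int __maybe_unused nvme_example_create_workqueues(void)
{
	nvme_wq = alloc_workqueue("nvme-wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	if (!nvme_wq)
		return -ENOMEM;

	nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
					WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	if (!nvme_reset_wq)
		goto destroy_wq;

	nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
					 WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	if (!nvme_delete_wq)
		goto destroy_reset_wq;

	return 0;

destroy_reset_wq:
	destroy_workqueue(nvme_reset_wq);
destroy_wq:
	destroy_workqueue(nvme_wq);
	return -ENOMEM;
}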
104 static LIST_HEAD(nvme_subsystems);
105 static DEFINE_MUTEX(nvme_subsystems_lock);
106
107 static DEFINE_IDA(nvme_instance_ida);
108 static dev_t nvme_ctrl_base_chr_devt;
109 static struct class *nvme_class;
110 static struct class *nvme_subsys_class;
111
112 static DEFINE_IDA(nvme_ns_chr_minor_ida);
113 static dev_t nvme_ns_chr_devt;
114 static struct class *nvme_ns_chr_class;
115
116 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
117 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
118                                            unsigned nsid);
119 static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
120                                    struct nvme_command *cmd);
121
122 void nvme_queue_scan(struct nvme_ctrl *ctrl)
123 {
124         /*
125          * Only queue new scan work when admin and IO queues are both alive
126          */
127         if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
128                 queue_work(nvme_wq, &ctrl->scan_work);
129 }
130
131 /*
132  * Use this function to proceed with scheduling reset_work for a controller
133  * that had previously been set to the resetting state. This is intended for
134  * code paths that can't be interrupted by other reset attempts. A hot removal
135  * may prevent this from succeeding.
136  */
137 int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
138 {
139         if (ctrl->state != NVME_CTRL_RESETTING)
140                 return -EBUSY;
141         if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
142                 return -EBUSY;
143         return 0;
144 }
145 EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
146
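/*
 * Editorial usage sketch (hypothetical, not part of the upstream file):
 * a transport error handler first marks the controller RESETTING so no
 * other reset attempt can interrupt its teardown, and only then uses
 * nvme_try_sched_reset() to queue the actual reset work.
 */
static void __maybe_unused nvme_example_error_recovery(struct nvme_ctrl *ctrl)
{
	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
		return;		/* a reset or deletion is already in progress */

	/* ... transport specific teardown that must not be interrupted ... */

	if (nvme_try_sched_reset(ctrl))
		dev_warn(ctrl->device,
			 "could not schedule reset, controller is going away\n");
}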
147 static void nvme_failfast_work(struct work_struct *work)
148 {
149         struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
150                         struct nvme_ctrl, failfast_work);
151
152         if (ctrl->state != NVME_CTRL_CONNECTING)
153                 return;
154
155         set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
156         dev_info(ctrl->device, "failfast expired\n");
157         nvme_kick_requeue_lists(ctrl);
158 }
159
160 static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
161 {
162         if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1)
163                 return;
164
165         schedule_delayed_work(&ctrl->failfast_work,
166                               ctrl->opts->fast_io_fail_tmo * HZ);
167 }
168
169 static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
170 {
171         if (!ctrl->opts)
172                 return;
173
174         cancel_delayed_work_sync(&ctrl->failfast_work);
175         clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
176 }
177
178
179 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
180 {
181         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
182                 return -EBUSY;
183         if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
184                 return -EBUSY;
185         return 0;
186 }
187 EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
188
189 int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
190 {
191         int ret;
192
193         ret = nvme_reset_ctrl(ctrl);
194         if (!ret) {
195                 flush_work(&ctrl->reset_work);
196                 if (ctrl->state != NVME_CTRL_LIVE)
197                         ret = -ENETRESET;
198         }
199
200         return ret;
201 }
202
203 static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
204 {
205         dev_info(ctrl->device,
206                  "Removing ctrl: NQN \"%s\"\n", nvmf_ctrl_subsysnqn(ctrl));
207
208         flush_work(&ctrl->reset_work);
209         nvme_stop_ctrl(ctrl);
210         nvme_remove_namespaces(ctrl);
211         ctrl->ops->delete_ctrl(ctrl);
212         nvme_uninit_ctrl(ctrl);
213 }
214
215 static void nvme_delete_ctrl_work(struct work_struct *work)
216 {
217         struct nvme_ctrl *ctrl =
218                 container_of(work, struct nvme_ctrl, delete_work);
219
220         nvme_do_delete_ctrl(ctrl);
221 }
222
223 int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
224 {
225         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
226                 return -EBUSY;
227         if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
228                 return -EBUSY;
229         return 0;
230 }
231 EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
232
233 static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
234 {
235         /*
236          * Keep a reference until nvme_do_delete_ctrl() completes,
237          * since ->delete_ctrl can free the controller.
238          */
239         nvme_get_ctrl(ctrl);
240         if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
241                 nvme_do_delete_ctrl(ctrl);
242         nvme_put_ctrl(ctrl);
243 }
244
245 static blk_status_t nvme_error_status(u16 status)
246 {
247         switch (status & 0x7ff) {
248         case NVME_SC_SUCCESS:
249                 return BLK_STS_OK;
250         case NVME_SC_CAP_EXCEEDED:
251                 return BLK_STS_NOSPC;
252         case NVME_SC_LBA_RANGE:
253         case NVME_SC_CMD_INTERRUPTED:
254         case NVME_SC_NS_NOT_READY:
255                 return BLK_STS_TARGET;
256         case NVME_SC_BAD_ATTRIBUTES:
257         case NVME_SC_ONCS_NOT_SUPPORTED:
258         case NVME_SC_INVALID_OPCODE:
259         case NVME_SC_INVALID_FIELD:
260         case NVME_SC_INVALID_NS:
261                 return BLK_STS_NOTSUPP;
262         case NVME_SC_WRITE_FAULT:
263         case NVME_SC_READ_ERROR:
264         case NVME_SC_UNWRITTEN_BLOCK:
265         case NVME_SC_ACCESS_DENIED:
266         case NVME_SC_READ_ONLY:
267         case NVME_SC_COMPARE_FAILED:
268                 return BLK_STS_MEDIUM;
269         case NVME_SC_GUARD_CHECK:
270         case NVME_SC_APPTAG_CHECK:
271         case NVME_SC_REFTAG_CHECK:
272         case NVME_SC_INVALID_PI:
273                 return BLK_STS_PROTECTION;
274         case NVME_SC_RESERVATION_CONFLICT:
275                 return BLK_STS_NEXUS;
276         case NVME_SC_HOST_PATH_ERROR:
277                 return BLK_STS_TRANSPORT;
278         case NVME_SC_ZONE_TOO_MANY_ACTIVE:
279                 return BLK_STS_ZONE_ACTIVE_RESOURCE;
280         case NVME_SC_ZONE_TOO_MANY_OPEN:
281                 return BLK_STS_ZONE_OPEN_RESOURCE;
282         default:
283                 return BLK_STS_IOERR;
284         }
285 }
286
287 static void nvme_retry_req(struct request *req)
288 {
289         unsigned long delay = 0;
290         u16 crd;
291
292         /* The mask and shift result must be <= 3 */
293         crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
294         if (crd)
295                 delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100;
296
297         nvme_req(req)->retries++;
298         blk_mq_requeue_request(req, false);
299         blk_mq_delay_kick_requeue_list(req->q, delay);
300 }
301
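/*
 * Worked example (editorial note): NVME_SC_CRD masks the two Command
 * Retry Delay bits of the completion status, so after the shift crd is
 * at most 3 and indexes CRDT1..CRDT3 from Identify Controller.  Those
 * fields are in units of 100 milliseconds; e.g. crd == 2 with
 * ctrl->crdt[1] == 5 yields a 500 ms delay passed to
 * blk_mq_delay_kick_requeue_list().
 */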
302 static void nvme_log_error(struct request *req)
303 {
304         struct nvme_ns *ns = req->q->queuedata;
305         struct nvme_request *nr = nvme_req(req);
306
307         if (ns) {
308                 pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %llu blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
309                        ns->disk ? ns->disk->disk_name : "?",
310                        nvme_get_opcode_str(nr->cmd->common.opcode),
311                        nr->cmd->common.opcode,
312                        (unsigned long long)nvme_sect_to_lba(ns, blk_rq_pos(req)),
313                        (unsigned long long)blk_rq_bytes(req) >> ns->lba_shift,
314                        nvme_get_error_status_str(nr->status),
315                        nr->status >> 8 & 7,     /* Status Code Type */
316                        nr->status & 0xff,       /* Status Code */
317                        nr->status & NVME_SC_MORE ? "MORE " : "",
318                        nr->status & NVME_SC_DNR  ? "DNR "  : "");
319                 return;
320         }
321
322         pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s\n",
323                            dev_name(nr->ctrl->device),
324                            nvme_get_admin_opcode_str(nr->cmd->common.opcode),
325                            nr->cmd->common.opcode,
326                            nvme_get_error_status_str(nr->status),
327                            nr->status >> 8 & 7, /* Status Code Type */
328                            nr->status & 0xff,   /* Status Code */
329                            nr->status & NVME_SC_MORE ? "MORE " : "",
330                            nr->status & NVME_SC_DNR  ? "DNR "  : "");
331 }
332
333 enum nvme_disposition {
334         COMPLETE,
335         RETRY,
336         FAILOVER,
337 };
338
339 static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
340 {
341         if (likely(nvme_req(req)->status == 0))
342                 return COMPLETE;
343
344         if (blk_noretry_request(req) ||
345             (nvme_req(req)->status & NVME_SC_DNR) ||
346             nvme_req(req)->retries >= nvme_max_retries)
347                 return COMPLETE;
348
349         if (req->cmd_flags & REQ_NVME_MPATH) {
350                 if (nvme_is_path_error(nvme_req(req)->status) ||
351                     blk_queue_dying(req->q))
352                         return FAILOVER;
353         } else {
354                 if (blk_queue_dying(req->q))
355                         return COMPLETE;
356         }
357
358         return RETRY;
359 }
360
361 static inline void nvme_end_req_zoned(struct request *req)
362 {
363         if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
364             req_op(req) == REQ_OP_ZONE_APPEND)
365                 req->__sector = nvme_lba_to_sect(req->q->queuedata,
366                         le64_to_cpu(nvme_req(req)->result.u64));
367 }
368
369 static inline void nvme_end_req(struct request *req)
370 {
371         blk_status_t status = nvme_error_status(nvme_req(req)->status);
372
373         if (unlikely(nvme_req(req)->status != NVME_SC_SUCCESS))
374                 nvme_log_error(req);
375         nvme_end_req_zoned(req);
376         nvme_trace_bio_complete(req);
377         blk_mq_end_request(req, status);
378 }
379
380 void nvme_complete_rq(struct request *req)
381 {
382         trace_nvme_complete_rq(req);
383         nvme_cleanup_cmd(req);
384
385         if (nvme_req(req)->ctrl->kas)
386                 nvme_req(req)->ctrl->comp_seen = true;
387
388         switch (nvme_decide_disposition(req)) {
389         case COMPLETE:
390                 nvme_end_req(req);
391                 return;
392         case RETRY:
393                 nvme_retry_req(req);
394                 return;
395         case FAILOVER:
396                 nvme_failover_req(req);
397                 return;
398         }
399 }
400 EXPORT_SYMBOL_GPL(nvme_complete_rq);
401
402 void nvme_complete_batch_req(struct request *req)
403 {
404         trace_nvme_complete_rq(req);
405         nvme_cleanup_cmd(req);
406         nvme_end_req_zoned(req);
407 }
408 EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
409
410 /*
411  * Called to unwind from ->queue_rq on a failed command submission so that the
412  * multipathing code gets called to potentially failover to another path.
413  * The caller needs to unwind all transport specific resource allocations and
414  * must propagate the return value.
415  */
416 blk_status_t nvme_host_path_error(struct request *req)
417 {
418         nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
419         blk_mq_set_request_complete(req);
420         nvme_complete_rq(req);
421         return BLK_STS_OK;
422 }
423 EXPORT_SYMBOL_GPL(nvme_host_path_error);
424
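/*
 * Editorial sketch (hypothetical transport, not part of the upstream
 * file): a fabrics ->queue_rq() that fails to put a command on the wire
 * first releases its own per-request resources and then returns the
 * value of nvme_host_path_error() so the multipath code can fail over.
 */
static blk_status_t __maybe_unused nvme_example_queue_rq_unwind(struct request *rq)
{
	/* ... transport specific submission failed, resources unwound ... */
	return nvme_host_path_error(rq);
}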
425 bool nvme_cancel_request(struct request *req, void *data, bool reserved)
426 {
427         dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
428                                 "Cancelling I/O %d", req->tag);
429
430         /* don't abort an already completed request */
431         if (blk_mq_request_completed(req))
432                 return true;
433
434         nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
435         nvme_req(req)->flags |= NVME_REQ_CANCELLED;
436         blk_mq_complete_request(req);
437         return true;
438 }
439 EXPORT_SYMBOL_GPL(nvme_cancel_request);
440
441 void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
442 {
443         if (ctrl->tagset) {
444                 blk_mq_tagset_busy_iter(ctrl->tagset,
445                                 nvme_cancel_request, ctrl);
446                 blk_mq_tagset_wait_completed_request(ctrl->tagset);
447         }
448 }
449 EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
450
451 void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
452 {
453         if (ctrl->admin_tagset) {
454                 blk_mq_tagset_busy_iter(ctrl->admin_tagset,
455                                 nvme_cancel_request, ctrl);
456                 blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
457         }
458 }
459 EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
460
461 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
462                 enum nvme_ctrl_state new_state)
463 {
464         enum nvme_ctrl_state old_state;
465         unsigned long flags;
466         bool changed = false;
467
468         spin_lock_irqsave(&ctrl->lock, flags);
469
470         old_state = ctrl->state;
471         switch (new_state) {
472         case NVME_CTRL_LIVE:
473                 switch (old_state) {
474                 case NVME_CTRL_NEW:
475                 case NVME_CTRL_RESETTING:
476                 case NVME_CTRL_CONNECTING:
477                         changed = true;
478                         fallthrough;
479                 default:
480                         break;
481                 }
482                 break;
483         case NVME_CTRL_RESETTING:
484                 switch (old_state) {
485                 case NVME_CTRL_NEW:
486                 case NVME_CTRL_LIVE:
487                         changed = true;
488                         fallthrough;
489                 default:
490                         break;
491                 }
492                 break;
493         case NVME_CTRL_CONNECTING:
494                 switch (old_state) {
495                 case NVME_CTRL_NEW:
496                 case NVME_CTRL_RESETTING:
497                         changed = true;
498                         fallthrough;
499                 default:
500                         break;
501                 }
502                 break;
503         case NVME_CTRL_DELETING:
504                 switch (old_state) {
505                 case NVME_CTRL_LIVE:
506                 case NVME_CTRL_RESETTING:
507                 case NVME_CTRL_CONNECTING:
508                         changed = true;
509                         fallthrough;
510                 default:
511                         break;
512                 }
513                 break;
514         case NVME_CTRL_DELETING_NOIO:
515                 switch (old_state) {
516                 case NVME_CTRL_DELETING:
517                 case NVME_CTRL_DEAD:
518                         changed = true;
519                         fallthrough;
520                 default:
521                         break;
522                 }
523                 break;
524         case NVME_CTRL_DEAD:
525                 switch (old_state) {
526                 case NVME_CTRL_DELETING:
527                         changed = true;
528                         fallthrough;
529                 default:
530                         break;
531                 }
532                 break;
533         default:
534                 break;
535         }
536
537         if (changed) {
538                 ctrl->state = new_state;
539                 wake_up_all(&ctrl->state_wq);
540         }
541
542         spin_unlock_irqrestore(&ctrl->lock, flags);
543         if (!changed)
544                 return false;
545
546         if (ctrl->state == NVME_CTRL_LIVE) {
547                 if (old_state == NVME_CTRL_CONNECTING)
548                         nvme_stop_failfast_work(ctrl);
549                 nvme_kick_requeue_lists(ctrl);
550         } else if (ctrl->state == NVME_CTRL_CONNECTING &&
551                 old_state == NVME_CTRL_RESETTING) {
552                 nvme_start_failfast_work(ctrl);
553         }
554         return changed;
555 }
556 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
557
558 /*
559  * Returns true for sink states that can't ever transition back to live.
560  */
561 static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
562 {
563         switch (ctrl->state) {
564         case NVME_CTRL_NEW:
565         case NVME_CTRL_LIVE:
566         case NVME_CTRL_RESETTING:
567         case NVME_CTRL_CONNECTING:
568                 return false;
569         case NVME_CTRL_DELETING:
570         case NVME_CTRL_DELETING_NOIO:
571         case NVME_CTRL_DEAD:
572                 return true;
573         default:
574                 WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
575                 return true;
576         }
577 }
578
579 /*
580  * Waits for the controller state to be resetting, or returns false if it is
581  * not possible to ever transition to that state.
582  */
583 bool nvme_wait_reset(struct nvme_ctrl *ctrl)
584 {
585         wait_event(ctrl->state_wq,
586                    nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
587                    nvme_state_terminal(ctrl));
588         return ctrl->state == NVME_CTRL_RESETTING;
589 }
590 EXPORT_SYMBOL_GPL(nvme_wait_reset);
591
592 static void nvme_free_ns_head(struct kref *ref)
593 {
594         struct nvme_ns_head *head =
595                 container_of(ref, struct nvme_ns_head, ref);
596
597         nvme_mpath_remove_disk(head);
598         ida_free(&head->subsys->ns_ida, head->instance);
599         cleanup_srcu_struct(&head->srcu);
600         nvme_put_subsystem(head->subsys);
601         kfree(head);
602 }
603
604 bool nvme_tryget_ns_head(struct nvme_ns_head *head)
605 {
606         return kref_get_unless_zero(&head->ref);
607 }
608
609 void nvme_put_ns_head(struct nvme_ns_head *head)
610 {
611         kref_put(&head->ref, nvme_free_ns_head);
612 }
613
614 static void nvme_free_ns(struct kref *kref)
615 {
616         struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
617
618         put_disk(ns->disk);
619         nvme_put_ns_head(ns->head);
620         nvme_put_ctrl(ns->ctrl);
621         kfree(ns);
622 }
623
624 static inline bool nvme_get_ns(struct nvme_ns *ns)
625 {
626         return kref_get_unless_zero(&ns->kref);
627 }
628
629 void nvme_put_ns(struct nvme_ns *ns)
630 {
631         kref_put(&ns->kref, nvme_free_ns);
632 }
633 EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
634
635 static inline void nvme_clear_nvme_request(struct request *req)
636 {
637         nvme_req(req)->status = 0;
638         nvme_req(req)->retries = 0;
639         nvme_req(req)->flags = 0;
640         req->rq_flags |= RQF_DONTPREP;
641 }
642
643 /* initialize a passthrough request */
644 void nvme_init_request(struct request *req, struct nvme_command *cmd)
645 {
646         if (req->q->queuedata)
647                 req->timeout = NVME_IO_TIMEOUT;
648         else /* no queuedata implies admin queue */
649                 req->timeout = NVME_ADMIN_TIMEOUT;
650
651         /* passthru commands should let the driver set the SGL flags */
652         cmd->common.flags &= ~NVME_CMD_SGL_ALL;
653
654         req->cmd_flags |= REQ_FAILFAST_DRIVER;
655         if (req->mq_hctx->type == HCTX_TYPE_POLL)
656                 req->cmd_flags |= REQ_POLLED;
657         nvme_clear_nvme_request(req);
658         memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd));
659 }
660 EXPORT_SYMBOL_GPL(nvme_init_request);
661
662 /*
663  * For a command we're not in a state to send to the device, the default action
664  * is to busy it and retry it after the controller state is recovered.  However,
665  * if the controller is deleting, or if the request is marked for failfast or is
666  * an nvme multipath request, it is failed immediately.
667  *
668  * Note: commands used to initialize the controller will be marked for failfast.
669  * Note: nvme cli/ioctl commands are marked for failfast.
670  */
671 blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
672                 struct request *rq)
673 {
674         if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
675             ctrl->state != NVME_CTRL_DELETING &&
676             ctrl->state != NVME_CTRL_DEAD &&
677             !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
678             !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
679                 return BLK_STS_RESOURCE;
680         return nvme_host_path_error(rq);
681 }
682 EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);
683
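/*
 * Editorial sketch (hypothetical, not part of the upstream file): fabrics
 * drivers typically pair nvme_fail_nonready_command() above with
 * __nvme_check_ready() below at the top of their ->queue_rq(): commands
 * that cannot be sent yet are either busied or failed according to the
 * policy described above.
 */
static blk_status_t __maybe_unused nvme_example_queue_rq_gate(struct nvme_ctrl *ctrl,
		struct request *rq, bool queue_live)
{
	if (unlikely(!__nvme_check_ready(ctrl, rq, queue_live)))
		return nvme_fail_nonready_command(ctrl, rq);

	return BLK_STS_OK;	/* proceed with transport specific submission */
}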
684 bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
685                 bool queue_live)
686 {
687         struct nvme_request *req = nvme_req(rq);
688
689         /*
690          * Currently we have a problem sending passthru commands
691          * on the admin_q if the controller is not LIVE because we can't
692          * make sure that they are going out after the admin connect,
693          * controller enable and/or other commands in the initialization
694          * sequence. Until the controller is LIVE, fail with
695          * BLK_STS_RESOURCE so that they will be rescheduled.
696          */
697         if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
698                 return false;
699
700         if (ctrl->ops->flags & NVME_F_FABRICS) {
701                 /*
702                  * Only allow commands on a live queue, except for the connect
703                  * command, which is required to set the queue live in the
704                  * appropriate states.
705                  */
706                 switch (ctrl->state) {
707                 case NVME_CTRL_CONNECTING:
708                         if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
709                             req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
710                                 return true;
711                         break;
712                 default:
713                         break;
714                 case NVME_CTRL_DEAD:
715                         return false;
716                 }
717         }
718
719         return queue_live;
720 }
721 EXPORT_SYMBOL_GPL(__nvme_check_ready);
722
723 static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
724 {
725         struct nvme_command c = { };
726
727         c.directive.opcode = nvme_admin_directive_send;
728         c.directive.nsid = cpu_to_le32(NVME_NSID_ALL);
729         c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
730         c.directive.dtype = NVME_DIR_IDENTIFY;
731         c.directive.tdtype = NVME_DIR_STREAMS;
732         c.directive.endir = enable ? NVME_DIR_ENDIR : 0;
733
734         return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0);
735 }
736
737 static int nvme_disable_streams(struct nvme_ctrl *ctrl)
738 {
739         return nvme_toggle_streams(ctrl, false);
740 }
741
742 static int nvme_enable_streams(struct nvme_ctrl *ctrl)
743 {
744         return nvme_toggle_streams(ctrl, true);
745 }
746
747 static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
748                                   struct streams_directive_params *s, u32 nsid)
749 {
750         struct nvme_command c = { };
751
752         memset(s, 0, sizeof(*s));
753
754         c.directive.opcode = nvme_admin_directive_recv;
755         c.directive.nsid = cpu_to_le32(nsid);
756         c.directive.numd = cpu_to_le32(nvme_bytes_to_numd(sizeof(*s)));
757         c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
758         c.directive.dtype = NVME_DIR_STREAMS;
759
760         return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s));
761 }
762
763 static int nvme_configure_directives(struct nvme_ctrl *ctrl)
764 {
765         struct streams_directive_params s;
766         u16 nssa;
767         int ret;
768
769         if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
770                 return 0;
771         if (!streams)
772                 return 0;
773
774         ret = nvme_enable_streams(ctrl);
775         if (ret)
776                 return ret;
777
778         ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL);
779         if (ret)
780                 goto out_disable_stream;
781
782         nssa = le16_to_cpu(s.nssa);
783         if (nssa < BLK_MAX_WRITE_HINTS - 1) {
784                 dev_info(ctrl->device, "too few streams (%u) available\n",
785                                         nssa);
786                 /* this condition is not an error: streams are optional */
787                 ret = 0;
788                 goto out_disable_stream;
789         }
790
791         ctrl->nr_streams = min_t(u16, nssa, BLK_MAX_WRITE_HINTS - 1);
792         dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
793         return 0;
794
795 out_disable_stream:
796         nvme_disable_streams(ctrl);
797         return ret;
798 }
799
800 /*
801  * Check if 'req' has a write hint associated with it. If it does, assign
802  * a valid namespace stream to the write.
803  */
804 static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
805                                      struct request *req, u16 *control,
806                                      u32 *dsmgmt)
807 {
808         enum rw_hint streamid = req->write_hint;
809
810         if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE)
811                 streamid = 0;
812         else {
813                 streamid--;
814                 if (WARN_ON_ONCE(streamid > ctrl->nr_streams))
815                         return;
816
817                 *control |= NVME_RW_DTYPE_STREAMS;
818                 *dsmgmt |= streamid << 16;
819         }
820
821         if (streamid < ARRAY_SIZE(req->q->write_hints))
822                 req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
823 }
824
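/*
 * Worked example (editorial note): a request tagged WRITE_LIFE_SHORT
 * (enum rw_hint value 2) maps to stream ID 1 after the decrement above;
 * NVME_RW_DTYPE_STREAMS goes into the control word and the stream ID is
 * shifted into DSPEC, i.e. bits 31:16 of dsmgmt.  Hints beyond
 * ctrl->nr_streams are dropped with a one-time warning.
 */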
825 static inline void nvme_setup_flush(struct nvme_ns *ns,
826                 struct nvme_command *cmnd)
827 {
828         memset(cmnd, 0, sizeof(*cmnd));
829         cmnd->common.opcode = nvme_cmd_flush;
830         cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
831 }
832
833 static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
834                 struct nvme_command *cmnd)
835 {
836         unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
837         struct nvme_dsm_range *range;
838         struct bio *bio;
839
840         /*
841          * Some devices do not consider the DSM 'Number of Ranges' field when
842          * determining how much data to DMA. Always allocate memory for maximum
843          * number of segments to prevent device reading beyond end of buffer.
844          */
845         static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;
846
847         range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
848         if (!range) {
849                 /*
850                  * If we fail to allocate our range, fall back to the controller
851                  * discard page. If that's also busy, it's safe to return
852                  * busy, as we know we can make progress once that's freed.
853                  */
854                 if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
855                         return BLK_STS_RESOURCE;
856
857                 range = page_address(ns->ctrl->discard_page);
858         }
859
860         __rq_for_each_bio(bio, req) {
861                 u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
862                 u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
863
864                 if (n < segments) {
865                         range[n].cattr = cpu_to_le32(0);
866                         range[n].nlb = cpu_to_le32(nlb);
867                         range[n].slba = cpu_to_le64(slba);
868                 }
869                 n++;
870         }
871
872         if (WARN_ON_ONCE(n != segments)) {
873                 if (virt_to_page(range) == ns->ctrl->discard_page)
874                         clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
875                 else
876                         kfree(range);
877                 return BLK_STS_IOERR;
878         }
879
880         memset(cmnd, 0, sizeof(*cmnd));
881         cmnd->dsm.opcode = nvme_cmd_dsm;
882         cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
883         cmnd->dsm.nr = cpu_to_le32(segments - 1);
884         cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
885
886         req->special_vec.bv_page = virt_to_page(range);
887         req->special_vec.bv_offset = offset_in_page(range);
888         req->special_vec.bv_len = alloc_size;
889         req->rq_flags |= RQF_SPECIAL_PAYLOAD;
890
891         return BLK_STS_OK;
892 }
893
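/*
 * Worked example (editorial note): struct nvme_dsm_range is 16 bytes
 * (cattr + nlb + slba) and NVME_DSM_MAX_RANGES is 256, so alloc_size is
 * always 4 KiB -- exactly one page, which is why the preallocated
 * ctrl->discard_page can stand in when the atomic allocation above fails.
 */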
894 static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
895                 struct request *req, struct nvme_command *cmnd)
896 {
897         memset(cmnd, 0, sizeof(*cmnd));
898
899         if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
900                 return nvme_setup_discard(ns, req, cmnd);
901
902         cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
903         cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
904         cmnd->write_zeroes.slba =
905                 cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
906         cmnd->write_zeroes.length =
907                 cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
908
909         if (nvme_ns_has_pi(ns)) {
910                 cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT);
911
912                 switch (ns->pi_type) {
913                 case NVME_NS_DPS_PI_TYPE1:
914                 case NVME_NS_DPS_PI_TYPE2:
915                         cmnd->write_zeroes.reftag =
916                                 cpu_to_le32(t10_pi_ref_tag(req));
917                         break;
918                 }
919         }
920
921         return BLK_STS_OK;
922 }
923
924 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
925                 struct request *req, struct nvme_command *cmnd,
926                 enum nvme_opcode op)
927 {
928         struct nvme_ctrl *ctrl = ns->ctrl;
929         u16 control = 0;
930         u32 dsmgmt = 0;
931
932         if (req->cmd_flags & REQ_FUA)
933                 control |= NVME_RW_FUA;
934         if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
935                 control |= NVME_RW_LR;
936
937         if (req->cmd_flags & REQ_RAHEAD)
938                 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
939
940         cmnd->rw.opcode = op;
941         cmnd->rw.flags = 0;
942         cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
943         cmnd->rw.rsvd2 = 0;
944         cmnd->rw.metadata = 0;
945         cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
946         cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
947         cmnd->rw.reftag = 0;
948         cmnd->rw.apptag = 0;
949         cmnd->rw.appmask = 0;
950
951         if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
952                 nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
953
954         if (ns->ms) {
955                 /*
956                  * If formatted with metadata, the block layer always provides a
957                  * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled.  Else
958                  * we enable the PRACT bit for protection information or set the
959                  * namespace capacity to zero to prevent any I/O.
960                  */
961                 if (!blk_integrity_rq(req)) {
962                         if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
963                                 return BLK_STS_NOTSUPP;
964                         control |= NVME_RW_PRINFO_PRACT;
965                 }
966
967                 switch (ns->pi_type) {
968                 case NVME_NS_DPS_PI_TYPE3:
969                         control |= NVME_RW_PRINFO_PRCHK_GUARD;
970                         break;
971                 case NVME_NS_DPS_PI_TYPE1:
972                 case NVME_NS_DPS_PI_TYPE2:
973                         control |= NVME_RW_PRINFO_PRCHK_GUARD |
974                                         NVME_RW_PRINFO_PRCHK_REF;
975                         if (op == nvme_cmd_zone_append)
976                                 control |= NVME_RW_APPEND_PIREMAP;
977                         cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
978                         break;
979                 }
980         }
981
982         cmnd->rw.control = cpu_to_le16(control);
983         cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
984         return 0;
985 }
986
987 void nvme_cleanup_cmd(struct request *req)
988 {
989         if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
990                 struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
991
992                 if (req->special_vec.bv_page == ctrl->discard_page)
993                         clear_bit_unlock(0, &ctrl->discard_page_busy);
994                 else
995                         kfree(bvec_virt(&req->special_vec));
996         }
997 }
998 EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
999
1000 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
1001 {
1002         struct nvme_command *cmd = nvme_req(req)->cmd;
1003         blk_status_t ret = BLK_STS_OK;
1004
1005         if (!(req->rq_flags & RQF_DONTPREP))
1006                 nvme_clear_nvme_request(req);
1007
1008         switch (req_op(req)) {
1009         case REQ_OP_DRV_IN:
1010         case REQ_OP_DRV_OUT:
1011                 /* these are set up prior to execution in nvme_init_request() */
1012                 break;
1013         case REQ_OP_FLUSH:
1014                 nvme_setup_flush(ns, cmd);
1015                 break;
1016         case REQ_OP_ZONE_RESET_ALL:
1017         case REQ_OP_ZONE_RESET:
1018                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
1019                 break;
1020         case REQ_OP_ZONE_OPEN:
1021                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
1022                 break;
1023         case REQ_OP_ZONE_CLOSE:
1024                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
1025                 break;
1026         case REQ_OP_ZONE_FINISH:
1027                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
1028                 break;
1029         case REQ_OP_WRITE_ZEROES:
1030                 ret = nvme_setup_write_zeroes(ns, req, cmd);
1031                 break;
1032         case REQ_OP_DISCARD:
1033                 ret = nvme_setup_discard(ns, req, cmd);
1034                 break;
1035         case REQ_OP_READ:
1036                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
1037                 break;
1038         case REQ_OP_WRITE:
1039                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
1040                 break;
1041         case REQ_OP_ZONE_APPEND:
1042                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
1043                 break;
1044         default:
1045                 WARN_ON_ONCE(1);
1046                 return BLK_STS_IOERR;
1047         }
1048
1049         cmd->common.command_id = nvme_cid(req);
1050         trace_nvme_setup_cmd(req, cmd);
1051         return ret;
1052 }
1053 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
1054
1055 /*
1056  * Return values:
1057  * 0:  success
1058  * >0: nvme controller's cqe status response
1059  * <0: kernel error in lieu of controller response
1060  */
1061 static int nvme_execute_rq(struct request *rq, bool at_head)
1062 {
1063         blk_status_t status;
1064
1065         status = blk_execute_rq(rq, at_head);
1066         if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
1067                 return -EINTR;
1068         if (nvme_req(rq)->status)
1069                 return nvme_req(rq)->status;
1070         return blk_status_to_errno(status);
1071 }
1072
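/*
 * Editorial sketch (hypothetical helper, not part of the upstream file):
 * callers of the sync command helpers below must handle the tri-state
 * return value documented above.
 */
static void __maybe_unused nvme_example_log_sync_result(struct nvme_ctrl *ctrl,
		int ret)
{
	if (ret < 0)		/* host-side error, e.g. -EINTR or -ENOMEM */
		dev_warn(ctrl->device, "command failed on the host: %d\n", ret);
	else if (ret > 0)	/* NVMe completion status (SCT/SC encoded) */
		dev_warn(ctrl->device, "command failed on the ctrl: 0x%x\n", ret);
	/* ret == 0: success */
}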
1073 /*
1074  * Returns 0 on success.  If the result is negative, it's a Linux error code;
1075  * if the result is positive, it's an NVM Express status code
1076  */
1077 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
1078                 union nvme_result *result, void *buffer, unsigned bufflen,
1079                 unsigned timeout, int qid, int at_head,
1080                 blk_mq_req_flags_t flags)
1081 {
1082         struct request *req;
1083         int ret;
1084
1085         if (qid == NVME_QID_ANY)
1086                 req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
1087         else
1088                 req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
1089                                                 qid ? qid - 1 : 0);
1090
1091         if (IS_ERR(req))
1092                 return PTR_ERR(req);
1093         nvme_init_request(req, cmd);
1094
1095         if (timeout)
1096                 req->timeout = timeout;
1097
1098         if (buffer && bufflen) {
1099                 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
1100                 if (ret)
1101                         goto out;
1102         }
1103
1104         ret = nvme_execute_rq(req, at_head);
1105         if (result && ret >= 0)
1106                 *result = nvme_req(req)->result;
1107  out:
1108         blk_mq_free_request(req);
1109         return ret;
1110 }
1111 EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
1112
1113 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
1114                 void *buffer, unsigned bufflen)
1115 {
1116         return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
1117                         NVME_QID_ANY, 0, 0);
1118 }
1119 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
1120
1121 static u32 nvme_known_admin_effects(u8 opcode)
1122 {
1123         switch (opcode) {
1124         case nvme_admin_format_nvm:
1125                 return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC |
1126                         NVME_CMD_EFFECTS_CSE_MASK;
1127         case nvme_admin_sanitize_nvm:
1128                 return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK;
1129         default:
1130                 break;
1131         }
1132         return 0;
1133 }
1134
1135 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
1136 {
1137         u32 effects = 0;
1138
1139         if (ns) {
1140                 if (ns->head->effects)
1141                         effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
1142                 if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
1143                         dev_warn_once(ctrl->device,
1144                                 "IO command:%02x has unhandled effects:%08x\n",
1145                                 opcode, effects);
1146                 return 0;
1147         }
1148
1149         if (ctrl->effects)
1150                 effects = le32_to_cpu(ctrl->effects->acs[opcode]);
1151         effects |= nvme_known_admin_effects(opcode);
1152
1153         return effects;
1154 }
1155 EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
1156
1157 static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1158                                u8 opcode)
1159 {
1160         u32 effects = nvme_command_effects(ctrl, ns, opcode);
1161
1162         /*
1163          * For simplicity, IO to all namespaces is quiesced even if the command
1164          * effects say only one namespace is affected.
1165          */
1166         if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1167                 mutex_lock(&ctrl->scan_lock);
1168                 mutex_lock(&ctrl->subsys->lock);
1169                 nvme_mpath_start_freeze(ctrl->subsys);
1170                 nvme_mpath_wait_freeze(ctrl->subsys);
1171                 nvme_start_freeze(ctrl);
1172                 nvme_wait_freeze(ctrl);
1173         }
1174         return effects;
1175 }
1176
1177 static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
1178                               struct nvme_command *cmd, int status)
1179 {
1180         if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1181                 nvme_unfreeze(ctrl);
1182                 nvme_mpath_unfreeze(ctrl->subsys);
1183                 mutex_unlock(&ctrl->subsys->lock);
1184                 nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
1185                 mutex_unlock(&ctrl->scan_lock);
1186         }
1187         if (effects & NVME_CMD_EFFECTS_CCC)
1188                 nvme_init_ctrl_finish(ctrl);
1189         if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
1190                 nvme_queue_scan(ctrl);
1191                 flush_work(&ctrl->scan_work);
1192         }
1193
1194         switch (cmd->common.opcode) {
1195         case nvme_admin_set_features:
1196                 switch (le32_to_cpu(cmd->common.cdw10) & 0xFF) {
1197                 case NVME_FEAT_KATO:
1198                         /*
1199                          * The keep alive command interval on the host should be
1200                          * updated when KATO is modified by a Set Features
1201                          * command.
1202                          */
1203                         if (!status)
1204                                 nvme_update_keep_alive(ctrl, cmd);
1205                         break;
1206                 default:
1207                         break;
1208                 }
1209                 break;
1210         default:
1211                 break;
1212         }
1213 }
1214
1215 int nvme_execute_passthru_rq(struct request *rq)
1216 {
1217         struct nvme_command *cmd = nvme_req(rq)->cmd;
1218         struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
1219         struct nvme_ns *ns = rq->q->queuedata;
1220         u32 effects;
1221         int  ret;
1222
1223         effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
1224         ret = nvme_execute_rq(rq, false);
1225         if (effects) /* nothing to be done for zero cmd effects */
1226                 nvme_passthru_end(ctrl, effects, cmd, ret);
1227
1228         return ret;
1229 }
1230 EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
1231
1232 /*
1233  * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1:
1234  * 
1235  *   The host should send Keep Alive commands at half of the Keep Alive Timeout
1236  *   accounting for transport roundtrip times [..].
1237  */
1238 static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
1239 {
1240         queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ / 2);
1241 }
1242
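/*
 * Worked example (editorial note): with kato == 10 (seconds), the work
 * is queued every 10 * HZ / 2 jiffies, i.e. every 5 seconds, so the
 * controller sees Keep Alive commands at twice the required rate and
 * transport round trips have slack before the timeout fires.
 */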
1243 static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
1244 {
1245         struct nvme_ctrl *ctrl = rq->end_io_data;
1246         unsigned long flags;
1247         bool startka = false;
1248
1249         blk_mq_free_request(rq);
1250
1251         if (status) {
1252                 dev_err(ctrl->device,
1253                         "failed nvme_keep_alive_end_io error=%d\n",
1254                                 status);
1255                 return;
1256         }
1257
1258         ctrl->comp_seen = false;
1259         spin_lock_irqsave(&ctrl->lock, flags);
1260         if (ctrl->state == NVME_CTRL_LIVE ||
1261             ctrl->state == NVME_CTRL_CONNECTING)
1262                 startka = true;
1263         spin_unlock_irqrestore(&ctrl->lock, flags);
1264         if (startka)
1265                 nvme_queue_keep_alive_work(ctrl);
1266 }
1267
1268 static void nvme_keep_alive_work(struct work_struct *work)
1269 {
1270         struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
1271                         struct nvme_ctrl, ka_work);
1272         bool comp_seen = ctrl->comp_seen;
1273         struct request *rq;
1274
1275         if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
1276                 dev_dbg(ctrl->device,
1277                         "reschedule traffic based keep-alive timer\n");
1278                 ctrl->comp_seen = false;
1279                 nvme_queue_keep_alive_work(ctrl);
1280                 return;
1281         }
1282
1283         rq = blk_mq_alloc_request(ctrl->admin_q, nvme_req_op(&ctrl->ka_cmd),
1284                                   BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
1285         if (IS_ERR(rq)) {
1286                 /* allocation failure, reset the controller */
1287                 dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq));
1288                 nvme_reset_ctrl(ctrl);
1289                 return;
1290         }
1291         nvme_init_request(rq, &ctrl->ka_cmd);
1292
1293         rq->timeout = ctrl->kato * HZ;
1294         rq->end_io_data = ctrl;
1295         blk_execute_rq_nowait(rq, false, nvme_keep_alive_end_io);
1296 }
1297
1298 static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
1299 {
1300         if (unlikely(ctrl->kato == 0))
1301                 return;
1302
1303         nvme_queue_keep_alive_work(ctrl);
1304 }
1305
1306 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
1307 {
1308         if (unlikely(ctrl->kato == 0))
1309                 return;
1310
1311         cancel_delayed_work_sync(&ctrl->ka_work);
1312 }
1313 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
1314
1315 static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
1316                                    struct nvme_command *cmd)
1317 {
1318         unsigned int new_kato =
1319                 DIV_ROUND_UP(le32_to_cpu(cmd->common.cdw11), 1000);
1320
1321         dev_info(ctrl->device,
1322                  "keep alive interval updated from %u ms to %u ms\n",
1323                  ctrl->kato * 1000 / 2, new_kato * 1000 / 2);
1324
1325         nvme_stop_keep_alive(ctrl);
1326         ctrl->kato = new_kato;
1327         nvme_start_keep_alive(ctrl);
1328 }
1329
1330 /*
1331  * In NVMe 1.0 the CNS field was just a binary controller or namespace
1332  * flag, so sending any newer CNS value has a good chance of not working.
1333  * QEMU unfortunately had that bug after reporting 1.1 version compliance
1334  * (but not for any later version).
1335  */
1336 static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl)
1337 {
1338         if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)
1339                 return ctrl->vs < NVME_VS(1, 2, 0);
1340         return ctrl->vs < NVME_VS(1, 1, 0);
1341 }
1342
1343 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
1344 {
1345         struct nvme_command c = { };
1346         int error;
1347
1348         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1349         c.identify.opcode = nvme_admin_identify;
1350         c.identify.cns = NVME_ID_CNS_CTRL;
1351
1352         *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
1353         if (!*id)
1354                 return -ENOMEM;
1355
1356         error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
1357                         sizeof(struct nvme_id_ctrl));
1358         if (error)
1359                 kfree(*id);
1360         return error;
1361 }
1362
1363 static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
1364                 struct nvme_ns_id_desc *cur, bool *csi_seen)
1365 {
1366         const char *warn_str = "ctrl returned bogus length:";
1367         void *data = cur;
1368
1369         switch (cur->nidt) {
1370         case NVME_NIDT_EUI64:
1371                 if (cur->nidl != NVME_NIDT_EUI64_LEN) {
1372                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
1373                                  warn_str, cur->nidl);
1374                         return -1;
1375                 }
1376                 memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
1377                 return NVME_NIDT_EUI64_LEN;
1378         case NVME_NIDT_NGUID:
1379                 if (cur->nidl != NVME_NIDT_NGUID_LEN) {
1380                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
1381                                  warn_str, cur->nidl);
1382                         return -1;
1383                 }
1384                 memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
1385                 return NVME_NIDT_NGUID_LEN;
1386         case NVME_NIDT_UUID:
1387                 if (cur->nidl != NVME_NIDT_UUID_LEN) {
1388                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
1389                                  warn_str, cur->nidl);
1390                         return -1;
1391                 }
1392                 uuid_copy(&ids->uuid, data + sizeof(*cur));
1393                 return NVME_NIDT_UUID_LEN;
1394         case NVME_NIDT_CSI:
1395                 if (cur->nidl != NVME_NIDT_CSI_LEN) {
1396                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
1397                                  warn_str, cur->nidl);
1398                         return -1;
1399                 }
1400                 memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
1401                 *csi_seen = true;
1402                 return NVME_NIDT_CSI_LEN;
1403         default:
1404                 /* Skip unknown types */
1405                 return cur->nidl;
1406         }
1407 }
1408
1409 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
1410                 struct nvme_ns_ids *ids)
1411 {
1412         struct nvme_command c = { };
1413         bool csi_seen = false;
1414         int status, pos, len;
1415         void *data;
1416
1417         if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
1418                 return 0;
1419         if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
1420                 return 0;
1421
1422         c.identify.opcode = nvme_admin_identify;
1423         c.identify.nsid = cpu_to_le32(nsid);
1424         c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
1425
1426         data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
1427         if (!data)
1428                 return -ENOMEM;
1429
1430         status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
1431                                       NVME_IDENTIFY_DATA_SIZE);
1432         if (status) {
1433                 dev_warn(ctrl->device,
1434                         "Identify Descriptors failed (nsid=%u, status=0x%x)\n",
1435                         nsid, status);
1436                 goto free_data;
1437         }
1438
1439         for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
1440                 struct nvme_ns_id_desc *cur = data + pos;
1441
1442                 if (cur->nidl == 0)
1443                         break;
1444
1445                 len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
1446                 if (len < 0)
1447                         break;
1448
1449                 len += sizeof(*cur);
1450         }
1451
1452         if (nvme_multi_css(ctrl) && !csi_seen) {
1453                 dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
1454                          nsid);
1455                 status = -EINVAL;
1456         }
1457
1458 free_data:
1459         kfree(data);
1460         return status;
1461 }
1462
1463 static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
1464                         struct nvme_ns_ids *ids, struct nvme_id_ns **id)
1465 {
1466         struct nvme_command c = { };
1467         int error;
1468
1469         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1470         c.identify.opcode = nvme_admin_identify;
1471         c.identify.nsid = cpu_to_le32(nsid);
1472         c.identify.cns = NVME_ID_CNS_NS;
1473
1474         *id = kmalloc(sizeof(**id), GFP_KERNEL);
1475         if (!*id)
1476                 return -ENOMEM;
1477
1478         error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
1479         if (error) {
1480                 dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
1481                 goto out_free_id;
1482         }
1483
1484         error = NVME_SC_INVALID_NS | NVME_SC_DNR;
1485         if ((*id)->ncap == 0) /* namespace not allocated or attached */
1486                 goto out_free_id;
1487
1488         if (ctrl->vs >= NVME_VS(1, 1, 0) &&
1489             !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
1490                 memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64));
1491         if (ctrl->vs >= NVME_VS(1, 2, 0) &&
1492             !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
1493                 memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid));
1494
1495         return 0;
1496
1497 out_free_id:
1498         kfree(*id);
1499         return error;
1500 }
1501
1502 static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
1503                 unsigned int dword11, void *buffer, size_t buflen, u32 *result)
1504 {
1505         union nvme_result res = { 0 };
1506         struct nvme_command c = { };
1507         int ret;
1508
1509         c.features.opcode = op;
1510         c.features.fid = cpu_to_le32(fid);
1511         c.features.dword11 = cpu_to_le32(dword11);
1512
1513         ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
1514                         buffer, buflen, 0, NVME_QID_ANY, 0, 0);
1515         if (ret >= 0 && result)
1516                 *result = le32_to_cpu(res.u32);
1517         return ret;
1518 }
1519
1520 int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
1521                       unsigned int dword11, void *buffer, size_t buflen,
1522                       u32 *result)
1523 {
1524         return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
1525                              buflen, result);
1526 }
1527 EXPORT_SYMBOL_GPL(nvme_set_features);
1528
1529 int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
1530                       unsigned int dword11, void *buffer, size_t buflen,
1531                       u32 *result)
1532 {
1533         return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
1534                              buflen, result);
1535 }
1536 EXPORT_SYMBOL_GPL(nvme_get_features);
1537
1538 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
1539 {
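        /*
         * The Number of Queues feature takes zero's based counts in cdw11:
         * requested I/O submission queues in bits 15:0 and completion
         * queues in bits 31:16, hence the (*count - 1) packing below
         * (e.g. *count == 8 encodes as 0x00070007).
         */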
1540         u32 q_count = (*count - 1) | ((*count - 1) << 16);
1541         u32 result;
1542         int status, nr_io_queues;
1543
1544         status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
1545                         &result);
1546         if (status < 0)
1547                 return status;
1548
1549         /*
1550          * Degraded controllers might return an error when setting the queue
1551          * count.  We still want to be able to bring them online and offer
1552          * access to the admin queue, as that might be only way to fix them up.
1553          */
1554         if (status > 0) {
1555                 dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
1556                 *count = 0;
1557         } else {
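                /*
                 * The completion dword reports how many queues were actually
                 * allocated, again zero's based: submission queues in bits
                 * 15:0 and completion queues in bits 31:16.  Use the smaller
                 * of the two, since queues are created in SQ/CQ pairs.
                 */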
1558                 nr_io_queues = min(result & 0xffff, result >> 16) + 1;
1559                 *count = min(*count, nr_io_queues);
1560         }
1561
1562         return 0;
1563 }
1564 EXPORT_SYMBOL_GPL(nvme_set_queue_count);
1565
1566 #define NVME_AEN_SUPPORTED \
1567         (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
1568          NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
1569
1570 static void nvme_enable_aen(struct nvme_ctrl *ctrl)
1571 {
1572         u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED;
1573         int status;
1574
1575         if (!supported_aens)
1576                 return;
1577
1578         status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens,
1579                         NULL, 0, &result);
1580         if (status)
1581                 dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
1582                          supported_aens);
1583
1584         queue_work(nvme_wq, &ctrl->async_event_work);
1585 }
1586
1587 static int nvme_ns_open(struct nvme_ns *ns)
1588 {
1589
1590         /* should never be called due to GENHD_FL_HIDDEN */
1591         if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head)))
1592                 goto fail;
1593         if (!nvme_get_ns(ns))
1594                 goto fail;
1595         if (!try_module_get(ns->ctrl->ops->module))
1596                 goto fail_put_ns;
1597
1598         return 0;
1599
1600 fail_put_ns:
1601         nvme_put_ns(ns);
1602 fail:
1603         return -ENXIO;
1604 }
1605
1606 static void nvme_ns_release(struct nvme_ns *ns)
1607 {
1608
1609         module_put(ns->ctrl->ops->module);
1610         nvme_put_ns(ns);
1611 }
1612
1613 static int nvme_open(struct block_device *bdev, fmode_t mode)
1614 {
1615         return nvme_ns_open(bdev->bd_disk->private_data);
1616 }
1617
1618 static void nvme_release(struct gendisk *disk, fmode_t mode)
1619 {
1620         nvme_ns_release(disk->private_data);
1621 }
1622
1623 int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1624 {
1625         /* some standard values */
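        /* 64 heads * 32 sectors = 2048 sectors per cylinder, hence >> 11 */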
1626         geo->heads = 1 << 6;
1627         geo->sectors = 1 << 5;
1628         geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
1629         return 0;
1630 }
1631
1632 #ifdef CONFIG_BLK_DEV_INTEGRITY
1633 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
1634                                 u32 max_integrity_segments)
1635 {
1636         struct blk_integrity integrity = { };
1637
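        /*
         * Types 1 and 2 share the T10 Type 1 CRC profile and expose only
         * the 2 byte application tag to upper layers.  Type 3 does not
         * check the reference tag, so the 4 byte reference tag space is
         * usable as additional opaque tag data.
         */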
1638         switch (pi_type) {
1639         case NVME_NS_DPS_PI_TYPE3:
1640                 integrity.profile = &t10_pi_type3_crc;
1641                 integrity.tag_size = sizeof(u16) + sizeof(u32);
1642                 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1643                 break;
1644         case NVME_NS_DPS_PI_TYPE1:
1645         case NVME_NS_DPS_PI_TYPE2:
1646                 integrity.profile = &t10_pi_type1_crc;
1647                 integrity.tag_size = sizeof(u16);
1648                 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1649                 break;
1650         default:
1651                 integrity.profile = NULL;
1652                 break;
1653         }
1654         integrity.tuple_size = ms;
1655         blk_integrity_register(disk, &integrity);
1656         blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
1657 }
1658 #else
1659 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
1660                                 u32 max_integrity_segments)
1661 {
1662 }
1663 #endif /* CONFIG_BLK_DEV_INTEGRITY */
1664
1665 static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
1666 {
1667         struct nvme_ctrl *ctrl = ns->ctrl;
1668         struct request_queue *queue = disk->queue;
1669         u32 size = queue_logical_block_size(queue);
1670
1671         if (ctrl->max_discard_sectors == 0) {
1672                 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue);
1673                 return;
1674         }
1675
1676         if (ctrl->nr_streams && ns->sws && ns->sgs)
1677                 size *= ns->sws * ns->sgs;
1678
1679         BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
1680                         NVME_DSM_MAX_RANGES);
1681
1682         queue->limits.discard_alignment = 0;
1683         queue->limits.discard_granularity = size;
1684
1685         /* If discard is already enabled, don't reset queue limits */
1686         if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue))
1687                 return;
1688
1689         blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
1690         blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
1691
1692         if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
1693                 blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
1694 }
1695
1696 static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
1697 {
1698         return uuid_equal(&a->uuid, &b->uuid) &&
1699                 memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
1700                 memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
1701                 a->csi == b->csi;
1702 }
1703
1704 static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1705                                  u32 *phys_bs, u32 *io_opt)
1706 {
1707         struct streams_directive_params s;
1708         int ret;
1709
1710         if (!ctrl->nr_streams)
1711                 return 0;
1712
1713         ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id);
1714         if (ret)
1715                 return ret;
1716
1717         ns->sws = le32_to_cpu(s.sws);
1718         ns->sgs = le16_to_cpu(s.sgs);
1719
1720         if (ns->sws) {
1721                 *phys_bs = ns->sws * (1 << ns->lba_shift);
1722                 if (ns->sgs)
1723                         *io_opt = *phys_bs * ns->sgs;
1724         }
1725
1726         return 0;
1727 }
1728
1729 static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
1730 {
1731         struct nvme_ctrl *ctrl = ns->ctrl;
1732
1733         /*
1734          * The PI implementation requires the metadata size to be equal to the
1735          * t10 pi tuple size.
1736          */
1737         ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
1738         if (ns->ms == sizeof(struct t10_pi_tuple))
1739                 ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
1740         else
1741                 ns->pi_type = 0;
1742
1743         ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
1744         if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
1745                 return;
1746
1747         if (ctrl->ops->flags & NVME_F_FABRICS) {
1748                 /*
1749                  * The NVMe over Fabrics specification only supports metadata as
1750                  * part of the extended data LBA.  We rely on HCA/HBA support to
1751                  * remap the separate metadata buffer from the block layer.
1752                  */
1753                 if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
1754                         return;
1755
1756                 ns->features |= NVME_NS_EXT_LBAS;
1757
1758                 /*
1759                  * The current fabrics transport drivers support namespace
1760                  * metadata formats only if nvme_ns_has_pi() returns true.
1761                  * Suppress support for all other formats so the namespace will
1762                  * have a 0 capacity and not be usable through the block stack.
1763                  *
1764                  * Note, this check will need to be modified if any drivers
1765                  * gain the ability to use other metadata formats.
1766                  */
1767                 if (ctrl->max_integrity_segments && nvme_ns_has_pi(ns))
1768                         ns->features |= NVME_NS_METADATA_SUPPORTED;
1769         } else {
1770                 /*
1771                  * For PCIe controllers, we can't easily remap the separate
1772                  * metadata buffer from the block layer and thus require a
1773                  * separate metadata buffer for block layer metadata/PI support.
1774                  * We allow extended LBAs for the passthrough interface, though.
1775                  */
1776                 if (id->flbas & NVME_NS_FLBAS_META_EXT)
1777                         ns->features |= NVME_NS_EXT_LBAS;
1778                 else
1779                         ns->features |= NVME_NS_METADATA_SUPPORTED;
1780         }
1781 }
1782
1783 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
1784                 struct request_queue *q)
1785 {
1786         bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;
1787
1788         if (ctrl->max_hw_sectors) {
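                /*
                 * Each PRP entry maps NVME_CTRL_PAGE_SIZE bytes, so the
                 * largest transfer (in 512 byte sectors) bounds the number
                 * of segments ever needed; +1 covers an unaligned start.
                 */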
1789                 u32 max_segments =
1790                         (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
1791
1792                 max_segments = min_not_zero(max_segments, ctrl->max_segments);
1793                 blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
1794                 blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
1795         }
1796         blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
1797         blk_queue_dma_alignment(q, 7);
1798         blk_queue_write_cache(q, vwc, vwc);
1799 }
1800
1801 static void nvme_update_disk_info(struct gendisk *disk,
1802                 struct nvme_ns *ns, struct nvme_id_ns *id)
1803 {
1804         sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
1805         unsigned short bs = 1 << ns->lba_shift;
1806         u32 atomic_bs, phys_bs, io_opt = 0;
1807
1808         /*
1809          * The block layer can't support LBA sizes larger than the page size
1810          * yet, so catch this early and don't allow block I/O.
1811          */
1812         if (ns->lba_shift > PAGE_SHIFT) {
1813                 capacity = 0;
1814                 bs = (1 << 9);
1815         }
1816
1817         blk_integrity_unregister(disk);
1818
1819         atomic_bs = phys_bs = bs;
1820         nvme_setup_streams_ns(ns->ctrl, ns, &phys_bs, &io_opt);
1821         if (id->nabo == 0) {
1822                 /*
1823                  * Bit 1 indicates whether NAWUPF is defined for this namespace
1824                  * and whether it should be used instead of AWUPF. If NAWUPF ==
1825                  * 0 then AWUPF must be used instead.
1826                  */
1827                 if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
1828                         atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
1829                 else
1830                         atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
1831         }
1832
1833         if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
1834                 /* NPWG = Namespace Preferred Write Granularity */
1835                 phys_bs = bs * (1 + le16_to_cpu(id->npwg));
1836                 /* NOWS = Namespace Optimal Write Size */
1837                 io_opt = bs * (1 + le16_to_cpu(id->nows));
1838         }
1839
1840         blk_queue_logical_block_size(disk->queue, bs);
1841         /*
1842          * Linux filesystems assume writing a single physical block is
1843          * an atomic operation. Hence limit the physical block size to the
1844          * value of the Atomic Write Unit Power Fail parameter.
1845          */
1846         blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
1847         blk_queue_io_min(disk->queue, phys_bs);
1848         blk_queue_io_opt(disk->queue, io_opt);
1849
1850         /*
1851          * Register a metadata profile for PI, or the plain non-integrity NVMe
1852          * metadata masquerading as Type 0 if supported.  Otherwise reject block
1853          * I/O to namespaces with metadata, except when the namespace supports
1854          * PI, as the controller can strip/insert it in that case.
1855          */
1856         if (ns->ms) {
1857                 if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
1858                     (ns->features & NVME_NS_METADATA_SUPPORTED))
1859                         nvme_init_integrity(disk, ns->ms, ns->pi_type,
1860                                             ns->ctrl->max_integrity_segments);
1861                 else if (!nvme_ns_has_pi(ns))
1862                         capacity = 0;
1863         }
1864
1865         set_capacity_and_notify(disk, capacity);
1866
1867         nvme_config_discard(disk, ns);
1868         blk_queue_max_write_zeroes_sectors(disk->queue,
1869                                            ns->ctrl->max_zeroes_sectors);
1870
1871         set_disk_ro(disk, (id->nsattr & NVME_NS_ATTR_RO) ||
1872                 test_bit(NVME_NS_FORCE_RO, &ns->flags));
1873 }
1874
1875 static inline bool nvme_first_scan(struct gendisk *disk)
1876 {
1877         /* nvme_alloc_ns() scans the disk prior to adding it */
1878         return !disk_live(disk);
1879 }
1880
1881 static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
1882 {
1883         struct nvme_ctrl *ctrl = ns->ctrl;
1884         u32 iob;
1885
1886         if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
1887             is_power_of_2(ctrl->max_hw_sectors))
1888                 iob = ctrl->max_hw_sectors;
1889         else
1890                 iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
1891
1892         if (!iob)
1893                 return;
1894
1895         if (!is_power_of_2(iob)) {
1896                 if (nvme_first_scan(ns->disk))
1897                         pr_warn("%s: ignoring unaligned IO boundary:%u\n",
1898                                 ns->disk->disk_name, iob);
1899                 return;
1900         }
1901
1902         if (blk_queue_is_zoned(ns->disk->queue)) {
1903                 if (nvme_first_scan(ns->disk))
1904                         pr_warn("%s: ignoring zoned namespace IO boundary\n",
1905                                 ns->disk->disk_name);
1906                 return;
1907         }
1908
1909         blk_queue_chunk_sectors(ns->queue, iob);
1910 }
1911
1912 static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
1913 {
1914         unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
1915         int ret;
1916
1917         blk_mq_freeze_queue(ns->disk->queue);
1918         ns->lba_shift = id->lbaf[lbaf].ds;
1919         nvme_set_queue_limits(ns->ctrl, ns->queue);
1920
1921         nvme_configure_metadata(ns, id);
1922         nvme_set_chunk_sectors(ns, id);
1923         nvme_update_disk_info(ns->disk, ns, id);
1924
1925         if (ns->head->ids.csi == NVME_CSI_ZNS) {
1926                 ret = nvme_update_zone_info(ns, lbaf);
1927                 if (ret)
1928                         goto out_unfreeze;
1929         }
1930
1931         set_bit(NVME_NS_READY, &ns->flags);
1932         blk_mq_unfreeze_queue(ns->disk->queue);
1933
1934         if (blk_queue_is_zoned(ns->queue)) {
1935                 ret = nvme_revalidate_zones(ns);
1936                 if (ret && !nvme_first_scan(ns->disk))
1937                         return ret;
1938         }
1939
1940         if (nvme_ns_head_multipath(ns->head)) {
1941                 blk_mq_freeze_queue(ns->head->disk->queue);
1942                 nvme_update_disk_info(ns->head->disk, ns, id);
1943                 nvme_mpath_revalidate_paths(ns);
1944                 blk_stack_limits(&ns->head->disk->queue->limits,
1945                                  &ns->queue->limits, 0);
1946                 disk_update_readahead(ns->head->disk);
1947                 blk_mq_unfreeze_queue(ns->head->disk->queue);
1948         }
1949         return 0;
1950
1951 out_unfreeze:
1952         /*
1953          * If probing fails due to an unsupported feature, hide the block device,
1954          * but still allow other access.
1955          */
1956         if (ret == -ENODEV) {
1957                 ns->disk->flags |= GENHD_FL_HIDDEN;
1958                 set_bit(NVME_NS_READY, &ns->flags);
1959                 ret = 0;
1960         }
1961         blk_mq_unfreeze_queue(ns->disk->queue);
1962         return ret;
1963 }
1964
1965 static char nvme_pr_type(enum pr_type type)
1966 {
1967         switch (type) {
1968         case PR_WRITE_EXCLUSIVE:
1969                 return 1;
1970         case PR_EXCLUSIVE_ACCESS:
1971                 return 2;
1972         case PR_WRITE_EXCLUSIVE_REG_ONLY:
1973                 return 3;
1974         case PR_EXCLUSIVE_ACCESS_REG_ONLY:
1975                 return 4;
1976         case PR_WRITE_EXCLUSIVE_ALL_REGS:
1977                 return 5;
1978         case PR_EXCLUSIVE_ACCESS_ALL_REGS:
1979                 return 6;
1980         default:
1981                 return 0;
1982         }
1983 }
1984
1985 static int nvme_send_ns_head_pr_command(struct block_device *bdev,
1986                 struct nvme_command *c, u8 data[16])
1987 {
1988         struct nvme_ns_head *head = bdev->bd_disk->private_data;
1989         int srcu_idx = srcu_read_lock(&head->srcu);
1990         struct nvme_ns *ns = nvme_find_path(head);
1991         int ret = -EWOULDBLOCK;
1992
1993         if (ns) {
1994                 c->common.nsid = cpu_to_le32(ns->head->ns_id);
1995                 ret = nvme_submit_sync_cmd(ns->queue, c, data, 16);
1996         }
1997         srcu_read_unlock(&head->srcu, srcu_idx);
1998         return ret;
1999 }
2000
2001 static int nvme_send_ns_pr_command(struct nvme_ns *ns, struct nvme_command *c,
2002                 u8 data[16])
2003 {
2004         c->common.nsid = cpu_to_le32(ns->head->ns_id);
2005         return nvme_submit_sync_cmd(ns->queue, c, data, 16);
2006 }
2007
2008 static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
2009                                 u64 key, u64 sa_key, u8 op)
2010 {
2011         struct nvme_command c = { };
2012         u8 data[16] = { 0, };
2013
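        /*
         * Reservation commands take a 16 byte payload: the current
         * reservation key in bytes 0-7 and, where the command uses one,
         * the new or preempt key in bytes 8-15, both little endian.
         */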
2014         put_unaligned_le64(key, &data[0]);
2015         put_unaligned_le64(sa_key, &data[8]);
2016
2017         c.common.opcode = op;
2018         c.common.cdw10 = cpu_to_le32(cdw10);
2019
2020         if (IS_ENABLED(CONFIG_NVME_MULTIPATH) &&
2021             bdev->bd_disk->fops == &nvme_ns_head_ops)
2022                 return nvme_send_ns_head_pr_command(bdev, &c, data);
2023         return nvme_send_ns_pr_command(bdev->bd_disk->private_data, &c, data);
2024 }
2025
2026 static int nvme_pr_register(struct block_device *bdev, u64 old,
2027                 u64 new, unsigned flags)
2028 {
2029         u32 cdw10;
2030
2031         if (flags & ~PR_FL_IGNORE_KEY)
2032                 return -EOPNOTSUPP;
2033
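        /*
         * Reservation Register cdw10 layout: RREGA in bits 2:0 (0 =
         * register a new key, 2 = replace the existing one), IEKEY in
         * bit 3 and CPTPL in bits 31:30 (11b = persist the registration
         * through power loss).
         */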
2034         cdw10 = old ? 2 : 0;
2035         cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
2036         cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
2037         return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
2038 }
2039
2040 static int nvme_pr_reserve(struct block_device *bdev, u64 key,
2041                 enum pr_type type, unsigned flags)
2042 {
2043         u32 cdw10;
2044
2045         if (flags & ~PR_FL_IGNORE_KEY)
2046                 return -EOPNOTSUPP;
2047
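        /*
         * Reservation Acquire cdw10 layout: RACQA in bits 2:0 (0 =
         * acquire), IEKEY in bit 3 and the reservation type (RTYPE) in
         * bits 15:8, encoded by nvme_pr_type().
         */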
2048         cdw10 = nvme_pr_type(type) << 8;
2049         cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
2050         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
2051 }
2052
2053 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
2054                 enum pr_type type, bool abort)
2055 {
2056         u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
2057
2058         return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
2059 }
2060
2061 static int nvme_pr_clear(struct block_device *bdev, u64 key)
2062 {
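        /*
         * PR_CLEAR maps to the Reservation Release command with
         * RRELA = 001b (Clear); Reservation Register would merely
         * unregister this host's key.
         */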
2063         u32 cdw10 = 1 | (key ? 1 << 3 : 0);
2064
2065         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
2066 }
2067
2068 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
2069 {
2070         u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
2071
2072         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
2073 }
2074
2075 const struct pr_ops nvme_pr_ops = {
2076         .pr_register    = nvme_pr_register,
2077         .pr_reserve     = nvme_pr_reserve,
2078         .pr_release     = nvme_pr_release,
2079         .pr_preempt     = nvme_pr_preempt,
2080         .pr_clear       = nvme_pr_clear,
2081 };
2082
2083 #ifdef CONFIG_BLK_SED_OPAL
2084 int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
2085                 bool send)
2086 {
2087         struct nvme_ctrl *ctrl = data;
2088         struct nvme_command cmd = { };
2089
2090         if (send)
2091                 cmd.common.opcode = nvme_admin_security_send;
2092         else
2093                 cmd.common.opcode = nvme_admin_security_recv;
2094         cmd.common.nsid = 0;
2095         cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
2096         cmd.common.cdw11 = cpu_to_le32(len);
2097
2098         return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 0,
2099                         NVME_QID_ANY, 1, 0);
2100 }
2101 EXPORT_SYMBOL_GPL(nvme_sec_submit);
2102 #endif /* CONFIG_BLK_SED_OPAL */
2103
2104 #ifdef CONFIG_BLK_DEV_ZONED
2105 static int nvme_report_zones(struct gendisk *disk, sector_t sector,
2106                 unsigned int nr_zones, report_zones_cb cb, void *data)
2107 {
2108         return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb,
2109                         data);
2110 }
2111 #else
2112 #define nvme_report_zones       NULL
2113 #endif /* CONFIG_BLK_DEV_ZONED */
2114
2115 static const struct block_device_operations nvme_bdev_ops = {
2116         .owner          = THIS_MODULE,
2117         .ioctl          = nvme_ioctl,
2118         .open           = nvme_open,
2119         .release        = nvme_release,
2120         .getgeo         = nvme_getgeo,
2121         .report_zones   = nvme_report_zones,
2122         .pr_ops         = &nvme_pr_ops,
2123 };
2124
2125 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
2126 {
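        /*
         * CAP.TO is reported in 500 millisecond units, so
         * (TO + 1) * HZ / 2 converts the worst-case ready timeout
         * to jiffies.
         */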
2127         unsigned long timeout =
2128                 ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
2129         u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
2130         int ret;
2131
2132         while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
2133                 if (csts == ~0)
2134                         return -ENODEV;
2135                 if ((csts & NVME_CSTS_RDY) == bit)
2136                         break;
2137
2138                 usleep_range(1000, 2000);
2139                 if (fatal_signal_pending(current))
2140                         return -EINTR;
2141                 if (time_after(jiffies, timeout)) {
2142                         dev_err(ctrl->device,
2143                                 "Device not ready; aborting %s, CSTS=0x%x\n",
2144                                 enabled ? "initialisation" : "reset", csts);
2145                         return -ENODEV;
2146                 }
2147         }
2148
2149         return ret;
2150 }
2151
2152 /*
2153  * If the device has been passed off to us in an enabled state, just clear
2154  * the enabled bit.  The spec says we should set the 'shutdown notification
2155  * bits', but doing so may cause the device to complete commands to the
2156  * admin queue ... and we don't know what memory that might be pointing at!
2157  */
2158 int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
2159 {
2160         int ret;
2161
2162         ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
2163         ctrl->ctrl_config &= ~NVME_CC_ENABLE;
2164
2165         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2166         if (ret)
2167                 return ret;
2168
2169         if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
2170                 msleep(NVME_QUIRK_DELAY_AMOUNT);
2171
2172         return nvme_wait_ready(ctrl, ctrl->cap, false);
2173 }
2174 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
2175
2176 int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
2177 {
2178         unsigned dev_page_min;
2179         int ret;
2180
2181         ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
2182         if (ret) {
2183                 dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
2184                 return ret;
2185         }
2186         dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
2187
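        /*
         * CAP.MPSMIN is an exponent relative to 4 KiB (2^(12 + MPSMIN)
         * bytes); bail out if the device's minimum page size is larger
         * than the fixed NVME_CTRL_PAGE_SIZE used by the driver.
         */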
2188         if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
2189                 dev_err(ctrl->device,
2190                         "Minimum device page size %u too large for host (%u)\n",
2191                         1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
2192                 return -ENODEV;
2193         }
2194
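        /*
         * Build up CC: command set selection, memory page size (again an
         * exponent relative to 4 KiB), round-robin arbitration, no shutdown
         * notification, the 64 byte SQE / 16 byte CQE sizes, and finally
         * the enable bit.
         */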
2195         if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
2196                 ctrl->ctrl_config = NVME_CC_CSS_CSI;
2197         else
2198                 ctrl->ctrl_config = NVME_CC_CSS_NVM;
2199         ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
2200         ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
2201         ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
2202         ctrl->ctrl_config |= NVME_CC_ENABLE;
2203
2204         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2205         if (ret)
2206                 return ret;
2207         return nvme_wait_ready(ctrl, ctrl->cap, true);
2208 }
2209 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
2210
2211 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
2212 {
2213         unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ);
2214         u32 csts;
2215         int ret;
2216
2217         ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
2218         ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
2219
2220         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2221         if (ret)
2222                 return ret;
2223
2224         while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
2225                 if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
2226                         break;
2227
2228                 msleep(100);
2229                 if (fatal_signal_pending(current))
2230                         return -EINTR;
2231                 if (time_after(jiffies, timeout)) {
2232                         dev_err(ctrl->device,
2233                                 "Device shutdown incomplete; abort shutdown\n");
2234                         return -ENODEV;
2235                 }
2236         }
2237
2238         return ret;
2239 }
2240 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
2241
2242 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
2243 {
2244         __le64 ts;
2245         int ret;
2246
2247         if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
2248                 return 0;
2249
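        /*
         * Program the controller's timestamp with the current wall-clock
         * time, expressed as milliseconds since the Unix epoch.
         */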
2250         ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
2251         ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
2252                         NULL);
2253         if (ret)
2254                 dev_warn_once(ctrl->device,
2255                         "could not set timestamp (%d)\n", ret);
2256         return ret;
2257 }
2258
2259 static int nvme_configure_acre(struct nvme_ctrl *ctrl)
2260 {
2261         struct nvme_feat_host_behavior *host;
2262         int ret;
2263
2264         /* Don't bother enabling the feature if retry delay is not reported */
2265         if (!ctrl->crdt[0])
2266                 return 0;
2267
2268         host = kzalloc(sizeof(*host), GFP_KERNEL);
2269         if (!host)
2270                 return 0;
2271
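        /*
         * Advanced Command Retry Enable lets the controller request that
         * the host delay retries using the CRDT values reported in
         * Identify Controller.
         */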
2272         host->acre = NVME_ENABLE_ACRE;
2273         ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
2274                                 host, sizeof(*host), NULL);
2275         kfree(host);
2276         return ret;
2277 }
2278
2279 /*
2280  * This function checks whether the given total (exlat + enlat) latency of a
2281  * power state allows that state to be used as an APST transition target.
2282  * It does so by comparing the latency to the primary and secondary latency
2283  * tolerances defined by the module parameters. On a match, the corresponding
2284  * timeout is written to *transition_time and the matching tolerance index
2285  * (1 or 2) is recorded in *last_index; each tolerance band is used at most once.
2286  */
2287 static bool nvme_apst_get_transition_time(u64 total_latency,
2288                 u64 *transition_time, unsigned *last_index)
2289 {
2290         if (total_latency <= apst_primary_latency_tol_us) {
2291                 if (*last_index == 1)
2292                         return false;
2293                 *last_index = 1;
2294                 *transition_time = apst_primary_timeout_ms;
2295                 return true;
2296         }
2297         if (apst_secondary_timeout_ms &&
2298                 total_latency <= apst_secondary_latency_tol_us) {
2299                 if (*last_index <= 2)
2300                         return false;
2301                 *last_index = 2;
2302                 *transition_time = apst_secondary_timeout_ms;
2303                 return true;
2304         }
2305         return false;
2306 }
2307
2308 /*
2309  * APST (Autonomous Power State Transition) lets us program a table of power
2310  * state transitions that the controller will perform automatically.
2311  *
2312  * Depending on module params, one of the two supported techniques will be used:
2313  *
2314  * - If the parameters provide explicit timeouts and tolerances, they will be
2315  *   used to build a table with up to 2 non-operational states to transition to.
2316  *   The default parameter values were selected based on the values used by
2317  *   Microsoft's and Intel's NVMe drivers. Yet, since we don't implement dynamic
2318  *   regeneration of the APST table in the event of switching between external
2319  *   and battery power, the timeouts and tolerances reflect a compromise
2320  *   between values used by Microsoft for AC and battery scenarios.
2321  * - If not, we'll configure the table with a simple heuristic: we are willing
2322  *   to spend at most 2% of the time transitioning between power states.
2323  *   Therefore, when running in any given state, we will enter the next
2324  *   lower-power non-operational state after waiting 50 * (enlat + exlat)
2325  *   microseconds, as long as that state's exit latency is under the requested
2326  *   maximum latency.
2327  *
2328  * We will not autonomously enter any non-operational state for which the total
2329  * latency exceeds ps_max_latency_us.
2330  *
2331  * Users can set ps_max_latency_us to zero to turn off APST.
2332  */
2333 static int nvme_configure_apst(struct nvme_ctrl *ctrl)
2334 {
2335         struct nvme_feat_auto_pst *table;
2336         unsigned apste = 0;
2337         u64 max_lat_us = 0;
2338         __le64 target = 0;
2339         int max_ps = -1;
2340         int state;
2341         int ret;
2342         unsigned last_lt_index = UINT_MAX;
2343
2344         /*
2345          * If APST isn't supported or if we haven't been initialized yet,
2346          * then don't do anything.
2347          */
2348         if (!ctrl->apsta)
2349                 return 0;
2350
2351         if (ctrl->npss > 31) {
2352                 dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
2353                 return 0;
2354         }
2355
2356         table = kzalloc(sizeof(*table), GFP_KERNEL);
2357         if (!table)
2358                 return 0;
2359
2360         if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
2361                 /* Turn off APST. */
2362                 dev_dbg(ctrl->device, "APST disabled\n");
2363                 goto done;
2364         }
2365
2366         /*
2367          * Walk through all states from lowest- to highest-power.
2368          * According to the spec, lower-numbered states use more power.  NPSS,
2369          * despite the name, is the index of the lowest-power state, not the
2370          * number of states.
2371          */
2372         for (state = (int)ctrl->npss; state >= 0; state--) {
2373                 u64 total_latency_us, exit_latency_us, transition_ms;
2374
2375                 if (target)
2376                         table->entries[state] = target;
2377
2378                 /*
2379                  * Don't allow transitions to the deepest state if it's quirked
2380                  * off.
2381                  */
2382                 if (state == ctrl->npss &&
2383                     (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
2384                         continue;
2385
2386                 /*
2387                  * Is this state a useful non-operational state for higher-power
2388                  * states to autonomously transition to?
2389                  */
2390                 if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE))
2391                         continue;
2392
2393                 exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
2394                 if (exit_latency_us > ctrl->ps_max_latency_us)
2395                         continue;
2396
2397                 total_latency_us = exit_latency_us +
2398                         le32_to_cpu(ctrl->psd[state].entry_lat);
2399
2400                 /*
2401                  * This state is good. It can be used as the APST idle target
2402                  * for higher power states.
2403                  */
2404                 if (apst_primary_timeout_ms && apst_primary_latency_tol_us) {
2405                         if (!nvme_apst_get_transition_time(total_latency_us,
2406                                         &transition_ms, &last_lt_index))
2407                                 continue;
2408                 } else {
2409                         transition_ms = total_latency_us + 19;
2410                         do_div(transition_ms, 20);
2411                         if (transition_ms > (1 << 24) - 1)
2412                                 transition_ms = (1 << 24) - 1;
2413                 }
2414
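                /*
                 * APST table entries pack the Idle Transition Power State
                 * into bits 7:3 and the Idle Time Prior to Transition (in
                 * milliseconds) into bits 31:8, hence the shifts below.
                 */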
2415                 target = cpu_to_le64((state << 3) | (transition_ms << 8));
2416                 if (max_ps == -1)
2417                         max_ps = state;
2418                 if (total_latency_us > max_lat_us)
2419                         max_lat_us = total_latency_us;
2420         }
2421
2422         if (max_ps == -1)
2423                 dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
2424         else
2425                 dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
2426                         max_ps, max_lat_us, (int)sizeof(*table), table);
2427         apste = 1;
2428
2429 done:
2430         ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
2431                                 table, sizeof(*table), NULL);
2432         if (ret)
2433                 dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
2434         kfree(table);
2435         return ret;
2436 }
2437
2438 static void nvme_set_latency_tolerance(struct device *dev, s32 val)
2439 {
2440         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2441         u64 latency;
2442
2443         switch (val) {
2444         case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
2445         case PM_QOS_LATENCY_ANY:
2446                 latency = U64_MAX;
2447                 break;
2448
2449         default:
2450                 latency = val;
2451         }
2452
2453         if (ctrl->ps_max_latency_us != latency) {
2454                 ctrl->ps_max_latency_us = latency;
2455                 if (ctrl->state == NVME_CTRL_LIVE)
2456                         nvme_configure_apst(ctrl);
2457         }
2458 }
2459
2460 struct nvme_core_quirk_entry {
2461         /*
2462          * NVMe model and firmware strings are padded with spaces.  For
2463          * simplicity, strings in the quirk table are padded with NULLs
2464          * instead.
2465          */
2466         u16 vid;
2467         const char *mn;
2468         const char *fr;
2469         unsigned long quirks;
2470 };
2471
2472 static const struct nvme_core_quirk_entry core_quirks[] = {
2473         {
2474                 /*
2475                  * This Toshiba device seems to die using any APST states.  See:
2476                  * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
2477                  */
2478                 .vid = 0x1179,
2479                 .mn = "THNSF5256GPUK TOSHIBA",
2480                 .quirks = NVME_QUIRK_NO_APST,
2481         },
2482         {
2483                 /*
2484                  * This LiteON CL1-3D*-Q11 firmware version has a race
2485                  * condition associated with actions related to suspend to idle.
2486                  * LiteON has resolved the problem in future firmware.
2487                  */
2488                 .vid = 0x14a4,
2489                 .fr = "22301111",
2490                 .quirks = NVME_QUIRK_SIMPLE_SUSPEND,
2491         },
2492         {
2493                 /*
2494                  * This Kioxia CD6-V Series / HPE PE8030 device times out and
2495                  * aborts I/O during any load, but more easily reproducible
2496                  * with discards (fstrim).
2497                  *
2498                  * The device is left in a state where it is also not possible
2499                  * to use "nvme set-feature" to disable APST, but booting with
2500                  * nvme_core.default_ps_max_latency=0 works.
2501                  */
2502                 .vid = 0x1e0f,
2503                 .mn = "KCD6XVUL6T40",
2504                 .quirks = NVME_QUIRK_NO_APST,
2505         }
2506 };
2507
2508 /* match is null-terminated but idstr is space-padded. */
2509 static bool string_matches(const char *idstr, const char *match, size_t len)
2510 {
2511         size_t matchlen;
2512
2513         if (!match)
2514                 return true;
2515
2516         matchlen = strlen(match);
2517         WARN_ON_ONCE(matchlen > len);
2518
2519         if (memcmp(idstr, match, matchlen))
2520                 return false;
2521
2522         for (; matchlen < len; matchlen++)
2523                 if (idstr[matchlen] != ' ')
2524                         return false;
2525
2526         return true;
2527 }
2528
2529 static bool quirk_matches(const struct nvme_id_ctrl *id,
2530                           const struct nvme_core_quirk_entry *q)
2531 {
2532         return q->vid == le16_to_cpu(id->vid) &&
2533                 string_matches(id->mn, q->mn, sizeof(id->mn)) &&
2534                 string_matches(id->fr, q->fr, sizeof(id->fr));
2535 }
2536
2537 static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
2538                 struct nvme_id_ctrl *id)
2539 {
2540         size_t nqnlen;
2541         int off;
2542
2543         if (!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
2544                 nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
2545                 if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
2546                         strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
2547                         return;
2548                 }
2549
2550                 if (ctrl->vs >= NVME_VS(1, 2, 1))
2551                         dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
2552         }
2553
2554         /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
2555         off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
2556                         "nqn.2014.08.org.nvmexpress:%04x%04x",
2557                         le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
2558         memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
2559         off += sizeof(id->sn);
2560         memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
2561         off += sizeof(id->mn);
2562         memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
2563 }
2564
2565 static void nvme_release_subsystem(struct device *dev)
2566 {
2567         struct nvme_subsystem *subsys =
2568                 container_of(dev, struct nvme_subsystem, dev);
2569
2570         if (subsys->instance >= 0)
2571                 ida_free(&nvme_instance_ida, subsys->instance);
2572         kfree(subsys);
2573 }
2574
2575 static void nvme_destroy_subsystem(struct kref *ref)
2576 {
2577         struct nvme_subsystem *subsys =
2578                         container_of(ref, struct nvme_subsystem, ref);
2579
2580         mutex_lock(&nvme_subsystems_lock);
2581         list_del(&subsys->entry);
2582         mutex_unlock(&nvme_subsystems_lock);
2583
2584         ida_destroy(&subsys->ns_ida);
2585         device_del(&subsys->dev);
2586         put_device(&subsys->dev);
2587 }
2588
2589 static void nvme_put_subsystem(struct nvme_subsystem *subsys)
2590 {
2591         kref_put(&subsys->ref, nvme_destroy_subsystem);
2592 }
2593
2594 static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
2595 {
2596         struct nvme_subsystem *subsys;
2597
2598         lockdep_assert_held(&nvme_subsystems_lock);
2599
2600         /*
2601          * Fail matches for discovery subsystems. This results in each
2602          * discovery controller being bound to a unique subsystem.
2603          * This avoids issues with validating controller values
2604          * that can only be true when there is a single unique subsystem.
2605          * There may be multiple and completely independent entities
2606          * that provide discovery controllers.
2607          */
2608         if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
2609                 return NULL;
2610
2611         list_for_each_entry(subsys, &nvme_subsystems, entry) {
2612                 if (strcmp(subsys->subnqn, subsysnqn))
2613                         continue;
2614                 if (!kref_get_unless_zero(&subsys->ref))
2615                         continue;
2616                 return subsys;
2617         }
2618
2619         return NULL;
2620 }
2621
2622 #define SUBSYS_ATTR_RO(_name, _mode, _show)                     \
2623         struct device_attribute subsys_attr_##_name = \
2624                 __ATTR(_name, _mode, _show, NULL)
2625
2626 static ssize_t nvme_subsys_show_nqn(struct device *dev,
2627                                     struct device_attribute *attr,
2628                                     char *buf)
2629 {
2630         struct nvme_subsystem *subsys =
2631                 container_of(dev, struct nvme_subsystem, dev);
2632
2633         return sysfs_emit(buf, "%s\n", subsys->subnqn);
2634 }
2635 static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
2636
2637 static ssize_t nvme_subsys_show_type(struct device *dev,
2638                                     struct device_attribute *attr,
2639                                     char *buf)
2640 {
2641         struct nvme_subsystem *subsys =
2642                 container_of(dev, struct nvme_subsystem, dev);
2643
2644         switch (subsys->subtype) {
2645         case NVME_NQN_DISC:
2646                 return sysfs_emit(buf, "discovery\n");
2647         case NVME_NQN_NVME:
2648                 return sysfs_emit(buf, "nvm\n");
2649         default:
2650                 return sysfs_emit(buf, "reserved\n");
2651         }
2652 }
2653 static SUBSYS_ATTR_RO(subsystype, S_IRUGO, nvme_subsys_show_type);
2654
2655 #define nvme_subsys_show_str_function(field)                            \
2656 static ssize_t subsys_##field##_show(struct device *dev,                \
2657                             struct device_attribute *attr, char *buf)   \
2658 {                                                                       \
2659         struct nvme_subsystem *subsys =                                 \
2660                 container_of(dev, struct nvme_subsystem, dev);          \
2661         return sysfs_emit(buf, "%.*s\n",                                \
2662                            (int)sizeof(subsys->field), subsys->field);  \
2663 }                                                                       \
2664 static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show);
2665
2666 nvme_subsys_show_str_function(model);
2667 nvme_subsys_show_str_function(serial);
2668 nvme_subsys_show_str_function(firmware_rev);
2669
2670 static struct attribute *nvme_subsys_attrs[] = {
2671         &subsys_attr_model.attr,
2672         &subsys_attr_serial.attr,
2673         &subsys_attr_firmware_rev.attr,
2674         &subsys_attr_subsysnqn.attr,
2675         &subsys_attr_subsystype.attr,
2676 #ifdef CONFIG_NVME_MULTIPATH
2677         &subsys_attr_iopolicy.attr,
2678 #endif
2679         NULL,
2680 };
2681
2682 static const struct attribute_group nvme_subsys_attrs_group = {
2683         .attrs = nvme_subsys_attrs,
2684 };
2685
2686 static const struct attribute_group *nvme_subsys_attrs_groups[] = {
2687         &nvme_subsys_attrs_group,
2688         NULL,
2689 };
2690
2691 static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
2692 {
2693         return ctrl->opts && ctrl->opts->discovery_nqn;
2694 }
2695
2696 static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
2697                 struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2698 {
2699         struct nvme_ctrl *tmp;
2700
2701         lockdep_assert_held(&nvme_subsystems_lock);
2702
2703         list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
2704                 if (nvme_state_terminal(tmp))
2705                         continue;
2706
2707                 if (tmp->cntlid == ctrl->cntlid) {
2708                         dev_err(ctrl->device,
2709                                 "Duplicate cntlid %u with %s, subsys %s, rejecting\n",
2710                                 ctrl->cntlid, dev_name(tmp->device),
2711                                 subsys->subnqn);
2712                         return false;
2713                 }
2714
2715                 if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
2716                     nvme_discovery_ctrl(ctrl))
2717                         continue;
2718
2719                 dev_err(ctrl->device,
2720                         "Subsystem does not support multiple controllers\n");
2721                 return false;
2722         }
2723
2724         return true;
2725 }
2726
2727 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2728 {
2729         struct nvme_subsystem *subsys, *found;
2730         int ret;
2731
2732         subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
2733         if (!subsys)
2734                 return -ENOMEM;
2735
2736         subsys->instance = -1;
2737         mutex_init(&subsys->lock);
2738         kref_init(&subsys->ref);
2739         INIT_LIST_HEAD(&subsys->ctrls);
2740         INIT_LIST_HEAD(&subsys->nsheads);
2741         nvme_init_subnqn(subsys, ctrl, id);
2742         memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
2743         memcpy(subsys->model, id->mn, sizeof(subsys->model));
2744         memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
2745         subsys->vendor_id = le16_to_cpu(id->vid);
2746         subsys->cmic = id->cmic;
2747
2748         /* Versions prior to 1.4 don't necessarily report a valid type */
2749         if (id->cntrltype == NVME_CTRL_DISC ||
2750             !strcmp(subsys->subnqn, NVME_DISC_SUBSYS_NAME))
2751                 subsys->subtype = NVME_NQN_DISC;
2752         else
2753                 subsys->subtype = NVME_NQN_NVME;
2754
2755         if (nvme_discovery_ctrl(ctrl) && subsys->subtype != NVME_NQN_DISC) {
2756                 dev_err(ctrl->device,
2757                         "Subsystem %s is not a discovery controller",
2758                         subsys->subnqn);
2759                 kfree(subsys);
2760                 return -EINVAL;
2761         }
2762         subsys->awupf = le16_to_cpu(id->awupf);
2763         nvme_mpath_default_iopolicy(subsys);
2764
2765         subsys->dev.class = nvme_subsys_class;
2766         subsys->dev.release = nvme_release_subsystem;
2767         subsys->dev.groups = nvme_subsys_attrs_groups;
2768         dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
2769         device_initialize(&subsys->dev);
2770
2771         mutex_lock(&nvme_subsystems_lock);
2772         found = __nvme_find_get_subsystem(subsys->subnqn);
2773         if (found) {
2774                 put_device(&subsys->dev);
2775                 subsys = found;
2776
2777                 if (!nvme_validate_cntlid(subsys, ctrl, id)) {
2778                         ret = -EINVAL;
2779                         goto out_put_subsystem;
2780                 }
2781         } else {
2782                 ret = device_add(&subsys->dev);
2783                 if (ret) {
2784                         dev_err(ctrl->device,
2785                                 "failed to register subsystem device.\n");
2786                         put_device(&subsys->dev);
2787                         goto out_unlock;
2788                 }
2789                 ida_init(&subsys->ns_ida);
2790                 list_add_tail(&subsys->entry, &nvme_subsystems);
2791         }
2792
2793         ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
2794                                 dev_name(ctrl->device));
2795         if (ret) {
2796                 dev_err(ctrl->device,
2797                         "failed to create sysfs link from subsystem.\n");
2798                 goto out_put_subsystem;
2799         }
2800
2801         if (!found)
2802                 subsys->instance = ctrl->instance;
2803         ctrl->subsys = subsys;
2804         list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
2805         mutex_unlock(&nvme_subsystems_lock);
2806         return 0;
2807
2808 out_put_subsystem:
2809         nvme_put_subsystem(subsys);
2810 out_unlock:
2811         mutex_unlock(&nvme_subsystems_lock);
2812         return ret;
2813 }
2814
2815 int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
2816                 void *log, size_t size, u64 offset)
2817 {
2818         struct nvme_command c = { };
2819         u32 dwlen = nvme_bytes_to_numd(size);
2820
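        /*
         * The length is passed as a zero's based dword count split across
         * NUMDL/NUMDU, and the byte offset across LPOL/LPOU.
         */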
2821         c.get_log_page.opcode = nvme_admin_get_log_page;
2822         c.get_log_page.nsid = cpu_to_le32(nsid);
2823         c.get_log_page.lid = log_page;
2824         c.get_log_page.lsp = lsp;
2825         c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
2826         c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
2827         c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
2828         c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
2829         c.get_log_page.csi = csi;
2830
2831         return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
2832 }
2833
2834 static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
2835                                 struct nvme_effects_log **log)
2836 {
2837         struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi);
2838         int ret;
2839
2840         if (cel)
2841                 goto out;
2842
2843         cel = kzalloc(sizeof(*cel), GFP_KERNEL);
2844         if (!cel)
2845                 return -ENOMEM;
2846
2847         ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
2848                         cel, sizeof(*cel), 0);
2849         if (ret) {
2850                 kfree(cel);
2851                 return ret;
2852         }
2853
2854         xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
2855 out:
2856         *log = cel;
2857         return 0;
2858 }
2859
2860 static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units)
2861 {
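        /*
         * MDTS-style fields are power-of-two exponents in units of the
         * minimum memory page size (CAP.MPSMIN); convert to 512 byte
         * sectors and return UINT_MAX on overflow.
         */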
2862         u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12, val;
2863
2864         if (check_shl_overflow(1U, units + page_shift - 9, &val))
2865                 return UINT_MAX;
2866         return val;
2867 }
2868
2869 static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
2870 {
2871         struct nvme_command c = { };
2872         struct nvme_id_ctrl_nvm *id;
2873         int ret;
2874
2875         if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
2876                 ctrl->max_discard_sectors = UINT_MAX;
2877                 ctrl->max_discard_segments = NVME_DSM_MAX_RANGES;
2878         } else {
2879                 ctrl->max_discard_sectors = 0;
2880                 ctrl->max_discard_segments = 0;
2881         }
2882
2883         /*
2884          * Even though the NVMe spec explicitly states that MDTS is not applicable
2885          * to Write Zeroes, we are cautious and limit the size to the
2886          * controller's max_hw_sectors value, which is based on the MDTS field
2887          * and possibly other limiting factors.
2888          */
2889         if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
2890             !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
2891                 ctrl->max_zeroes_sectors = ctrl->max_hw_sectors;
2892         else
2893                 ctrl->max_zeroes_sectors = 0;
2894
2895         if (nvme_ctrl_limited_cns(ctrl))
2896                 return 0;
2897
2898         id = kzalloc(sizeof(*id), GFP_KERNEL);
2899         if (!id)
2900                 return 0;
2901
2902         c.identify.opcode = nvme_admin_identify;
2903         c.identify.cns = NVME_ID_CNS_CS_CTRL;
2904         c.identify.csi = NVME_CSI_NVM;
2905
2906         ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
2907         if (ret)
2908                 goto free_data;
2909
2910         if (id->dmrl)
2911                 ctrl->max_discard_segments = id->dmrl;
2912         if (id->dmrsl)
2913                 ctrl->max_discard_sectors = le32_to_cpu(id->dmrsl);
2914         if (id->wzsl)
2915                 ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
2916
2917 free_data:
2918         kfree(id);
2919         return ret;
2920 }
2921
2922 static int nvme_init_identify(struct nvme_ctrl *ctrl)
2923 {
2924         struct nvme_id_ctrl *id;
2925         u32 max_hw_sectors;
2926         bool prev_apst_enabled;
2927         int ret;
2928
2929         ret = nvme_identify_ctrl(ctrl, &id);
2930         if (ret) {
2931                 dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
2932                 return -EIO;
2933         }
2934
2935         if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
2936                 ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
2937                 if (ret < 0)
2938                         goto out_free;
2939         }
2940
2941         if (!(ctrl->ops->flags & NVME_F_FABRICS))
2942                 ctrl->cntlid = le16_to_cpu(id->cntlid);
2943
2944         if (!ctrl->identified) {
2945                 unsigned int i;
2946
2947                 ret = nvme_init_subsystem(ctrl, id);
2948                 if (ret)
2949                         goto out_free;
2950
2951                 /*
2952                  * Check for quirks.  Quirks can depend on the firmware version,
2953                  * so, in principle, the set of quirks present can change
2954                  * across a reset.  As a possible future enhancement, we
2955                  * could re-scan for quirks every time we reinitialize
2956                  * the device, but we'd have to make sure that the driver
2957                  * behaves intelligently if the quirks change.
2958                  */
2959                 for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
2960                         if (quirk_matches(id, &core_quirks[i]))
2961                                 ctrl->quirks |= core_quirks[i].quirks;
2962                 }
2963         }
2964
2965         if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
2966                 dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
2967                 ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
2968         }
2969
2970         ctrl->crdt[0] = le16_to_cpu(id->crdt1);
2971         ctrl->crdt[1] = le16_to_cpu(id->crdt2);
2972         ctrl->crdt[2] = le16_to_cpu(id->crdt3);
2973
2974         ctrl->oacs = le16_to_cpu(id->oacs);
2975         ctrl->oncs = le16_to_cpu(id->oncs);
2976         ctrl->mtfa = le16_to_cpu(id->mtfa);
2977         ctrl->oaes = le32_to_cpu(id->oaes);
2978         ctrl->wctemp = le16_to_cpu(id->wctemp);
2979         ctrl->cctemp = le16_to_cpu(id->cctemp);
2980
2981         atomic_set(&ctrl->abort_limit, id->acl + 1);
2982         ctrl->vwc = id->vwc;
2983         if (id->mdts)
2984                 max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts);
2985         else
2986                 max_hw_sectors = UINT_MAX;
2987         ctrl->max_hw_sectors =
2988                 min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
2989
2990         nvme_set_queue_limits(ctrl, ctrl->admin_q);
2991         ctrl->sgls = le32_to_cpu(id->sgls);
2992         ctrl->kas = le16_to_cpu(id->kas);
2993         ctrl->max_namespaces = le32_to_cpu(id->mnan);
2994         ctrl->ctratt = le32_to_cpu(id->ctratt);
2995
2996         ctrl->cntrltype = id->cntrltype;
2997         ctrl->dctype = id->dctype;
2998
2999         if (id->rtd3e) {
3000                 /* us -> s */
3001                 u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
3002
3003                 ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
3004                                                  shutdown_timeout, 60);
3005
3006                 if (ctrl->shutdown_timeout != shutdown_timeout)
3007                         dev_info(ctrl->device,
3008                                  "Shutdown timeout set to %u seconds\n",
3009                                  ctrl->shutdown_timeout);
3010         } else
3011                 ctrl->shutdown_timeout = shutdown_timeout;
3012
3013         ctrl->npss = id->npss;
3014         ctrl->apsta = id->apsta;
3015         prev_apst_enabled = ctrl->apst_enabled;
3016         if (ctrl->quirks & NVME_QUIRK_NO_APST) {
3017                 if (force_apst && id->apsta) {
3018                         dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
3019                         ctrl->apst_enabled = true;
3020                 } else {
3021                         ctrl->apst_enabled = false;
3022                 }
3023         } else {
3024                 ctrl->apst_enabled = id->apsta;
3025         }
3026         memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
3027
3028         if (ctrl->ops->flags & NVME_F_FABRICS) {
3029                 ctrl->icdoff = le16_to_cpu(id->icdoff);
3030                 ctrl->ioccsz = le32_to_cpu(id->ioccsz);
3031                 ctrl->iorcsz = le32_to_cpu(id->iorcsz);
3032                 ctrl->maxcmd = le16_to_cpu(id->maxcmd);
3033
3034                 /*
3035                  * In fabrics we need to verify that the cntlid matches the
3036                  * one returned by the admin connect.
3037                  */
3038                 if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
3039                         dev_err(ctrl->device,
3040                                 "Mismatching cntlid: Connect %u vs Identify "
3041                                 "%u, rejecting\n",
3042                                 ctrl->cntlid, le16_to_cpu(id->cntlid));
3043                         ret = -EINVAL;
3044                         goto out_free;
3045                 }
3046
3047                 if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
3048                         dev_err(ctrl->device,
3049                                 "keep-alive support is mandatory for fabrics\n");
3050                         ret = -EINVAL;
3051                         goto out_free;
3052                 }
3053         } else {
3054                 ctrl->hmpre = le32_to_cpu(id->hmpre);
3055                 ctrl->hmmin = le32_to_cpu(id->hmmin);
3056                 ctrl->hmminds = le32_to_cpu(id->hmminds);
3057                 ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
3058         }
3059
3060         ret = nvme_mpath_init_identify(ctrl, id);
3061         if (ret < 0)
3062                 goto out_free;
3063
3064         if (ctrl->apst_enabled && !prev_apst_enabled)
3065                 dev_pm_qos_expose_latency_tolerance(ctrl->device);
3066         else if (!ctrl->apst_enabled && prev_apst_enabled)
3067                 dev_pm_qos_hide_latency_tolerance(ctrl->device);
3068
3069 out_free:
3070         kfree(id);
3071         return ret;
3072 }
3073
3074 /*
3075  * Initialize the cached copies of the Identify data and various controller
3076  * registers in our nvme_ctrl structure.  This should be called as soon as
3077  * the admin queue is fully up and running.
3078  */
3079 int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl)
3080 {
3081         int ret;
3082
3083         ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
3084         if (ret) {
3085                 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
3086                 return ret;
3087         }
3088
3089         ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
3090
3091         if (ctrl->vs >= NVME_VS(1, 1, 0))
3092                 ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
3093
3094         ret = nvme_init_identify(ctrl);
3095         if (ret)
3096                 return ret;
3097
3098         ret = nvme_init_non_mdts_limits(ctrl);
3099         if (ret < 0)
3100                 return ret;
3101
3102         ret = nvme_configure_apst(ctrl);
3103         if (ret < 0)
3104                 return ret;
3105
3106         ret = nvme_configure_timestamp(ctrl);
3107         if (ret < 0)
3108                 return ret;
3109
3110         ret = nvme_configure_directives(ctrl);
3111         if (ret < 0)
3112                 return ret;
3113
3114         ret = nvme_configure_acre(ctrl);
3115         if (ret < 0)
3116                 return ret;
3117
3118         if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
3119                 ret = nvme_hwmon_init(ctrl);
3120                 if (ret < 0)
3121                         return ret;
3122         }
3123
3124         ctrl->identified = true;
3125
3126         return 0;
3127 }
3128 EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish);
3129
3130 static int nvme_dev_open(struct inode *inode, struct file *file)
3131 {
3132         struct nvme_ctrl *ctrl =
3133                 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3134
3135         switch (ctrl->state) {
3136         case NVME_CTRL_LIVE:
3137                 break;
3138         default:
3139                 return -EWOULDBLOCK;
3140         }
3141
3142         nvme_get_ctrl(ctrl);
3143         if (!try_module_get(ctrl->ops->module)) {
3144                 nvme_put_ctrl(ctrl);
3145                 return -EINVAL;
3146         }
3147
3148         file->private_data = ctrl;
3149         return 0;
3150 }
3151
3152 static int nvme_dev_release(struct inode *inode, struct file *file)
3153 {
3154         struct nvme_ctrl *ctrl =
3155                 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3156
3157         module_put(ctrl->ops->module);
3158         nvme_put_ctrl(ctrl);
3159         return 0;
3160 }
3161
3162 static const struct file_operations nvme_dev_fops = {
3163         .owner          = THIS_MODULE,
3164         .open           = nvme_dev_open,
3165         .release        = nvme_dev_release,
3166         .unlocked_ioctl = nvme_dev_ioctl,
3167         .compat_ioctl   = compat_ptr_ioctl,
3168 };
3169
3170 static ssize_t nvme_sysfs_reset(struct device *dev,
3171                                 struct device_attribute *attr, const char *buf,
3172                                 size_t count)
3173 {
3174         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3175         int ret;
3176
3177         ret = nvme_reset_ctrl_sync(ctrl);
3178         if (ret < 0)
3179                 return ret;
3180         return count;
3181 }
3182 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
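/*
 * Example (illustrative controller instance): any write triggers a
 * synchronous reset of the controller:
 *
 *   echo 1 > /sys/class/nvme/nvme0/reset_controller
 */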
3183
3184 static ssize_t nvme_sysfs_rescan(struct device *dev,
3185                                 struct device_attribute *attr, const char *buf,
3186                                 size_t count)
3187 {
3188         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3189
3190         nvme_queue_scan(ctrl);
3191         return count;
3192 }
3193 static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
3194
3195 static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev)
3196 {
3197         struct gendisk *disk = dev_to_disk(dev);
3198
3199         if (disk->fops == &nvme_bdev_ops)
3200                 return nvme_get_ns_from_dev(dev)->head;
3201         else
3202                 return disk->private_data;
3203 }
3204
3205 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
3206                 char *buf)
3207 {
3208         struct nvme_ns_head *head = dev_to_ns_head(dev);
3209         struct nvme_ns_ids *ids = &head->ids;
3210         struct nvme_subsystem *subsys = head->subsys;
3211         int serial_len = sizeof(subsys->serial);
3212         int model_len = sizeof(subsys->model);
3213
3214         if (!uuid_is_null(&ids->uuid))
3215                 return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid);
3216
3217         if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
3218                 return sysfs_emit(buf, "eui.%16phN\n", ids->nguid);
3219
3220         if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
3221                 return sysfs_emit(buf, "eui.%8phN\n", ids->eui64);
3222
3223         while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||
3224                                   subsys->serial[serial_len - 1] == '\0'))
3225                 serial_len--;
3226         while (model_len > 0 && (subsys->model[model_len - 1] == ' ' ||
3227                                  subsys->model[model_len - 1] == '\0'))
3228                 model_len--;
3229
3230         return sysfs_emit(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
3231                 serial_len, subsys->serial, model_len, subsys->model,
3232                 head->ns_id);
3233 }
3234 static DEVICE_ATTR_RO(wwid);
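/*
 * Illustrative wwid output, depending on which identifiers the namespace
 * reports: "uuid.<uuid>", "eui.<nguid or eui64>", or the fallback
 * "nvme.<vendor id>-<serial>-<model>-<nsid>" built above.
 */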
3235
3236 static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
3237                 char *buf)
3238 {
3239         return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
3240 }
3241 static DEVICE_ATTR_RO(nguid);
3242
3243 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
3244                 char *buf)
3245 {
3246         struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
3247
3248         /* For backward compatibility expose the NGUID to userspace if
3249          * we have no UUID set
3250          */
3251         if (uuid_is_null(&ids->uuid)) {
3252                 printk_ratelimited(KERN_WARNING
3253                                    "No UUID available, providing old NGUID\n");
3254                 return sysfs_emit(buf, "%pU\n", ids->nguid);
3255         }
3256         return sysfs_emit(buf, "%pU\n", &ids->uuid);
3257 }
3258 static DEVICE_ATTR_RO(uuid);
3259
3260 static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
3261                 char *buf)
3262 {
3263         return sysfs_emit(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
3264 }
3265 static DEVICE_ATTR_RO(eui);
3266
3267 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
3268                 char *buf)
3269 {
3270         return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
3271 }
3272 static DEVICE_ATTR_RO(nsid);
3273
3274 static struct attribute *nvme_ns_id_attrs[] = {
3275         &dev_attr_wwid.attr,
3276         &dev_attr_uuid.attr,
3277         &dev_attr_nguid.attr,
3278         &dev_attr_eui.attr,
3279         &dev_attr_nsid.attr,
3280 #ifdef CONFIG_NVME_MULTIPATH
3281         &dev_attr_ana_grpid.attr,
3282         &dev_attr_ana_state.attr,
3283 #endif
3284         NULL,
3285 };
3286
3287 static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
3288                 struct attribute *a, int n)
3289 {
3290         struct device *dev = container_of(kobj, struct device, kobj);
3291         struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
3292
3293         if (a == &dev_attr_uuid.attr) {
3294                 if (uuid_is_null(&ids->uuid) &&
3295                     !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
3296                         return 0;
3297         }
3298         if (a == &dev_attr_nguid.attr) {
3299                 if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
3300                         return 0;
3301         }
3302         if (a == &dev_attr_eui.attr) {
3303                 if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
3304                         return 0;
3305         }
3306 #ifdef CONFIG_NVME_MULTIPATH
3307         if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) {
3308                 if (dev_to_disk(dev)->fops != &nvme_bdev_ops) /* per-path attr */
3309                         return 0;
3310                 if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
3311                         return 0;
3312         }
3313 #endif
3314         return a->mode;
3315 }
3316
3317 static const struct attribute_group nvme_ns_id_attr_group = {
3318         .attrs          = nvme_ns_id_attrs,
3319         .is_visible     = nvme_ns_id_attrs_are_visible,
3320 };
3321
3322 const struct attribute_group *nvme_ns_id_attr_groups[] = {
3323         &nvme_ns_id_attr_group,
3324         NULL,
3325 };
3326
3327 #define nvme_show_str_function(field)                                           \
3328 static ssize_t  field##_show(struct device *dev,                                \
3329                             struct device_attribute *attr, char *buf)           \
3330 {                                                                               \
3331         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);                          \
3332         return sysfs_emit(buf, "%.*s\n",                                        \
3333                 (int)sizeof(ctrl->subsys->field), ctrl->subsys->field);         \
3334 }                                                                               \
3335 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
3336
3337 nvme_show_str_function(model);
3338 nvme_show_str_function(serial);
3339 nvme_show_str_function(firmware_rev);
3340
3341 #define nvme_show_int_function(field)                                           \
3342 static ssize_t  field##_show(struct device *dev,                                \
3343                             struct device_attribute *attr, char *buf)           \
3344 {                                                                               \
3345         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);                          \
3346         return sysfs_emit(buf, "%d\n", ctrl->field);                            \
3347 }                                                                               \
3348 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
3349
3350 nvme_show_int_function(cntlid);
3351 nvme_show_int_function(numa_node);
3352 nvme_show_int_function(queue_count);
3353 nvme_show_int_function(sqsize);
3354 nvme_show_int_function(kato);
3355
3356 static ssize_t nvme_sysfs_delete(struct device *dev,
3357                                 struct device_attribute *attr, const char *buf,
3358                                 size_t count)
3359 {
3360         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3361
3362         if (device_remove_file_self(dev, attr))
3363                 nvme_delete_ctrl_sync(ctrl);
3364         return count;
3365 }
3366 static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
3367
3368 static ssize_t nvme_sysfs_show_transport(struct device *dev,
3369                                          struct device_attribute *attr,
3370                                          char *buf)
3371 {
3372         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3373
3374         return sysfs_emit(buf, "%s\n", ctrl->ops->name);
3375 }
3376 static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
3377
3378 static ssize_t nvme_sysfs_show_state(struct device *dev,
3379                                      struct device_attribute *attr,
3380                                      char *buf)
3381 {
3382         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3383         static const char *const state_name[] = {
3384                 [NVME_CTRL_NEW]         = "new",
3385                 [NVME_CTRL_LIVE]        = "live",
3386                 [NVME_CTRL_RESETTING]   = "resetting",
3387                 [NVME_CTRL_CONNECTING]  = "connecting",
3388                 [NVME_CTRL_DELETING]    = "deleting",
3389                 [NVME_CTRL_DELETING_NOIO] = "deleting (no IO)",
3390                 [NVME_CTRL_DEAD]        = "dead",
3391         };
3392
3393         if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
3394             state_name[ctrl->state])
3395                 return sysfs_emit(buf, "%s\n", state_name[ctrl->state]);
3396
3397         return sysfs_emit(buf, "unknown state\n");
3398 }
3399
3400 static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
3401
3402 static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
3403                                          struct device_attribute *attr,
3404                                          char *buf)
3405 {
3406         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3407
3408         return sysfs_emit(buf, "%s\n", ctrl->subsys->subnqn);
3409 }
3410 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
3411
3412 static ssize_t nvme_sysfs_show_hostnqn(struct device *dev,
3413                                         struct device_attribute *attr,
3414                                         char *buf)
3415 {
3416         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3417
3418         return sysfs_emit(buf, "%s\n", ctrl->opts->host->nqn);
3419 }
3420 static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL);
3421
3422 static ssize_t nvme_sysfs_show_hostid(struct device *dev,
3423                                         struct device_attribute *attr,
3424                                         char *buf)
3425 {
3426         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3427
3428         return sysfs_emit(buf, "%pU\n", &ctrl->opts->host->id);
3429 }
3430 static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL);
3431
3432 static ssize_t nvme_sysfs_show_address(struct device *dev,
3433                                          struct device_attribute *attr,
3434                                          char *buf)
3435 {
3436         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3437
3438         return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
3439 }
3440 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
3441
3442 static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,
3443                 struct device_attribute *attr, char *buf)
3444 {
3445         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3446         struct nvmf_ctrl_options *opts = ctrl->opts;
3447
3448         if (ctrl->opts->max_reconnects == -1)
3449                 return sysfs_emit(buf, "off\n");
3450         return sysfs_emit(buf, "%d\n",
3451                           opts->max_reconnects * opts->reconnect_delay);
3452 }
3453
3454 static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,
3455                 struct device_attribute *attr, const char *buf, size_t count)
3456 {
3457         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3458         struct nvmf_ctrl_options *opts = ctrl->opts;
3459         int ctrl_loss_tmo, err;
3460
3461         err = kstrtoint(buf, 10, &ctrl_loss_tmo);
3462         if (err)
3463                 return -EINVAL;
3464
3465         if (ctrl_loss_tmo < 0)
3466                 opts->max_reconnects = -1;
3467         else
3468                 opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
3469                                                 opts->reconnect_delay);
3470         return count;
3471 }
3472 static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR,
3473         nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store);
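/*
 * Example: with a reconnect_delay of 10 seconds, writing 600 to
 * ctrl_loss_tmo allows up to DIV_ROUND_UP(600, 10) == 60 reconnect
 * attempts; writing a negative value disables the controller loss
 * timeout entirely (shown as "off").
 */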
3474
3475 static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,
3476                 struct device_attribute *attr, char *buf)
3477 {
3478         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3479
3480         if (ctrl->opts->reconnect_delay == -1)
3481                 return sysfs_emit(buf, "off\n");
3482         return sysfs_emit(buf, "%d\n", ctrl->opts->reconnect_delay);
3483 }
3484
3485 static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,
3486                 struct device_attribute *attr, const char *buf, size_t count)
3487 {
3488         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3489         unsigned int v;
3490         int err;
3491
3492         err = kstrtou32(buf, 10, &v);
3493         if (err)
3494                 return err;
3495
3496         ctrl->opts->reconnect_delay = v;
3497         return count;
3498 }
3499 static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,
3500         nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store);
3501
3502 static ssize_t nvme_ctrl_fast_io_fail_tmo_show(struct device *dev,
3503                 struct device_attribute *attr, char *buf)
3504 {
3505         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3506
3507         if (ctrl->opts->fast_io_fail_tmo == -1)
3508                 return sysfs_emit(buf, "off\n");
3509         return sysfs_emit(buf, "%d\n", ctrl->opts->fast_io_fail_tmo);
3510 }
3511
3512 static ssize_t nvme_ctrl_fast_io_fail_tmo_store(struct device *dev,
3513                 struct device_attribute *attr, const char *buf, size_t count)
3514 {
3515         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3516         struct nvmf_ctrl_options *opts = ctrl->opts;
3517         int fast_io_fail_tmo, err;
3518
3519         err = kstrtoint(buf, 10, &fast_io_fail_tmo);
3520         if (err)
3521                 return -EINVAL;
3522
3523         if (fast_io_fail_tmo < 0)
3524                 opts->fast_io_fail_tmo = -1;
3525         else
3526                 opts->fast_io_fail_tmo = fast_io_fail_tmo;
3527         return count;
3528 }
3529 static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR,
3530         nvme_ctrl_fast_io_fail_tmo_show, nvme_ctrl_fast_io_fail_tmo_store);
3531
3532 static ssize_t cntrltype_show(struct device *dev,
3533                               struct device_attribute *attr, char *buf)
3534 {
3535         static const char * const type[] = {
3536                 [NVME_CTRL_IO] = "io\n",
3537                 [NVME_CTRL_DISC] = "discovery\n",
3538                 [NVME_CTRL_ADMIN] = "admin\n",
3539         };
3540         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3541
3542         if (ctrl->cntrltype > NVME_CTRL_ADMIN || !type[ctrl->cntrltype])
3543                 return sysfs_emit(buf, "reserved\n");
3544
3545         return sysfs_emit(buf, type[ctrl->cntrltype]);
3546 }
3547 static DEVICE_ATTR_RO(cntrltype);
3548
3549 static ssize_t dctype_show(struct device *dev,
3550                            struct device_attribute *attr, char *buf)
3551 {
3552         static const char * const type[] = {
3553                 [NVME_DCTYPE_NOT_REPORTED] = "none\n",
3554                 [NVME_DCTYPE_DDC] = "ddc\n",
3555                 [NVME_DCTYPE_CDC] = "cdc\n",
3556         };
3557         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3558
3559         if (ctrl->dctype > NVME_DCTYPE_CDC || !type[ctrl->dctype])
3560                 return sysfs_emit(buf, "reserved\n");
3561
3562         return sysfs_emit(buf, type[ctrl->dctype]);
3563 }
3564 static DEVICE_ATTR_RO(dctype);
3565
3566 static struct attribute *nvme_dev_attrs[] = {
3567         &dev_attr_reset_controller.attr,
3568         &dev_attr_rescan_controller.attr,
3569         &dev_attr_model.attr,
3570         &dev_attr_serial.attr,
3571         &dev_attr_firmware_rev.attr,
3572         &dev_attr_cntlid.attr,
3573         &dev_attr_delete_controller.attr,
3574         &dev_attr_transport.attr,
3575         &dev_attr_subsysnqn.attr,
3576         &dev_attr_address.attr,
3577         &dev_attr_state.attr,
3578         &dev_attr_numa_node.attr,
3579         &dev_attr_queue_count.attr,
3580         &dev_attr_sqsize.attr,
3581         &dev_attr_hostnqn.attr,
3582         &dev_attr_hostid.attr,
3583         &dev_attr_ctrl_loss_tmo.attr,
3584         &dev_attr_reconnect_delay.attr,
3585         &dev_attr_fast_io_fail_tmo.attr,
3586         &dev_attr_kato.attr,
3587         &dev_attr_cntrltype.attr,
3588         &dev_attr_dctype.attr,
3589         NULL
3590 };
3591
3592 static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
3593                 struct attribute *a, int n)
3594 {
3595         struct device *dev = container_of(kobj, struct device, kobj);
3596         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3597
3598         if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
3599                 return 0;
3600         if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
3601                 return 0;
3602         if (a == &dev_attr_hostnqn.attr && !ctrl->opts)
3603                 return 0;
3604         if (a == &dev_attr_hostid.attr && !ctrl->opts)
3605                 return 0;
3606         if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts)
3607                 return 0;
3608         if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts)
3609                 return 0;
3610         if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts)
3611                 return 0;
3612
3613         return a->mode;
3614 }
3615
3616 static const struct attribute_group nvme_dev_attrs_group = {
3617         .attrs          = nvme_dev_attrs,
3618         .is_visible     = nvme_dev_attrs_are_visible,
3619 };
3620
3621 static const struct attribute_group *nvme_dev_attr_groups[] = {
3622         &nvme_dev_attrs_group,
3623         NULL,
3624 };
3625
3626 static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys,
3627                 unsigned nsid)
3628 {
3629         struct nvme_ns_head *h;
3630
3631         lockdep_assert_held(&subsys->lock);
3632
3633         list_for_each_entry(h, &subsys->nsheads, entry) {
3634                 if (h->ns_id != nsid)
3635                         continue;
3636                 if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
3637                         return h;
3638         }
3639
3640         return NULL;
3641 }
3642
3643 static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys,
3644                 struct nvme_ns_ids *ids)
3645 {
3646         bool has_uuid = !uuid_is_null(&ids->uuid);
3647         bool has_nguid = memchr_inv(ids->nguid, 0, sizeof(ids->nguid));
3648         bool has_eui64 = memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
3649         struct nvme_ns_head *h;
3650
3651         lockdep_assert_held(&subsys->lock);
3652
3653         list_for_each_entry(h, &subsys->nsheads, entry) {
3654                 if (has_uuid && uuid_equal(&ids->uuid, &h->ids.uuid))
3655                         return -EINVAL;
3656                 if (has_nguid &&
3657                     memcmp(&ids->nguid, &h->ids.nguid, sizeof(ids->nguid)) == 0)
3658                         return -EINVAL;
3659                 if (has_eui64 &&
3660                     memcmp(&ids->eui64, &h->ids.eui64, sizeof(ids->eui64)) == 0)
3661                         return -EINVAL;
3662         }
3663
3664         return 0;
3665 }
3666
3667 static void nvme_cdev_rel(struct device *dev)
3668 {
3669         ida_free(&nvme_ns_chr_minor_ida, MINOR(dev->devt));
3670 }
3671
3672 void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device)
3673 {
3674         cdev_device_del(cdev, cdev_device);
3675         put_device(cdev_device);
3676 }
3677
3678 int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
3679                 const struct file_operations *fops, struct module *owner)
3680 {
3681         int minor, ret;
3682
3683         minor = ida_alloc(&nvme_ns_chr_minor_ida, GFP_KERNEL);
3684         if (minor < 0)
3685                 return minor;
3686         cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor);
3687         cdev_device->class = nvme_ns_chr_class;
3688         cdev_device->release = nvme_cdev_rel;
3689         device_initialize(cdev_device);
3690         cdev_init(cdev, fops);
3691         cdev->owner = owner;
3692         ret = cdev_device_add(cdev, cdev_device);
3693         if (ret)
3694                 put_device(cdev_device);
3695
3696         return ret;
3697 }
3698
3699 static int nvme_ns_chr_open(struct inode *inode, struct file *file)
3700 {
3701         return nvme_ns_open(container_of(inode->i_cdev, struct nvme_ns, cdev));
3702 }
3703
3704 static int nvme_ns_chr_release(struct inode *inode, struct file *file)
3705 {
3706         nvme_ns_release(container_of(inode->i_cdev, struct nvme_ns, cdev));
3707         return 0;
3708 }
3709
3710 static const struct file_operations nvme_ns_chr_fops = {
3711         .owner          = THIS_MODULE,
3712         .open           = nvme_ns_chr_open,
3713         .release        = nvme_ns_chr_release,
3714         .unlocked_ioctl = nvme_ns_chr_ioctl,
3715         .compat_ioctl   = compat_ptr_ioctl,
3716 };
3717
3718 static int nvme_add_ns_cdev(struct nvme_ns *ns)
3719 {
3720         int ret;
3721
3722         ns->cdev_device.parent = ns->ctrl->device;
3723         ret = dev_set_name(&ns->cdev_device, "ng%dn%d",
3724                            ns->ctrl->instance, ns->head->instance);
3725         if (ret)
3726                 return ret;
3727
3728         return nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops,
3729                              ns->ctrl->ops->module);
3730 }
3731
3732 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
3733                 unsigned nsid, struct nvme_ns_ids *ids)
3734 {
3735         struct nvme_ns_head *head;
3736         size_t size = sizeof(*head);
3737         int ret = -ENOMEM;
3738
3739 #ifdef CONFIG_NVME_MULTIPATH
3740         size += num_possible_nodes() * sizeof(struct nvme_ns *);
3741 #endif
3742
3743         head = kzalloc(size, GFP_KERNEL);
3744         if (!head)
3745                 goto out;
3746         ret = ida_alloc_min(&ctrl->subsys->ns_ida, 1, GFP_KERNEL);
3747         if (ret < 0)
3748                 goto out_free_head;
3749         head->instance = ret;
3750         INIT_LIST_HEAD(&head->list);
3751         ret = init_srcu_struct(&head->srcu);
3752         if (ret)
3753                 goto out_ida_remove;
3754         head->subsys = ctrl->subsys;
3755         head->ns_id = nsid;
3756         head->ids = *ids;
3757         kref_init(&head->ref);
3758
3759         if (head->ids.csi) {
3760                 ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
3761                 if (ret)
3762                         goto out_cleanup_srcu;
3763         } else
3764                 head->effects = ctrl->effects;
3765
3766         ret = nvme_mpath_alloc_disk(ctrl, head);
3767         if (ret)
3768                 goto out_cleanup_srcu;
3769
3770         list_add_tail(&head->entry, &ctrl->subsys->nsheads);
3771
3772         kref_get(&ctrl->subsys->ref);
3773
3774         return head;
3775 out_cleanup_srcu:
3776         cleanup_srcu_struct(&head->srcu);
3777 out_ida_remove:
3778         ida_free(&ctrl->subsys->ns_ida, head->instance);
3779 out_free_head:
3780         kfree(head);
3781 out:
3782         if (ret > 0)
3783                 ret = blk_status_to_errno(nvme_error_status(ret));
3784         return ERR_PTR(ret);
3785 }
3786
3787 static int nvme_global_check_duplicate_ids(struct nvme_subsystem *this,
3788                 struct nvme_ns_ids *ids)
3789 {
3790         struct nvme_subsystem *s;
3791         int ret = 0;
3792
3793         /*
3794          * Note that this check is racy as we try to avoid holding the global
3795          * lock over the whole ns_head creation.  But it is only intended as
3796          * a sanity check anyway.
3797          */
3798         mutex_lock(&nvme_subsystems_lock);
3799         list_for_each_entry(s, &nvme_subsystems, entry) {
3800                 if (s == this)
3801                         continue;
3802                 mutex_lock(&s->lock);
3803                 ret = nvme_subsys_check_duplicate_ids(s, ids);
3804                 mutex_unlock(&s->lock);
3805                 if (ret)
3806                         break;
3807         }
3808         mutex_unlock(&nvme_subsystems_lock);
3809
3810         return ret;
3811 }
3812
3813 static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
3814                 struct nvme_ns_ids *ids, bool is_shared)
3815 {
3816         struct nvme_ctrl *ctrl = ns->ctrl;
3817         struct nvme_ns_head *head = NULL;
3818         int ret;
3819
3820         ret = nvme_global_check_duplicate_ids(ctrl->subsys, ids);
3821         if (ret) {
3822                 dev_err(ctrl->device,
3823                         "globally duplicate IDs for nsid %d\n", nsid);
3824                 return ret;
3825         }
3826
3827         mutex_lock(&ctrl->subsys->lock);
3828         head = nvme_find_ns_head(ctrl->subsys, nsid);
3829         if (!head) {
3830                 ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, ids);
3831                 if (ret) {
3832                         dev_err(ctrl->device,
3833                                 "duplicate IDs in subsystem for nsid %d\n",
3834                                 nsid);
3835                         goto out_unlock;
3836                 }
3837                 head = nvme_alloc_ns_head(ctrl, nsid, ids);
3838                 if (IS_ERR(head)) {
3839                         ret = PTR_ERR(head);
3840                         goto out_unlock;
3841                 }
3842                 head->shared = is_shared;
3843         } else {
3844                 ret = -EINVAL;
3845                 if (!is_shared || !head->shared) {
3846                         dev_err(ctrl->device,
3847                                 "Duplicate unshared namespace %d\n", nsid);
3848                         goto out_put_ns_head;
3849                 }
3850                 if (!nvme_ns_ids_equal(&head->ids, ids)) {
3851                         dev_err(ctrl->device,
3852                                 "IDs don't match for shared namespace %d\n",
3853                                         nsid);
3854                         goto out_put_ns_head;
3855                 }
3856
3857                 if (!multipath && !list_empty(&head->list)) {
3858                         dev_warn(ctrl->device,
3859                                 "Found shared namespace %d, but multipathing not supported.\n",
3860                                 nsid);
3861                         dev_warn_once(ctrl->device,
3862                                 "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n");
3863                 }
3864         }
3865
3866         list_add_tail_rcu(&ns->siblings, &head->list);
3867         ns->head = head;
3868         mutex_unlock(&ctrl->subsys->lock);
3869         return 0;
3870
3871 out_put_ns_head:
3872         nvme_put_ns_head(head);
3873 out_unlock:
3874         mutex_unlock(&ctrl->subsys->lock);
3875         return ret;
3876 }
3877
3878 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3879 {
3880         struct nvme_ns *ns, *ret = NULL;
3881
3882         down_read(&ctrl->namespaces_rwsem);
3883         list_for_each_entry(ns, &ctrl->namespaces, list) {
3884                 if (ns->head->ns_id == nsid) {
3885                         if (!nvme_get_ns(ns))
3886                                 continue;
3887                         ret = ns;
3888                         break;
3889                 }
3890                 if (ns->head->ns_id > nsid)
3891                         break;
3892         }
3893         up_read(&ctrl->namespaces_rwsem);
3894         return ret;
3895 }
3896 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
3897
3898 /*
3899  * Add the namespace to the controller list while keeping the list ordered.
3900  */
3901 static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
3902 {
3903         struct nvme_ns *tmp;
3904
3905         list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
3906                 if (tmp->head->ns_id < ns->head->ns_id) {
3907                         list_add(&ns->list, &tmp->list);
3908                         return;
3909                 }
3910         }
3911         list_add(&ns->list, &ns->ctrl->namespaces);
3912 }
3913
3914 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
3915                 struct nvme_ns_ids *ids)
3916 {
3917         struct nvme_ns *ns;
3918         struct gendisk *disk;
3919         struct nvme_id_ns *id;
3920         int node = ctrl->numa_node;
3921
3922         if (nvme_identify_ns(ctrl, nsid, ids, &id))
3923                 return;
3924
3925         ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
3926         if (!ns)
3927                 goto out_free_id;
3928
3929         disk = blk_mq_alloc_disk(ctrl->tagset, ns);
3930         if (IS_ERR(disk))
3931                 goto out_free_ns;
3932         disk->fops = &nvme_bdev_ops;
3933         disk->private_data = ns;
3934
3935         ns->disk = disk;
3936         ns->queue = disk->queue;
3937
3938         if (ctrl->opts && ctrl->opts->data_digest)
3939                 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
3940
3941         blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
3942         if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
3943                 blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
3944
3945         ns->ctrl = ctrl;
3946         kref_init(&ns->kref);
3947
3948         if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED))
3949                 goto out_cleanup_disk;
3950
3951         /*
3952          * If multipathing is enabled, the device name for all disks and not
3953          * just those that represent shared namespaces needs to be based on the
3954          * subsystem instance.  Using the controller instance for private
3955          * namespaces could lead to naming collisions between shared and private
3956          * namespaces if they don't use a common numbering scheme.
3957          *
3958          * If multipathing is not enabled, disk names must use the controller
3959          * instance as shared namespaces will show up as multiple block
3960          * devices.
3961          */
3962         if (ns->head->disk) {
3963                 sprintf(disk->disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
3964                         ctrl->instance, ns->head->instance);
3965                 disk->flags |= GENHD_FL_HIDDEN;
3966         } else if (multipath) {
3967                 sprintf(disk->disk_name, "nvme%dn%d", ctrl->subsys->instance,
3968                         ns->head->instance);
3969         } else {
3970                 sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance,
3971                         ns->head->instance);
3972         }
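        /*
         * Illustrative result of the naming scheme above: with multipathing
         * enabled a shared namespace gets a hidden per-path node such as
         * "nvme0c1n1" while I/O is submitted via the ns_head disk "nvme0n1";
         * without multipathing the same namespace appears once per
         * controller, e.g. "nvme0n1" and "nvme1n1".
         */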
3973
3974         if (nvme_update_ns_info(ns, id))
3975                 goto out_unlink_ns;
3976
3977         down_write(&ctrl->namespaces_rwsem);
3978         nvme_ns_add_to_ctrl_list(ns);
3979         up_write(&ctrl->namespaces_rwsem);
3980         nvme_get_ctrl(ctrl);
3981
3982         if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups))
3983                 goto out_cleanup_ns_from_list;
3984
3985         if (!nvme_ns_head_multipath(ns->head))
3986                 nvme_add_ns_cdev(ns);
3987
3988         nvme_mpath_add_disk(ns, id);
3989         nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
3990         kfree(id);
3991
3992         return;
3993
3994  out_cleanup_ns_from_list:
3995         nvme_put_ctrl(ctrl);
3996         down_write(&ctrl->namespaces_rwsem);
3997         list_del_init(&ns->list);
3998         up_write(&ctrl->namespaces_rwsem);
3999  out_unlink_ns:
4000         mutex_lock(&ctrl->subsys->lock);
4001         list_del_rcu(&ns->siblings);
4002         if (list_empty(&ns->head->list))
4003                 list_del_init(&ns->head->entry);
4004         mutex_unlock(&ctrl->subsys->lock);
4005         nvme_put_ns_head(ns->head);
4006  out_cleanup_disk:
4007         blk_cleanup_disk(disk);
4008  out_free_ns:
4009         kfree(ns);
4010  out_free_id:
4011         kfree(id);
4012 }
4013
4014 static void nvme_ns_remove(struct nvme_ns *ns)
4015 {
4016         bool last_path = false;
4017
4018         if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
4019                 return;
4020
4021         clear_bit(NVME_NS_READY, &ns->flags);
4022         set_capacity(ns->disk, 0);
4023         nvme_fault_inject_fini(&ns->fault_inject);
4024
4025         mutex_lock(&ns->ctrl->subsys->lock);
4026         list_del_rcu(&ns->siblings);
4027         if (list_empty(&ns->head->list)) {
4028                 list_del_init(&ns->head->entry);
4029                 last_path = true;
4030         }
4031         mutex_unlock(&ns->ctrl->subsys->lock);
4032
4033         /* guarantee the namespace is no longer visible in head->list */
4034         synchronize_rcu();
4035
4036         /* wait for concurrent submissions */
4037         if (nvme_mpath_clear_current_path(ns))
4038                 synchronize_srcu(&ns->head->srcu);
4039
4040         if (!nvme_ns_head_multipath(ns->head))
4041                 nvme_cdev_del(&ns->cdev, &ns->cdev_device);
4042         del_gendisk(ns->disk);
4043         blk_cleanup_queue(ns->queue);
4044
4045         down_write(&ns->ctrl->namespaces_rwsem);
4046         list_del_init(&ns->list);
4047         up_write(&ns->ctrl->namespaces_rwsem);
4048
4049         if (last_path)
4050                 nvme_mpath_shutdown_disk(ns->head);
4051         nvme_put_ns(ns);
4052 }
4053
4054 static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
4055 {
4056         struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);
4057
4058         if (ns) {
4059                 nvme_ns_remove(ns);
4060                 nvme_put_ns(ns);
4061         }
4062 }
4063
4064 static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
4065 {
4066         struct nvme_id_ns *id;
4067         int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
4068
4069         if (test_bit(NVME_NS_DEAD, &ns->flags))
4070                 goto out;
4071
4072         ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id);
4073         if (ret)
4074                 goto out;
4075
4076         ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
4077         if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
4078                 dev_err(ns->ctrl->device,
4079                         "identifiers changed for nsid %d\n", ns->head->ns_id);
4080                 goto out_free_id;
4081         }
4082
4083         ret = nvme_update_ns_info(ns, id);
4084
4085 out_free_id:
4086         kfree(id);
4087 out:
4088         /*
4089          * Only remove the namespace if we got a fatal error back from the
4090          * device, otherwise ignore the error and just move on.
4091          *
4092          * TODO: we should probably schedule a delayed retry here.
4093          */
4094         if (ret > 0 && (ret & NVME_SC_DNR))
4095                 nvme_ns_remove(ns);
4096 }
4097
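/*
 * Handle a single NSID reported by the controller: fetch its namespace
 * identification descriptors, revalidate an existing namespace (removing
 * it if the identifiers changed), or allocate a new one for the NVM or,
 * when supported, the Zoned Namespace command set.
 */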
4098 static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
4099 {
4100         struct nvme_ns_ids ids = { };
4101         struct nvme_ns *ns;
4102
4103         if (nvme_identify_ns_descs(ctrl, nsid, &ids))
4104                 return;
4105
4106         ns = nvme_find_get_ns(ctrl, nsid);
4107         if (ns) {
4108                 nvme_validate_ns(ns, &ids);
4109                 nvme_put_ns(ns);
4110                 return;
4111         }
4112
4113         switch (ids.csi) {
4114         case NVME_CSI_NVM:
4115                 nvme_alloc_ns(ctrl, nsid, &ids);
4116                 break;
4117         case NVME_CSI_ZNS:
4118                 if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
4119                         dev_warn(ctrl->device,
4120                                 "nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
4121                                 nsid);
4122                         break;
4123                 }
4124                 if (!nvme_multi_css(ctrl)) {
4125                         dev_warn(ctrl->device,
4126                                 "command set not reported for nsid: %d\n",
4127                                 nsid);
4128                         break;
4129                 }
4130                 nvme_alloc_ns(ctrl, nsid, &ids);
4131                 break;
4132         default:
4133                 dev_warn(ctrl->device, "unknown csi %u for nsid %u\n",
4134                         ids.csi, nsid);
4135                 break;
4136         }
4137 }
4138
4139 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
4140                                         unsigned nsid)
4141 {
4142         struct nvme_ns *ns, *next;
4143         LIST_HEAD(rm_list);
4144
4145         down_write(&ctrl->namespaces_rwsem);
4146         list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
4147                 if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags))
4148                         list_move_tail(&ns->list, &rm_list);
4149         }
4150         up_write(&ctrl->namespaces_rwsem);
4151
4152         list_for_each_entry_safe(ns, next, &rm_list, list)
4153                 nvme_ns_remove(ns);
4154
4155 }
4156
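/*
 * Scan namespaces via the Identify Active Namespace ID list (CNS 02h).
 * Each iteration returns up to 1024 NSIDs greater than "prev"; a zero
 * entry terminates the list, and NSIDs missing from the returned ranges
 * are treated as removed namespaces.
 */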
4157 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
4158 {
4159         const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
4160         __le32 *ns_list;
4161         u32 prev = 0;
4162         int ret = 0, i;
4163
4164         if (nvme_ctrl_limited_cns(ctrl))
4165                 return -EOPNOTSUPP;
4166
4167         ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
4168         if (!ns_list)
4169                 return -ENOMEM;
4170
4171         for (;;) {
4172                 struct nvme_command cmd = {
4173                         .identify.opcode        = nvme_admin_identify,
4174                         .identify.cns           = NVME_ID_CNS_NS_ACTIVE_LIST,
4175                         .identify.nsid          = cpu_to_le32(prev),
4176                 };
4177
4178                 ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
4179                                             NVME_IDENTIFY_DATA_SIZE);
4180                 if (ret) {
4181                         dev_warn(ctrl->device,
4182                                 "Identify NS List failed (status=0x%x)\n", ret);
4183                         goto free;
4184                 }
4185
4186                 for (i = 0; i < nr_entries; i++) {
4187                         u32 nsid = le32_to_cpu(ns_list[i]);
4188
4189                         if (!nsid)      /* end of the list? */
4190                                 goto out;
4191                         nvme_validate_or_alloc_ns(ctrl, nsid);
4192                         while (++prev < nsid)
4193                                 nvme_ns_remove_by_nsid(ctrl, prev);
4194                 }
4195         }
4196  out:
4197         nvme_remove_invalid_namespaces(ctrl, prev);
4198  free:
4199         kfree(ns_list);
4200         return ret;
4201 }
4202
4203 static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
4204 {
4205         struct nvme_id_ctrl *id;
4206         u32 nn, i;
4207
4208         if (nvme_identify_ctrl(ctrl, &id))
4209                 return;
4210         nn = le32_to_cpu(id->nn);
4211         kfree(id);
4212
4213         for (i = 1; i <= nn; i++)
4214                 nvme_validate_or_alloc_ns(ctrl, i);
4215
4216         nvme_remove_invalid_namespaces(ctrl, nn);
4217 }
4218
4219 static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
4220 {
4221         size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
4222         __le32 *log;
4223         int error;
4224
4225         log = kzalloc(log_size, GFP_KERNEL);
4226         if (!log)
4227                 return;
4228
4229         /*
4230          * We need to read the log to clear the AEN, but we don't want to rely
4231          * on it for the changed namespace information as userspace could have
4232          * raced with us in reading the log page, which could cause us to miss
4233          * updates.
4234          */
4235         error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
4236                         NVME_CSI_NVM, log, log_size, 0);
4237         if (error)
4238                 dev_warn(ctrl->device,
4239                         "reading changed ns log failed: %d\n", error);
4240
4241         kfree(log);
4242 }
4243
4244 static void nvme_scan_work(struct work_struct *work)
4245 {
4246         struct nvme_ctrl *ctrl =
4247                 container_of(work, struct nvme_ctrl, scan_work);
4248
4249         /* No tagset on a live ctrl means IO queues could not be created */
4250         if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
4251                 return;
4252
4253         if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
4254                 dev_info(ctrl->device, "rescanning namespaces.\n");
4255                 nvme_clear_changed_ns_log(ctrl);
4256         }
4257
4258         mutex_lock(&ctrl->scan_lock);
4259         if (nvme_scan_ns_list(ctrl) != 0)
4260                 nvme_scan_ns_sequential(ctrl);
4261         mutex_unlock(&ctrl->scan_lock);
4262 }
4263
4264 /*
4265  * This function iterates the namespace list unlocked to allow recovery from
4266  * controller failure. It is up to the caller to ensure the namespace list is
4267  * not modified by scan work while this function is executing.
4268  */
4269 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
4270 {
4271         struct nvme_ns *ns, *next;
4272         LIST_HEAD(ns_list);
4273
4274         /*
4275          * Make sure to requeue I/O to all namespaces, as these requests
4276          * might result from the scan itself and must complete for the
4277          * scan_work to make progress.
4278          */
4279         nvme_mpath_clear_ctrl_paths(ctrl);
4280
4281         /* prevent racing with ns scanning */
4282         flush_work(&ctrl->scan_work);
4283
4284         /*
4285          * The dead state indicates the controller was not gracefully
4286          * disconnected. In that case, we won't be able to flush any data while
4287          * removing the namespaces' disks; fail all the queues now to avoid
4288          * potentially having to clean up the failed sync later.
4289          */
4290         if (ctrl->state == NVME_CTRL_DEAD)
4291                 nvme_kill_queues(ctrl);
4292
4293         /* this is a no-op when called from the controller reset handler */
4294         nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
4295
4296         down_write(&ctrl->namespaces_rwsem);
4297         list_splice_init(&ctrl->namespaces, &ns_list);
4298         up_write(&ctrl->namespaces_rwsem);
4299
4300         list_for_each_entry_safe(ns, next, &ns_list, list)
4301                 nvme_ns_remove(ns);
4302 }
4303 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
4304
4305 static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env)
4306 {
4307         struct nvme_ctrl *ctrl =
4308                 container_of(dev, struct nvme_ctrl, ctrl_device);
4309         struct nvmf_ctrl_options *opts = ctrl->opts;
4310         int ret;
4311
4312         ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
4313         if (ret)
4314                 return ret;
4315
4316         if (opts) {
4317                 ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
4318                 if (ret)
4319                         return ret;
4320
4321                 ret = add_uevent_var(env, "NVME_TRSVCID=%s",
4322                                 opts->trsvcid ?: "none");
4323                 if (ret)
4324                         return ret;
4325
4326                 ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
4327                                 opts->host_traddr ?: "none");
4328                 if (ret)
4329                         return ret;
4330
4331                 ret = add_uevent_var(env, "NVME_HOST_IFACE=%s",
4332                                 opts->host_iface ?: "none");
4333         }
4334         return ret;
4335 }
4336
4337 static void nvme_change_uevent(struct nvme_ctrl *ctrl, char *envdata)
4338 {
4339         char *envp[2] = { envdata, NULL };
4340
4341         kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
4342 }
4343
4344 static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
4345 {
4346         char *envp[2] = { NULL, NULL };
4347         u32 aen_result = ctrl->aen_result;
4348
4349         ctrl->aen_result = 0;
4350         if (!aen_result)
4351                 return;
4352
4353         envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
4354         if (!envp[0])
4355                 return;
4356         kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
4357         kfree(envp[0]);
4358 }
4359
4360 static void nvme_async_event_work(struct work_struct *work)
4361 {
4362         struct nvme_ctrl *ctrl =
4363                 container_of(work, struct nvme_ctrl, async_event_work);
4364
4365         nvme_aen_uevent(ctrl);
4366
4367         /*
4368          * The transport drivers must guarantee AER submission here is safe by
4369          * flushing ctrl async_event_work after changing the controller state
4370          * from LIVE and before freeing the admin queue.
4371          */
4372         if (ctrl->state == NVME_CTRL_LIVE)
4373                 ctrl->ops->submit_async_event(ctrl);
4374 }
4375
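/*
 * Returns true while the controller is enabled but reports Processing
 * Paused (CSTS.PP), i.e. command processing is temporarily suspended
 * during a firmware activation.
 */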
4376 static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
4377 {
4378
4379         u32 csts;
4380
4381         if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
4382                 return false;
4383
4384         if (csts == ~0)
4385                 return false;
4386
4387         return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
4388 }
4389
4390 static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
4391 {
4392         struct nvme_fw_slot_info_log *log;
4393
4394         log = kmalloc(sizeof(*log), GFP_KERNEL);
4395         if (!log)
4396                 return;
4397
4398         if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
4399                         log, sizeof(*log), 0))
4400                 dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
4401         kfree(log);
4402 }
4403
4404 static void nvme_fw_act_work(struct work_struct *work)
4405 {
4406         struct nvme_ctrl *ctrl = container_of(work,
4407                                 struct nvme_ctrl, fw_act_work);
4408         unsigned long fw_act_timeout;
4409
4410         if (ctrl->mtfa)
4411                 fw_act_timeout = jiffies +
4412                                 msecs_to_jiffies(ctrl->mtfa * 100);
4413         else
4414                 fw_act_timeout = jiffies +
4415                                 msecs_to_jiffies(admin_timeout * 1000);
4416
4417         nvme_stop_queues(ctrl);
4418         while (nvme_ctrl_pp_status(ctrl)) {
4419                 if (time_after(jiffies, fw_act_timeout)) {
4420                         dev_warn(ctrl->device,
4421                                 "Fw activation timeout, reset controller\n");
4422                         nvme_try_sched_reset(ctrl);
4423                         return;
4424                 }
4425                 msleep(100);
4426         }
4427
4428         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
4429                 return;
4430
4431         nvme_start_queues(ctrl);
4432         /* read FW slot information to clear the AER */
4433         nvme_get_fw_slot_info(ctrl);
4434 }
4435
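/*
 * Handle an AEN of type "notice"; the notice type is carried in bits 15:8 of
 * the completion result.
 */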
4436 static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
4437 {
4438         u32 aer_notice_type = (result & 0xff00) >> 8;
4439
4440         trace_nvme_async_event(ctrl, aer_notice_type);
4441
4442         switch (aer_notice_type) {
4443         case NVME_AER_NOTICE_NS_CHANGED:
4444                 set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
4445                 nvme_queue_scan(ctrl);
4446                 break;
4447         case NVME_AER_NOTICE_FW_ACT_STARTING:
4448                 /*
4449                  * We are (ab)using the RESETTING state to prevent subsequent
4450                  * recovery actions from interfering with the controller's
4451                  * firmware activation.
4452                  */
4453                 if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
4454                         queue_work(nvme_wq, &ctrl->fw_act_work);
4455                 break;
4456 #ifdef CONFIG_NVME_MULTIPATH
4457         case NVME_AER_NOTICE_ANA:
4458                 if (!ctrl->ana_log_buf)
4459                         break;
4460                 queue_work(nvme_wq, &ctrl->ana_work);
4461                 break;
4462 #endif
4463         case NVME_AER_NOTICE_DISC_CHANGED:
4464                 ctrl->aen_result = result;
4465                 break;
4466         default:
4467                 dev_warn(ctrl->device, "async event result %08x\n", result);
4468         }
4469 }
4470
4471 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
4472                 volatile union nvme_result *res)
4473 {
4474         u32 result = le32_to_cpu(res->u32);
4475         u32 aer_type = result & 0x07;
4476
4477         if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
4478                 return;
4479
4480         switch (aer_type) {
4481         case NVME_AER_NOTICE:
4482                 nvme_handle_aen_notice(ctrl, result);
4483                 break;
4484         case NVME_AER_ERROR:
4485         case NVME_AER_SMART:
4486         case NVME_AER_CSS:
4487         case NVME_AER_VS:
4488                 trace_nvme_async_event(ctrl, aer_type);
4489                 ctrl->aen_result = result;
4490                 break;
4491         default:
4492                 break;
4493         }
4494         queue_work(nvme_wq, &ctrl->async_event_work);
4495 }
4496 EXPORT_SYMBOL_GPL(nvme_complete_async_event);
4497
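/*
 * Stop the controller's background work (multipath ANA updates, keep-alive,
 * failfast expiry, AER handling and firmware activation) ahead of a reset or
 * teardown.
 */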
4498 void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
4499 {
4500         nvme_mpath_stop(ctrl);
4501         nvme_stop_keep_alive(ctrl);
4502         nvme_stop_failfast_work(ctrl);
4503         flush_work(&ctrl->async_event_work);
4504         cancel_work_sync(&ctrl->fw_act_work);
4505 }
4506 EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
4507
4508 void nvme_start_ctrl(struct nvme_ctrl *ctrl)
4509 {
4510         nvme_start_keep_alive(ctrl);
4511
4512         nvme_enable_aen(ctrl);
4513
4514         if (ctrl->queue_count > 1) {
4515                 nvme_queue_scan(ctrl);
4516                 nvme_start_queues(ctrl);
4517         }
4518
4519         nvme_change_uevent(ctrl, "NVME_EVENT=connected");
4520 }
4521 EXPORT_SYMBOL_GPL(nvme_start_ctrl);
4522
4523 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
4524 {
4525         nvme_hwmon_exit(ctrl);
4526         nvme_fault_inject_fini(&ctrl->fault_inject);
4527         dev_pm_qos_hide_latency_tolerance(ctrl->device);
4528         cdev_device_del(&ctrl->cdev, ctrl->device);
4529         nvme_put_ctrl(ctrl);
4530 }
4531 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
4532
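/* Free the cached command effects logs and tear down the xarray holding them. */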
4533 static void nvme_free_cels(struct nvme_ctrl *ctrl)
4534 {
4535         struct nvme_effects_log *cel;
4536         unsigned long i;
4537
4538         xa_for_each(&ctrl->cels, i, cel) {
4539                 xa_erase(&ctrl->cels, i);
4540                 kfree(cel);
4541         }
4542
4543         xa_destroy(&ctrl->cels);
4544 }
4545
4546 static void nvme_free_ctrl(struct device *dev)
4547 {
4548         struct nvme_ctrl *ctrl =
4549                 container_of(dev, struct nvme_ctrl, ctrl_device);
4550         struct nvme_subsystem *subsys = ctrl->subsys;
4551
4552         if (!subsys || ctrl->instance != subsys->instance)
4553                 ida_free(&nvme_instance_ida, ctrl->instance);
4554
4555         nvme_free_cels(ctrl);
4556         nvme_mpath_uninit(ctrl);
4557         __free_page(ctrl->discard_page);
4558
4559         if (subsys) {
4560                 mutex_lock(&nvme_subsystems_lock);
4561                 list_del(&ctrl->subsys_entry);
4562                 sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
4563                 mutex_unlock(&nvme_subsystems_lock);
4564         }
4565
4566         ctrl->ops->free_ctrl(ctrl);
4567
4568         if (subsys)
4569                 nvme_put_subsystem(subsys);
4570 }
4571
4572 /*
4573  * Initialize an NVMe controller structure.  This needs to be called during
4574  * the earliest initialization so that we have the initialized structure around
4575  * during probing.
4576  */
4577 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
4578                 const struct nvme_ctrl_ops *ops, unsigned long quirks)
4579 {
4580         int ret;
4581
4582         ctrl->state = NVME_CTRL_NEW;
4583         clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
4584         spin_lock_init(&ctrl->lock);
4585         mutex_init(&ctrl->scan_lock);
4586         INIT_LIST_HEAD(&ctrl->namespaces);
4587         xa_init(&ctrl->cels);
4588         init_rwsem(&ctrl->namespaces_rwsem);
4589         ctrl->dev = dev;
4590         ctrl->ops = ops;
4591         ctrl->quirks = quirks;
4592         ctrl->numa_node = NUMA_NO_NODE;
4593         INIT_WORK(&ctrl->scan_work, nvme_scan_work);
4594         INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
4595         INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
4596         INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
4597         init_waitqueue_head(&ctrl->state_wq);
4598
4599         INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
4600         INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
4601         memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
4602         ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
4603
4604         BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
4605                         PAGE_SIZE);
4606         ctrl->discard_page = alloc_page(GFP_KERNEL);
4607         if (!ctrl->discard_page) {
4608                 ret = -ENOMEM;
4609                 goto out;
4610         }
4611
4612         ret = ida_alloc(&nvme_instance_ida, GFP_KERNEL);
4613         if (ret < 0)
4614                 goto out;
4615         ctrl->instance = ret;
4616
4617         device_initialize(&ctrl->ctrl_device);
4618         ctrl->device = &ctrl->ctrl_device;
4619         ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
4620                         ctrl->instance);
4621         ctrl->device->class = nvme_class;
4622         ctrl->device->parent = ctrl->dev;
4623         ctrl->device->groups = nvme_dev_attr_groups;
4624         ctrl->device->release = nvme_free_ctrl;
4625         dev_set_drvdata(ctrl->device, ctrl);
4626         ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
4627         if (ret)
4628                 goto out_release_instance;
4629
4630         nvme_get_ctrl(ctrl);
4631         cdev_init(&ctrl->cdev, &nvme_dev_fops);
4632         ctrl->cdev.owner = ops->module;
4633         ret = cdev_device_add(&ctrl->cdev, ctrl->device);
4634         if (ret)
4635                 goto out_free_name;
4636
4637         /*
4638          * Initialize latency tolerance controls.  The sysfs files won't
4639          * be visible to userspace unless the device actually supports APST.
4640          */
4641         ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
4642         dev_pm_qos_update_user_latency_tolerance(ctrl->device,
4643                 min(default_ps_max_latency_us, (unsigned long)S32_MAX));
4644
4645         nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
4646         nvme_mpath_init_ctrl(ctrl);
4647
4648         return 0;
4649 out_free_name:
4650         nvme_put_ctrl(ctrl);
4651         kfree_const(ctrl->device->kobj.name);
4652 out_release_instance:
4653         ida_free(&nvme_instance_ida, ctrl->instance);
4654 out:
4655         if (ctrl->discard_page)
4656                 __free_page(ctrl->discard_page);
4657         return ret;
4658 }
4659 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
4660
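/*
 * Unquiesce a namespace queue, but only if nvme_stop_ns_queue() quiesced it;
 * the NVME_NS_STOPPED bit keeps quiesce and unquiesce calls paired.
 */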
4661 static void nvme_start_ns_queue(struct nvme_ns *ns)
4662 {
4663         if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags))
4664                 blk_mq_unquiesce_queue(ns->queue);
4665 }
4666
4667 static void nvme_stop_ns_queue(struct nvme_ns *ns)
4668 {
4669         if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
4670                 blk_mq_quiesce_queue(ns->queue);
4671         else
4672                 blk_mq_wait_quiesce_done(ns->queue);
4673 }
4674
4675 /*
4676  * Prepare a queue for teardown.
4677  *
4678  * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
4679  * the capacity to 0 after that to avoid blocking dispatchers that may be
4680  * holding bd_mutex.  This will stop buffered writers from dirtying pages that
4681  * can't be synced.
4682  */
4683 static void nvme_set_queue_dying(struct nvme_ns *ns)
4684 {
4685         if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
4686                 return;
4687
4688         blk_mark_disk_dead(ns->disk);
4689         nvme_start_ns_queue(ns);
4690
4691         set_capacity_and_notify(ns->disk, 0);
4692 }
4693
4694 /**
4695  * nvme_kill_queues - End all namespace queues
4696  * @ctrl: the dead controller whose namespace queues need to be ended
4697  *
4698  * Call this function when the driver determines it is unable to get the
4699  * controller in a state capable of servicing IO.
4700  */
4701 void nvme_kill_queues(struct nvme_ctrl *ctrl)
4702 {
4703         struct nvme_ns *ns;
4704
4705         down_read(&ctrl->namespaces_rwsem);
4706
4707         /* Forcibly unquiesce queues to avoid blocking dispatch */
4708         if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
4709                 nvme_start_admin_queue(ctrl);
4710
4711         list_for_each_entry(ns, &ctrl->namespaces, list)
4712                 nvme_set_queue_dying(ns);
4713
4714         up_read(&ctrl->namespaces_rwsem);
4715 }
4716 EXPORT_SYMBOL_GPL(nvme_kill_queues);
4717
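/*
 * The freeze helpers below block new request allocation on the namespace
 * queues and wait for outstanding requests to drain; the stop/start helpers
 * further down only quiesce and unquiesce dispatch.
 */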
4718 void nvme_unfreeze(struct nvme_ctrl *ctrl)
4719 {
4720         struct nvme_ns *ns;
4721
4722         down_read(&ctrl->namespaces_rwsem);
4723         list_for_each_entry(ns, &ctrl->namespaces, list)
4724                 blk_mq_unfreeze_queue(ns->queue);
4725         up_read(&ctrl->namespaces_rwsem);
4726 }
4727 EXPORT_SYMBOL_GPL(nvme_unfreeze);
4728
4729 int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
4730 {
4731         struct nvme_ns *ns;
4732
4733         down_read(&ctrl->namespaces_rwsem);
4734         list_for_each_entry(ns, &ctrl->namespaces, list) {
4735                 timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
4736                 if (timeout <= 0)
4737                         break;
4738         }
4739         up_read(&ctrl->namespaces_rwsem);
4740         return timeout;
4741 }
4742 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
4743
4744 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
4745 {
4746         struct nvme_ns *ns;
4747
4748         down_read(&ctrl->namespaces_rwsem);
4749         list_for_each_entry(ns, &ctrl->namespaces, list)
4750                 blk_mq_freeze_queue_wait(ns->queue);
4751         up_read(&ctrl->namespaces_rwsem);
4752 }
4753 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
4754
4755 void nvme_start_freeze(struct nvme_ctrl *ctrl)
4756 {
4757         struct nvme_ns *ns;
4758
4759         down_read(&ctrl->namespaces_rwsem);
4760         list_for_each_entry(ns, &ctrl->namespaces, list)
4761                 blk_freeze_queue_start(ns->queue);
4762         up_read(&ctrl->namespaces_rwsem);
4763 }
4764 EXPORT_SYMBOL_GPL(nvme_start_freeze);
4765
4766 void nvme_stop_queues(struct nvme_ctrl *ctrl)
4767 {
4768         struct nvme_ns *ns;
4769
4770         down_read(&ctrl->namespaces_rwsem);
4771         list_for_each_entry(ns, &ctrl->namespaces, list)
4772                 nvme_stop_ns_queue(ns);
4773         up_read(&ctrl->namespaces_rwsem);
4774 }
4775 EXPORT_SYMBOL_GPL(nvme_stop_queues);
4776
4777 void nvme_start_queues(struct nvme_ctrl *ctrl)
4778 {
4779         struct nvme_ns *ns;
4780
4781         down_read(&ctrl->namespaces_rwsem);
4782         list_for_each_entry(ns, &ctrl->namespaces, list)
4783                 nvme_start_ns_queue(ns);
4784         up_read(&ctrl->namespaces_rwsem);
4785 }
4786 EXPORT_SYMBOL_GPL(nvme_start_queues);
4787
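/*
 * Quiesce the admin queue, mirroring nvme_stop_ns_queue(); the
 * NVME_CTRL_ADMIN_Q_STOPPED bit keeps quiesce and unquiesce calls paired.
 */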
4788 void nvme_stop_admin_queue(struct nvme_ctrl *ctrl)
4789 {
4790         if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
4791                 blk_mq_quiesce_queue(ctrl->admin_q);
4792         else
4793                 blk_mq_wait_quiesce_done(ctrl->admin_q);
4794 }
4795 EXPORT_SYMBOL_GPL(nvme_stop_admin_queue);
4796
4797 void nvme_start_admin_queue(struct nvme_ctrl *ctrl)
4798 {
4799         if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
4800                 blk_mq_unquiesce_queue(ctrl->admin_q);
4801 }
4802 EXPORT_SYMBOL_GPL(nvme_start_admin_queue);
4803
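/*
 * Make sure any pending timeout work on the namespace queues has finished
 * before the caller continues with error recovery or teardown.
 */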
4804 void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
4805 {
4806         struct nvme_ns *ns;
4807
4808         down_read(&ctrl->namespaces_rwsem);
4809         list_for_each_entry(ns, &ctrl->namespaces, list)
4810                 blk_sync_queue(ns->queue);
4811         up_read(&ctrl->namespaces_rwsem);
4812 }
4813 EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
4814
4815 void nvme_sync_queues(struct nvme_ctrl *ctrl)
4816 {
4817         nvme_sync_io_queues(ctrl);
4818         if (ctrl->admin_q)
4819                 blk_sync_queue(ctrl->admin_q);
4820 }
4821 EXPORT_SYMBOL_GPL(nvme_sync_queues);
4822
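/*
 * Resolve a struct nvme_ctrl from an open nvme character device file;
 * intended for the NVMe target passthru code, hence the export namespace.
 */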
4823 struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
4824 {
4825         if (file->f_op != &nvme_dev_fops)
4826                 return NULL;
4827         return file->private_data;
4828 }
4829 EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);
4830
4831 /*
4832  * Check we didn't inadvertently grow the command structure sizes:
4833  */
4834 static inline void _nvme_check_size(void)
4835 {
4836         BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
4837         BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
4838         BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
4839         BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
4840         BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
4841         BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
4842         BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
4843         BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
4844         BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
4845         BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
4846         BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
4847         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
4848         BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
4849         BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
4850         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
4851         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE);
4852         BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
4853         BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
4854         BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
4855         BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
4856 }
4857
4859 static int __init nvme_core_init(void)
4860 {
4861         int result = -ENOMEM;
4862
4863         _nvme_check_size();
4864
4865         nvme_wq = alloc_workqueue("nvme-wq",
4866                         WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
4867         if (!nvme_wq)
4868                 goto out;
4869
4870         nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
4871                         WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
4872         if (!nvme_reset_wq)
4873                 goto destroy_wq;
4874
4875         nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
4876                         WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
4877         if (!nvme_delete_wq)
4878                 goto destroy_reset_wq;
4879
4880         result = alloc_chrdev_region(&nvme_ctrl_base_chr_devt, 0,
4881                         NVME_MINORS, "nvme");
4882         if (result < 0)
4883                 goto destroy_delete_wq;
4884
4885         nvme_class = class_create(THIS_MODULE, "nvme");
4886         if (IS_ERR(nvme_class)) {
4887                 result = PTR_ERR(nvme_class);
4888                 goto unregister_chrdev;
4889         }
4890         nvme_class->dev_uevent = nvme_class_uevent;
4891
4892         nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
4893         if (IS_ERR(nvme_subsys_class)) {
4894                 result = PTR_ERR(nvme_subsys_class);
4895                 goto destroy_class;
4896         }
4897
4898         result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS,
4899                                      "nvme-generic");
4900         if (result < 0)
4901                 goto destroy_subsys_class;
4902
4903         nvme_ns_chr_class = class_create(THIS_MODULE, "nvme-generic");
4904         if (IS_ERR(nvme_ns_chr_class)) {
4905                 result = PTR_ERR(nvme_ns_chr_class);
4906                 goto unregister_generic_ns;
4907         }
4908
4909         return 0;
4910
4911 unregister_generic_ns:
4912         unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
4913 destroy_subsys_class:
4914         class_destroy(nvme_subsys_class);
4915 destroy_class:
4916         class_destroy(nvme_class);
4917 unregister_chrdev:
4918         unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
4919 destroy_delete_wq:
4920         destroy_workqueue(nvme_delete_wq);
4921 destroy_reset_wq:
4922         destroy_workqueue(nvme_reset_wq);
4923 destroy_wq:
4924         destroy_workqueue(nvme_wq);
4925 out:
4926         return result;
4927 }
4928
4929 static void __exit nvme_core_exit(void)
4930 {
4931         class_destroy(nvme_ns_chr_class);
4932         class_destroy(nvme_subsys_class);
4933         class_destroy(nvme_class);
4934         unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
4935         unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
4936         destroy_workqueue(nvme_delete_wq);
4937         destroy_workqueue(nvme_reset_wq);
4938         destroy_workqueue(nvme_wq);
4939         ida_destroy(&nvme_ns_chr_minor_ida);
4940         ida_destroy(&nvme_instance_ida);
4941 }
4942
4943 MODULE_LICENSE("GPL");
4944 MODULE_VERSION("1.0");
4945 module_init(nvme_core_init);
4946 module_exit(nvme_core_exit);