1 // SPDX-License-Identifier: GPL-2.0
3 * NVMe over Fabrics TCP host.
4 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 #include <linux/module.h>
8 #include <linux/init.h>
9 #include <linux/slab.h>
10 #include <linux/err.h>
11 #include <linux/key.h>
12 #include <linux/nvme-tcp.h>
13 #include <linux/nvme-keyring.h>
17 #include <net/tls_prot.h>
18 #include <net/handshake.h>
19 #include <linux/blk-mq.h>
20 #include <crypto/hash.h>
21 #include <net/busy_poll.h>
22 #include <trace/events/sock.h>
27 struct nvme_tcp_queue;
29 /* Define the socket priority to use for connections where it is desirable
30 * that the NIC consider performing optimized packet processing or filtering.
31 * A non-zero value being sufficient to indicate general consideration of any
32 * possible optimization. Making it a module param allows for alternative
33 * values that may be unique for some NIC implementations.
// Module parameters for the NVMe/TCP host. Each registered parameter uses
// mode 0644.
35 static int so_priority;
36 module_param(so_priority, int, 0644);
37 MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
40 * Use the unbound workqueue for nvme_tcp_wq, then we can set the cpu affinity
// Boolean switch; its description string documents a default of false.
43 static bool wq_unbound;
44 module_param(wq_unbound, bool, 0644);
45 MODULE_PARM_DESC(wq_unbound, "Use unbound workqueue for nvme-tcp IO context (default false)");
48 * TLS handshake timeout
// Initialised to 10 (seconds, per the description string). The parameter is
// only registered when CONFIG_NVME_TCP_TLS is defined.
50 static int tls_handshake_timeout = 10;
51 #ifdef CONFIG_NVME_TCP_TLS
52 module_param(tls_handshake_timeout, int, 0644);
53 MODULE_PARM_DESC(tls_handshake_timeout,
54 "nvme TLS handshake timeout in seconds (default 10)");
57 #ifdef CONFIG_DEBUG_LOCK_ALLOC
58 /* lockdep can detect a circular dependency of the form
59 * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
60 * because dependencies are tracked for both nvme-tcp and user contexts. Using
61 * a separate class prevents lockdep from conflating nvme-tcp socket use with
62 * user-space socket API use.
// Lock class keys for nvme-tcp sockets. Below, nvme_tcp_slock_key[0] is paired
// with the AF_INET names and nvme_tcp_slock_key[1] with the AF_INET6 names.
64 static struct lock_class_key nvme_tcp_sk_key[2];
65 static struct lock_class_key nvme_tcp_slock_key[2];
// Re-initialise the lock classes of sock->sk with NVMe-specific names chosen
// by sk->sk_family.
67 static void nvme_tcp_reclassify_socket(struct socket *sock)
69 struct sock *sk = sock->sk;
// Warn (once) when the socket does not allow reclassification.
71 if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
74 switch (sk->sk_family) {
76 sock_lock_init_class_and_name(sk, "slock-AF_INET-NVME",
77 &nvme_tcp_slock_key[0],
78 "sk_lock-AF_INET-NVME",
82 sock_lock_init_class_and_name(sk, "slock-AF_INET6-NVME",
83 &nvme_tcp_slock_key[1],
84 "sk_lock-AF_INET6-NVME",
// Empty variant of the same function.
// NOTE(review): presumably the build without CONFIG_DEBUG_LOCK_ALLOC - confirm.
92 static void nvme_tcp_reclassify_socket(struct socket *sock) { }
// Send-side state of a request (stored in nvme_tcp_request.state and tested
// by nvme_tcp_try_send()). The command PDU state has value 0.
95 enum nvme_tcp_send_state {
96 NVME_TCP_SEND_CMD_PDU = 0,
97 NVME_TCP_SEND_H2C_PDU,
// Per-request transport context for NVMe/TCP.
102 struct nvme_tcp_request {
103 struct nvme_request req;
// Queue this request is sent on.
105 struct nvme_tcp_queue *queue;
// entry links the request on queue->send_list; lentry links it on the
// lockless queue->req_list.
113 struct list_head entry;
114 struct llist_node lentry;
// Bio currently being transferred and the iterator over its bvecs.
117 struct bio *curr_bio;
118 struct iov_iter iter;
// Current position in the send state machine.
123 enum nvme_tcp_send_state state;
// Bit numbers used with the bit operations on nvme_tcp_queue.flags
// (test_bit / test_and_clear_bit elsewhere in this file).
126 enum nvme_tcp_queue_flags {
127 NVME_TCP_Q_ALLOCATED = 0,
129 NVME_TCP_Q_POLLING = 2,
// Receive-side state, as computed by nvme_tcp_recv_state(); receiving a PDU
// header has value 0.
132 enum nvme_tcp_recv_state {
133 NVME_TCP_RECV_PDU = 0,
138 struct nvme_tcp_ctrl;
// Per-queue state of an NVMe/TCP connection.
139 struct nvme_tcp_queue {
// Work item that runs nvme_tcp_io_work() for this queue.
141 struct work_struct io_work;
144 struct mutex queue_lock;
// Taken (via trylock) around nvme_tcp_try_send() / nvme_tcp_send_all().
145 struct mutex send_mutex;
// req_list: lockless list new requests are added to;
// send_list: list requests are fetched from for sending.
146 struct llist_head req_list;
147 struct list_head send_list;
// Receive progress counters for the data and data-digest phases.
153 size_t data_remaining;
154 size_t ddgst_remaining;
// Request currently being sent, or NULL.
158 struct nvme_tcp_request *request;
161 size_t cmnd_capsule_len;
162 struct nvme_tcp_ctrl *ctrl;
// Hash requests used for receive-side and send-side digests.
168 struct ahash_request *rcv_hash;
169 struct ahash_request *snd_hash;
172 struct completion tls_complete;
// Page-fragment cache the request PDUs are allocated from.
174 struct page_frag_cache pf_cache;
// Socket callback pointers.
// NOTE(review): presumably the socket's original callbacks, saved for
// chaining/restoring - confirm.
176 void (*state_change)(struct sock *);
177 void (*data_ready)(struct sock *);
178 void (*write_space)(struct sock *);
// NVMe/TCP controller: wraps the generic nvme_ctrl together with its queues
// and tag sets.
181 struct nvme_tcp_ctrl {
182 /* read only in the hot path */
// Array of queues; index 0 is used for the admin hctx and async request.
183 struct nvme_tcp_queue *queues;
184 struct blk_mq_tag_set tag_set;
186 /* other member variables */
187 struct list_head list;
188 struct blk_mq_tag_set admin_tag_set;
189 struct sockaddr_storage addr;
190 struct sockaddr_storage src_addr;
// Embedded generic controller; to_tcp_ctrl() maps back from it.
191 struct nvme_ctrl ctrl;
// err_work is queued by nvme_tcp_error_recovery().
193 struct work_struct err_work;
194 struct delayed_work connect_work;
// Request object used for async events (see nvme_tcp_async_req()).
195 struct nvme_tcp_request async_req;
196 u32 io_queues[HCTX_MAX_TYPES];
// File-scope state: a controller list with its mutex, and the workqueue that
// queue->io_work items are queued on.
199 static LIST_HEAD(nvme_tcp_ctrl_list);
200 static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
201 static struct workqueue_struct *nvme_tcp_wq;
// Forward declarations for objects defined elsewhere in the file.
202 static const struct blk_mq_ops nvme_tcp_mq_ops;
203 static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
204 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
// Return the nvme_tcp_ctrl that embeds the given generic controller.
206 static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
208 return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
// Return the index of queue within its controller's queues array.
211 static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
213 return queue - queue->ctrl->queues;
// Report whether the controller options request TLS (ctrl->opts->tls).
// NOTE(review): presumably false when CONFIG_NVME_TCP_TLS is disabled - confirm.
216 static inline bool nvme_tcp_tls(struct nvme_ctrl *ctrl)
218 if (!IS_ENABLED(CONFIG_NVME_TCP_TLS))
221 return ctrl->opts->tls;
// Return the blk-mq tags that belong to this queue: an entry of the admin tag
// set indexed by the queue id, or an entry of the I/O tag set indexed by
// queue id - 1.
// NOTE(review): presumably the admin set is used only for queue 0 - confirm.
224 static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
226 u32 queue_idx = nvme_tcp_queue_id(queue);
229 return queue->ctrl->admin_tag_set.tags[queue_idx];
230 return queue->ctrl->tag_set.tags[queue_idx - 1];
// Header digest length for this queue: NVME_TCP_DIGEST_LENGTH when header
// digest is enabled, otherwise 0.
233 static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
235 return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
// Data digest length for this queue: NVME_TCP_DIGEST_LENGTH when data digest
// is enabled, otherwise 0.
238 static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
240 return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
// Accessor whose result callers in this file use as the request's
// struct nvme_tcp_cmd_pdu.
243 static inline void *nvme_tcp_req_cmd_pdu(struct nvme_tcp_request *req)
// Return the location of the data PDU inside the request's PDU buffer: it is
// placed so that it ends where a struct nvme_tcp_cmd_pdu would end.
248 static inline void *nvme_tcp_req_data_pdu(struct nvme_tcp_request *req)
250 /* use the pdu space in the back for the data pdu */
251 return req->pdu + sizeof(struct nvme_tcp_cmd_pdu) -
252 sizeof(struct nvme_tcp_data_pdu);
// Maximum inline data size for a request: NVME_TCP_ADMIN_CCSZ for fabrics
// commands, otherwise the queue's command capsule length minus the size of
// struct nvme_command.
255 static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req)
257 if (nvme_is_fabrics(req->req.cmd))
258 return NVME_TCP_ADMIN_CCSZ;
259 return req->queue->cmnd_capsule_len - sizeof(struct nvme_command);
// True when req is the controller's embedded async_req object.
262 static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
264 return req == &req->queue->ctrl->async_req;
// True when the request is a WRITE with a non-zero data length that does not
// exceed nvme_tcp_inline_data_size(); always false for the async request.
267 static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
271 if (unlikely(nvme_tcp_async_req(req)))
272 return false; /* async events don't have a request */
274 rq = blk_mq_rq_from_pdu(req);
276 return rq_data_dir(rq) == WRITE && req->data_len &&
277 req->data_len <= nvme_tcp_inline_data_size(req);
// Page of the bvec the request iterator currently points at.
280 static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
282 return req->iter.bvec->bv_page;
// Offset into the current page: the bvec offset plus the iterator's offset
// within that bvec.
285 static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
287 return req->iter.bvec->bv_offset + req->iter.iov_offset;
// Number of bytes that can be sent next: the smaller of the iterator's
// current single-segment count and what is left of the current PDU.
290 static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
292 return min_t(size_t, iov_iter_single_seg_count(&req->iter),
293 req->pdu_len - req->pdu_sent);
// Bytes of the current PDU still to be sent; 0 for requests that are not
// WRITEs.
296 static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
298 return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
299 req->pdu_len - req->pdu_sent : 0;
// Non-zero when sending len bytes would cover everything left in the current
// PDU (callers store the result in a bool).
302 static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
305 return nvme_tcp_pdu_data_left(req) <= len;
// (Re)initialise req->iter as a bvec iterator in direction dir, covering
// either the request's special payload or the request's current bio.
308 static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
311 struct request *rq = blk_mq_rq_from_pdu(req);
// Special-payload requests carry a single vector in rq->special_vec.
317 if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
318 vec = &rq->special_vec;
320 size = blk_rq_payload_bytes(rq);
323 struct bio *bio = req->curr_bio;
// Start at the bvec the bio iterator currently points to.
327 vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
329 bio_for_each_bvec(bv, bio, bi) {
// Remaining bytes in the bio and the amount already consumed in the
// current bvec.
332 size = bio->bi_iter.bi_size;
333 offset = bio->bi_iter.bi_bvec_done;
336 iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size);
337 req->iter.iov_offset = offset;
// Account len sent bytes against the request and the current PDU and advance
// the iterator. When the iterator is drained but request data remains, move
// to the next bio and re-initialise the iterator as a source.
340 static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
343 req->data_sent += len;
344 req->pdu_sent += len;
345 iov_iter_advance(&req->iter, len);
346 if (!iov_iter_count(&req->iter) &&
347 req->data_sent < req->data_len) {
348 req->curr_bio = req->curr_bio->bi_next;
349 nvme_tcp_init_iter(req, ITER_SOURCE);
// Drain the queue's pending sends by calling nvme_tcp_try_send().
// NOTE(review): the caller in this file holds queue->send_mutex - confirm
// this is required.
353 static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
357 /* drain the send queue as much as we can... */
359 ret = nvme_tcp_try_send(queue);
// True when either the send_list or the lockless req_list is non-empty.
363 static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
365 return !list_empty(&queue->send_list) ||
366 !llist_empty(&queue->req_list);
// Add req to its queue's lockless req_list, then either send directly from
// this context or schedule io_work.
//   sync: allow an immediate send attempt from the calling context.
//   last: when set and work remains queued, schedule io_work on io_cpu.
369 static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
370 bool sync, bool last)
372 struct nvme_tcp_queue *queue = req->queue;
// empty: req_list was empty before the add, send_list is empty and no
// request is currently in flight on the send side.
375 empty = llist_add(&req->lentry, &queue->req_list) &&
376 list_empty(&queue->send_list) && !queue->request;
379 * if we're the first on the send_list and we can try to send
380 * directly, otherwise queue io_work. Also, only do that if we
381 * are on the same cpu, so we don't introduce contention.
383 if (queue->io_cpu == raw_smp_processor_id() &&
384 sync && empty && mutex_trylock(&queue->send_mutex)) {
385 nvme_tcp_send_all(queue);
386 mutex_unlock(&queue->send_mutex);
389 if (last && nvme_tcp_queue_more(queue))
390 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
// Detach everything from the lockless req_list and insert each request at the
// head of send_list.
393 static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
395 struct nvme_tcp_request *req;
396 struct llist_node *node;
398 for (node = llist_del_all(&queue->req_list); node; node = node->next) {
399 req = llist_entry(node, struct nvme_tcp_request, lentry);
400 list_add(&req->entry, &queue->send_list);
// Take the first request off send_list, pulling in req_list via
// nvme_tcp_process_req_list() and looking again.
// NOTE(review): the refill presumably happens only when send_list was
// empty - confirm.
404 static inline struct nvme_tcp_request *
405 nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
407 struct nvme_tcp_request *req;
409 req = list_first_entry_or_null(&queue->send_list,
410 struct nvme_tcp_request, entry);
412 nvme_tcp_process_req_list(queue);
413 req = list_first_entry_or_null(&queue->send_list,
414 struct nvme_tcp_request, entry);
// Unlink the chosen request from send_list.
419 list_del(&req->entry);
// Finalise the running hash and store the result at dgst.
423 static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
// No source data: only the result buffer is set before finalising.
426 ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
427 crypto_ahash_final(hash);
// Feed len bytes at offset off of page into the running hash, using a
// one-entry scatterlist.
430 static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
431 struct page *page, off_t off, size_t len)
433 struct scatterlist sg;
435 sg_init_table(&sg, 1);
436 sg_set_page(&sg, page, len, off);
437 ahash_request_set_crypt(hash, &sg, NULL, len);
438 crypto_ahash_update(hash);
// Compute the digest over the first len bytes of pdu and write the result
// directly after them, at pdu + len.
441 static inline void nvme_tcp_hdgst(struct ahash_request *hash,
442 void *pdu, size_t len)
444 struct scatterlist sg;
446 sg_init_one(&sg, pdu, len);
447 ahash_request_set_crypt(hash, &sg, pdu + len, len);
448 crypto_ahash_digest(hash);
// Verify the header digest of a received PDU. Logs an error when the PDU does
// not carry the HDGST flag or when the received digest differs from the one
// computed locally.
451 static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
452 void *pdu, size_t pdu_len)
454 struct nvme_tcp_hdr *hdr = pdu;
458 if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
459 dev_err(queue->ctrl->ctrl.device,
460 "queue %d: header digest flag is cleared\n",
461 nvme_tcp_queue_id(queue));
// Save the received digest first: nvme_tcp_hdgst() writes its result at
// pdu + pdu_len, and the expected value is then read back from
// pdu + hdr->hlen (the caller in this file passes hdr->hlen as pdu_len).
465 recv_digest = *(__le32 *)(pdu + hdr->hlen);
466 nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
467 exp_digest = *(__le32 *)(pdu + hdr->hlen);
468 if (recv_digest != exp_digest) {
469 dev_err(queue->ctrl->ctrl.device,
470 "header digest error: recv %#x expected %#x\n",
471 le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
// Check that a received PDU carrying data has the DDGST flag set, and
// initialise the receive hash.
478 static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
480 struct nvme_tcp_hdr *hdr = pdu;
481 u8 digest_len = nvme_tcp_hdgst_len(queue);
// len: PDU length minus the header and, if flagged, the header digest.
484 len = le32_to_cpu(hdr->plen) - hdr->hlen -
485 ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
487 if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
488 dev_err(queue->ctrl->ctrl.device,
489 "queue %d: data digest flag is cleared\n",
490 nvme_tcp_queue_id(queue));
493 crypto_ahash_init(queue->rcv_hash);
// Request teardown hook: release the PDU buffer of the request.
498 static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
499 struct request *rq, unsigned int hctx_idx)
501 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
503 page_frag_free(req->pdu);
// Request setup hook: allocate a zeroed PDU buffer (command PDU plus optional
// header digest) from the owning queue's page-frag cache and wire the generic
// nvme request to the controller and to the command inside the PDU.
506 static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
507 struct request *rq, unsigned int hctx_idx,
508 unsigned int numa_node)
510 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);
511 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
512 struct nvme_tcp_cmd_pdu *pdu;
// I/O tag set: hctx N maps to queue N + 1; any other set maps to queue 0.
513 int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
514 struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
515 u8 hdgst = nvme_tcp_hdgst_len(queue);
517 req->pdu = page_frag_alloc(&queue->pf_cache,
518 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
519 GFP_KERNEL | __GFP_ZERO);
525 nvme_req(rq)->ctrl = &ctrl->ctrl;
526 nvme_req(rq)->cmd = &pdu->cmd;
// Bind an I/O hardware context to its queue: hctx index N uses queue N + 1.
531 static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
532 unsigned int hctx_idx)
534 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data);
535 struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
537 hctx->driver_data = queue;
// Bind the admin hardware context to queue 0.
541 static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
542 unsigned int hctx_idx)
544 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data);
545 struct nvme_tcp_queue *queue = &ctrl->queues[0];
547 hctx->driver_data = queue;
// Derive the receive state from the remaining counters: a pending PDU header
// takes precedence, then a pending data digest.
551 static enum nvme_tcp_recv_state
552 nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
554 return (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
555 (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
// Reset the receive context so that the next thing expected is a response
// PDU header (plus header digest when enabled).
559 static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
561 queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
562 nvme_tcp_hdgst_len(queue);
563 queue->pdu_offset = 0;
// -1 is stored into a size_t field here.
564 queue->data_remaining = -1;
565 queue->ddgst_remaining = 0;
// Try to move the controller to NVME_CTRL_RESETTING, then log a warning and
// queue the controller's err_work on nvme_reset_wq.
// NOTE(review): presumably returns early when the state change fails - confirm.
568 static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
570 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
573 dev_warn(ctrl->device, "starting error recovery\n");
574 queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
// Complete the request named by cqe->command_id. An unknown command id is
// logged and triggers error recovery.
577 static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
578 struct nvme_completion *cqe)
580 struct nvme_tcp_request *req;
583 rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id);
585 dev_err(queue->ctrl->ctrl.device,
586 "got bad cqe.command_id %#x on queue %d\n",
587 cqe->command_id, nvme_tcp_queue_id(queue));
588 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
592 req = blk_mq_rq_to_pdu(rq);
// Only take the status from the CQE when no error status has already been
// recorded on the request.
593 if (req->status == cpu_to_le16(NVME_SC_SUCCESS))
594 req->status = cqe->status;
596 if (!nvme_try_complete_req(rq, req->status, cqe->result))
597 nvme_complete_rq(rq);
// Validate a C2HData PDU header and record how much data is about to follow.
// Logs an error for an unknown command id, for a request without payload, or
// when SUCCESS is flagged on a PDU that is not flagged as last.
603 static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
604 struct nvme_tcp_data_pdu *pdu)
608 rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
610 dev_err(queue->ctrl->ctrl.device,
611 "got bad c2hdata.command_id %#x on queue %d\n",
612 pdu->command_id, nvme_tcp_queue_id(queue));
616 if (!blk_rq_payload_bytes(rq)) {
617 dev_err(queue->ctrl->ctrl.device,
618 "queue %d tag %#x unexpected data\n",
619 nvme_tcp_queue_id(queue), rq->tag);
623 queue->data_remaining = le32_to_cpu(pdu->data_length);
625 if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
626 unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
627 dev_err(queue->ctrl->ctrl.device,
628 "queue %d tag %#x SUCCESS set but not last PDU\n",
629 nvme_tcp_queue_id(queue), rq->tag);
630 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
// Handle a response PDU: async-event completions go to
// nvme_complete_async_event(), others to nvme_tcp_process_nvme_cqe().
637 static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
638 struct nvme_tcp_rsp_pdu *pdu)
640 struct nvme_completion *cqe = &pdu->cqe;
644 * AEN requests are special as they don't time out and can
645 * survive any kind of queue freeze and often don't respond to
646 * aborts. We don't even bother to allocate a struct request
647 * for them but rather special case them here.
649 if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
651 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
654 ret = nvme_tcp_process_nvme_cqe(queue, cqe);
// Build the next H2CData PDU header for req and switch the request to the
// NVME_TCP_SEND_H2C_PDU state. The PDU carries at most queue->maxh2cdata
// bytes of what is left of the current transfer.
659 static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req)
661 struct nvme_tcp_data_pdu *data = nvme_tcp_req_data_pdu(req);
662 struct nvme_tcp_queue *queue = req->queue;
663 struct request *rq = blk_mq_rq_from_pdu(req);
// Length of the previous PDU, captured before pdu_len is overwritten; it
// advances the data offset below.
664 u32 h2cdata_sent = req->pdu_len;
665 u8 hdgst = nvme_tcp_hdgst_len(queue);
666 u8 ddgst = nvme_tcp_ddgst_len(queue);
668 req->state = NVME_TCP_SEND_H2C_PDU;
670 req->pdu_len = min(req->h2cdata_left, queue->maxh2cdata);
672 req->h2cdata_left -= req->pdu_len;
673 req->h2cdata_offset += h2cdata_sent;
675 memset(data, 0, sizeof(*data));
676 data->hdr.type = nvme_tcp_h2c_data;
// Flag the PDU as last when nothing remains after it.
677 if (!req->h2cdata_left)
678 data->hdr.flags = NVME_TCP_F_DATA_LAST;
679 if (queue->hdr_digest)
680 data->hdr.flags |= NVME_TCP_F_HDGST;
681 if (queue->data_digest)
682 data->hdr.flags |= NVME_TCP_F_DDGST;
683 data->hdr.hlen = sizeof(*data);
// Data starts right after the header and the optional header digest.
684 data->hdr.pdo = data->hdr.hlen + hdgst;
686 cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
687 data->ttag = req->ttag;
688 data->command_id = nvme_cid(rq);
689 data->data_offset = cpu_to_le32(req->h2cdata_offset);
690 data->data_length = cpu_to_le32(req->pdu_len);
// Handle an R2T PDU: validate it against the request's progress, record the
// transfer window, build the first H2CData PDU and queue the request.
// Logged error cases: unknown command id, zero length, length exceeding the
// remaining request data, and an offset below what was already sent.
693 static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
694 struct nvme_tcp_r2t_pdu *pdu)
696 struct nvme_tcp_request *req;
698 u32 r2t_length = le32_to_cpu(pdu->r2t_length);
699 u32 r2t_offset = le32_to_cpu(pdu->r2t_offset);
701 rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
703 dev_err(queue->ctrl->ctrl.device,
704 "got bad r2t.command_id %#x on queue %d\n",
705 pdu->command_id, nvme_tcp_queue_id(queue));
708 req = blk_mq_rq_to_pdu(rq);
710 if (unlikely(!r2t_length)) {
711 dev_err(queue->ctrl->ctrl.device,
712 "req %d r2t len is %u, probably a bug...\n",
713 rq->tag, r2t_length);
717 if (unlikely(req->data_sent + r2t_length > req->data_len)) {
718 dev_err(queue->ctrl->ctrl.device,
719 "req %d r2t len %u exceeded data len %u (%zu sent)\n",
720 rq->tag, r2t_length, req->data_len, req->data_sent);
724 if (unlikely(r2t_offset < req->data_sent)) {
725 dev_err(queue->ctrl->ctrl.device,
726 "req %d unexpected r2t offset %u (expected %zu)\n",
727 rq->tag, r2t_offset, req->data_sent);
// Record the window granted by the R2T and the transfer tag to echo back.
732 req->h2cdata_left = r2t_length;
733 req->h2cdata_offset = r2t_offset;
734 req->ttag = pdu->ttag;
736 nvme_tcp_setup_h2c_data_pdu(req);
// Queued with sync = false, last = true: no direct send from this context.
737 nvme_tcp_queue_request(req, false, true);
// Receive (part of) a PDU header from skb into queue->pdu and, once it is
// complete, verify digests as configured and dispatch on the PDU type.
742 static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
743 unsigned int *offset, size_t *len)
745 struct nvme_tcp_hdr *hdr;
746 char *pdu = queue->pdu;
// Copy no more than what is still missing from the header.
747 size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
750 ret = skb_copy_bits(skb, *offset,
751 &pdu[queue->pdu_offset], rcv_len);
755 queue->pdu_remaining -= rcv_len;
756 queue->pdu_offset += rcv_len;
// Header still incomplete.
759 if (queue->pdu_remaining)
763 if (queue->hdr_digest) {
764 ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
770 if (queue->data_digest) {
771 ret = nvme_tcp_check_ddgst(queue, queue->pdu);
777 case nvme_tcp_c2h_data:
778 return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
// For the next two handlers the receive context is reset before the handler
// runs.
780 nvme_tcp_init_recv_ctx(queue);
781 return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
783 nvme_tcp_init_recv_ctx(queue);
784 return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
786 dev_err(queue->ctrl->ctrl.device,
787 "unsupported pdu type (%d)\n", hdr->type);
// Complete rq with the given status and an empty result. The status is
// shifted left by one bit and converted to little endian before being passed
// to nvme_try_complete_req().
792 static inline void nvme_tcp_end_request(struct request *rq, u16 status)
794 union nvme_result res = {};
796 if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
797 nvme_complete_rq(rq);
// Copy C2H data from skb into the request's bios, hashing it on the way when
// data digest is enabled. When the PDU's data is complete, either arm the
// digest receive phase or finish the PDU.
800 static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
801 unsigned int *offset, size_t *len)
803 struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
805 nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
806 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
811 recv_len = min_t(size_t, *len, queue->data_remaining);
// Current bio is full: move on to the next one.
815 if (!iov_iter_count(&req->iter)) {
816 req->curr_bio = req->curr_bio->bi_next;
819 * If we don`t have any bios it means that controller
820 * sent more data than we requested, hence error
822 if (!req->curr_bio) {
823 dev_err(queue->ctrl->ctrl.device,
824 "queue %d no space in request %#x",
825 nvme_tcp_queue_id(queue), rq->tag);
826 nvme_tcp_init_recv_ctx(queue);
829 nvme_tcp_init_iter(req, ITER_DEST);
832 /* we can read only from what is left in this bio */
833 recv_len = min_t(size_t, recv_len,
834 iov_iter_count(&req->iter));
836 if (queue->data_digest)
837 ret = skb_copy_and_hash_datagram_iter(skb, *offset,
838 &req->iter, recv_len, queue->rcv_hash);
840 ret = skb_copy_datagram_iter(skb, *offset,
841 &req->iter, recv_len);
843 dev_err(queue->ctrl->ctrl.device,
844 "queue %d failed to copy request %#x data",
845 nvme_tcp_queue_id(queue), rq->tag);
851 queue->data_remaining -= recv_len;
// All data of this PDU has arrived.
854 if (!queue->data_remaining) {
855 if (queue->data_digest) {
// Finalise the expected digest and wait for the received one.
856 nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
857 queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
// SUCCESS flag set: the request is completed here with its recorded status.
859 if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
860 nvme_tcp_end_request(rq,
861 le16_to_cpu(req->status));
864 nvme_tcp_init_recv_ctx(queue);
// Receive the data digest that follows C2H data, compare it with the expected
// digest and, on mismatch, mark the request with a data transfer error.
871 static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
872 struct sk_buff *skb, unsigned int *offset, size_t *len)
874 struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
875 char *ddgst = (char *)&queue->recv_ddgst;
876 size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
// Position inside the digest buffer where this chunk belongs.
877 off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
880 ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
884 queue->ddgst_remaining -= recv_len;
// Digest still incomplete.
887 if (queue->ddgst_remaining)
890 if (queue->recv_ddgst != queue->exp_ddgst) {
891 struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
893 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
895 req->status = cpu_to_le16(NVME_SC_DATA_XFER_ERROR);
897 dev_err(queue->ctrl->ctrl.device,
898 "data digest error: recv %#x expected %#x\n",
899 le32_to_cpu(queue->recv_ddgst),
900 le32_to_cpu(queue->exp_ddgst));
// SUCCESS flag set: complete the request with whatever status it now holds.
903 if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
904 struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
906 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
908 nvme_tcp_end_request(rq, le16_to_cpu(req->status));
912 nvme_tcp_init_recv_ctx(queue);
// read_sock callback: dispatch the received bytes to the handler that matches
// the current receive state. On a failure the queue's receive side is
// disabled and error recovery is started.
916 static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
917 unsigned int offset, size_t len)
919 struct nvme_tcp_queue *queue = desc->arg.data;
920 size_t consumed = len;
923 if (unlikely(!queue->rd_enabled))
927 switch (nvme_tcp_recv_state(queue)) {
928 case NVME_TCP_RECV_PDU:
929 result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
931 case NVME_TCP_RECV_DATA:
932 result = nvme_tcp_recv_data(queue, skb, &offset, &len);
934 case NVME_TCP_RECV_DDGST:
935 result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
941 dev_err(queue->ctrl->ctrl.device,
942 "receive failed: %d\n", result);
943 queue->rd_enabled = false;
944 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
// Socket data-ready callback: schedule io_work for the queue attached to the
// socket, unless receive is disabled or the queue is flagged as polling.
// sk_user_data is read under sk_callback_lock.
952 static void nvme_tcp_data_ready(struct sock *sk)
954 struct nvme_tcp_queue *queue;
956 trace_sk_data_ready(sk);
958 read_lock_bh(&sk->sk_callback_lock);
959 queue = sk->sk_user_data;
960 if (likely(queue && queue->rd_enabled) &&
961 !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
962 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
963 read_unlock_bh(&sk->sk_callback_lock);
// Socket write-space callback: once the stream is writeable again, clear
// SOCK_NOSPACE and schedule io_work. sk_user_data is read under
// sk_callback_lock.
966 static void nvme_tcp_write_space(struct sock *sk)
968 struct nvme_tcp_queue *queue;
970 read_lock_bh(&sk->sk_callback_lock);
971 queue = sk->sk_user_data;
972 if (likely(queue && sk_stream_is_writeable(sk))) {
973 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
974 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
976 read_unlock_bh(&sk->sk_callback_lock);
// Socket state-change callback. Depending on sk->sk_state it starts error
// recovery or logs the state, then calls the callback stored in
// queue->state_change. sk_user_data is read under sk_callback_lock.
// NOTE(review): which states select which branch is not visible here - confirm.
979 static void nvme_tcp_state_change(struct sock *sk)
981 struct nvme_tcp_queue *queue;
983 read_lock_bh(&sk->sk_callback_lock);
984 queue = sk->sk_user_data;
988 switch (sk->sk_state) {
994 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
997 dev_info(queue->ctrl->ctrl.device,
998 "queue %d socket state %d\n",
999 nvme_tcp_queue_id(queue), sk->sk_state);
1002 queue->state_change(sk);
1004 read_unlock_bh(&sk->sk_callback_lock);
// Mark the send side idle by clearing the queue's current request, so the
// next nvme_tcp_try_send() fetches a new one.
1007 static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
1009 queue->request = NULL;
// Fail a request with NVME_SC_HOST_PATH_ERROR: the async request is completed
// as an async event, other requests through nvme_tcp_end_request().
1012 static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
1014 if (nvme_tcp_async_req(req)) {
1015 union nvme_result res = {};
1017 nvme_complete_async_event(&req->queue->ctrl->ctrl,
1018 cpu_to_le16(NVME_SC_HOST_PATH_ERROR), &res);
1020 nvme_tcp_end_request(blk_mq_rq_from_pdu(req),
1021 NVME_SC_HOST_PATH_ERROR);
// Send the payload of the current PDU, one page segment per sendmsg call,
// updating the send-side data digest when enabled. After the last segment of
// a PDU it moves on to the digest state, sets up the next H2CData PDU, or
// marks the request as done sending.
1025 static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
1027 struct nvme_tcp_queue *queue = req->queue;
// Snapshots of request fields taken up front.
1028 int req_data_len = req->data_len;
1029 u32 h2cdata_left = req->h2cdata_left;
1032 struct bio_vec bvec;
1033 struct msghdr msg = {
1034 .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
1036 struct page *page = nvme_tcp_req_cur_page(req);
1037 size_t offset = nvme_tcp_req_cur_offset(req);
1038 size_t len = nvme_tcp_req_cur_length(req);
1039 bool last = nvme_tcp_pdu_last_send(req, len);
1040 int req_data_sent = req->data_sent;
// End-of-record only when this is the last segment, no digest follows and
// nothing else is queued; otherwise hint that more data is coming.
1043 if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
1044 msg.msg_flags |= MSG_EOR;
1046 msg.msg_flags |= MSG_MORE;
// Drop MSG_SPLICE_PAGES when sendpage_ok() rejects the page.
1048 if (!sendpage_ok(page))
1049 msg.msg_flags &= ~MSG_SPLICE_PAGES;
1051 bvec_set_page(&bvec, page, len, offset);
1052 iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
1053 ret = sock_sendmsg(queue->sock, &msg);
1057 if (queue->data_digest)
1058 nvme_tcp_ddgst_update(queue->snd_hash, page,
1062 * update the request iterator except for the last payload send
1063 * in the request where we don't want to modify it as we may
1064 * compete with the RX path completing the request.
1066 if (req_data_sent + ret < req_data_len)
1067 nvme_tcp_advance_req(req, ret);
1069 /* fully successful last send in current PDU */
1070 if (last && ret == len) {
1071 if (queue->data_digest) {
1072 nvme_tcp_ddgst_final(queue->snd_hash,
1074 req->state = NVME_TCP_SEND_DDGST;
1078 nvme_tcp_setup_h2c_data_pdu(req);
1080 nvme_tcp_done_send_req(queue);
// Send the command PDU (plus header digest when enabled), resuming from
// req->offset after a partial send. Afterwards the request either moves to
// the data state or is marked done sending.
1088 static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
1090 struct nvme_tcp_queue *queue = req->queue;
1091 struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
1092 struct bio_vec bvec;
1093 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
1094 bool inline_data = nvme_tcp_has_inline_data(req);
1095 u8 hdgst = nvme_tcp_hdgst_len(queue);
// Bytes of PDU + digest that are still unsent.
1096 int len = sizeof(*pdu) + hdgst - req->offset;
1099 if (inline_data || nvme_tcp_queue_more(queue))
1100 msg.msg_flags |= MSG_MORE;
1102 msg.msg_flags |= MSG_EOR;
// Compute the header digest only once, before the first byte goes out.
1104 if (queue->hdr_digest && !req->offset)
1105 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
1107 bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
1108 iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
1109 ret = sock_sendmsg(queue->sock, &msg);
1110 if (unlikely(ret <= 0))
1116 req->state = NVME_TCP_SEND_DATA;
// Start a fresh data digest for the payload that follows.
1117 if (queue->data_digest)
1118 crypto_ahash_init(queue->snd_hash);
1120 nvme_tcp_done_send_req(queue);
// Send the H2CData PDU header (plus header digest when enabled), resuming
// from req->offset after a partial send, then switch the request to the data
// state.
1129 static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
1131 struct nvme_tcp_queue *queue = req->queue;
1132 struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req);
1133 struct bio_vec bvec;
// Always MSG_MORE here: the msghdr is initialised with it unconditionally.
1134 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_MORE, };
1135 u8 hdgst = nvme_tcp_hdgst_len(queue);
1136 int len = sizeof(*pdu) - req->offset + hdgst;
1139 if (queue->hdr_digest && !req->offset)
1140 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
// MSG_SPLICE_PAGES is only added when no further H2C data is left after this
// PDU.
// NOTE(review): presumably because the header buffer is rewritten for a
// following PDU - confirm.
1142 if (!req->h2cdata_left)
1143 msg.msg_flags |= MSG_SPLICE_PAGES;
1145 bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
1146 iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
1147 ret = sock_sendmsg(queue->sock, &msg);
1148 if (unlikely(ret <= 0))
1153 req->state = NVME_TCP_SEND_DATA;
1154 if (queue->data_digest)
1155 crypto_ahash_init(queue->snd_hash);
// Send the data digest stored in req->ddgst, resuming from req->offset after
// a partial send. Once the whole digest is out, either set up the next
// H2CData PDU or mark the request done sending.
1163 static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
1165 struct nvme_tcp_queue *queue = req->queue;
1166 size_t offset = req->offset;
1167 u32 h2cdata_left = req->h2cdata_left;
1169 struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1171 .iov_base = (u8 *)&req->ddgst + req->offset,
1172 .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
1175 if (nvme_tcp_queue_more(queue))
1176 msg.msg_flags |= MSG_MORE;
1178 msg.msg_flags |= MSG_EOR;
1180 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1181 if (unlikely(ret <= 0))
// The digest has been sent in full.
1184 if (offset + ret == NVME_TCP_DIGEST_LENGTH) {
1186 nvme_tcp_setup_h2c_data_pdu(req);
1188 nvme_tcp_done_send_req(queue);
// Drive the send state machine for the queue's current request, fetching a
// new request first when none is in flight. A negative result other than
// -EAGAIN fails the request and clears it from the queue.
1196 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
1198 struct nvme_tcp_request *req;
1199 unsigned int noreclaim_flag;
1202 if (!queue->request) {
1203 queue->request = nvme_tcp_fetch_request(queue);
1204 if (!queue->request)
1207 req = queue->request;
// The send calls below run between memalloc_noreclaim_save() and
// memalloc_noreclaim_restore().
1209 noreclaim_flag = memalloc_noreclaim_save();
1210 if (req->state == NVME_TCP_SEND_CMD_PDU) {
1211 ret = nvme_tcp_try_send_cmd_pdu(req);
1214 if (!nvme_tcp_has_inline_data(req))
1218 if (req->state == NVME_TCP_SEND_H2C_PDU) {
1219 ret = nvme_tcp_try_send_data_pdu(req);
1224 if (req->state == NVME_TCP_SEND_DATA) {
1225 ret = nvme_tcp_try_send_data(req);
1230 if (req->state == NVME_TCP_SEND_DDGST)
1231 ret = nvme_tcp_try_send_ddgst(req);
1233 if (ret == -EAGAIN) {
1235 } else if (ret < 0) {
1236 dev_err(queue->ctrl->ctrl.device,
1237 "failed to send request %d\n", ret);
1238 nvme_tcp_fail_request(queue->request);
1239 nvme_tcp_done_send_req(queue);
1242 memalloc_noreclaim_restore(noreclaim_flag);
// Pull received data from the socket through its read_sock operation, with
// nvme_tcp_recv_skb() as the per-skb callback and the queue as its argument.
1246 static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
1248 struct socket *sock = queue->sock;
1249 struct sock *sk = sock->sk;
1250 read_descriptor_t rd_desc;
1253 rd_desc.arg.data = queue;
1257 consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
// Work function for queue->io_work: alternate between sending and receiving
// until nothing is pending, receive is disabled, or a deadline of about one
// millisecond passes. The work item is re-queued on io_cpu at the end.
1262 static void nvme_tcp_io_work(struct work_struct *w)
1264 struct nvme_tcp_queue *queue =
1265 container_of(w, struct nvme_tcp_queue, io_work);
1266 unsigned long deadline = jiffies + msecs_to_jiffies(1);
1269 bool pending = false;
// Send only when send_mutex can be taken without waiting.
1272 if (mutex_trylock(&queue->send_mutex)) {
1273 result = nvme_tcp_try_send(queue);
1274 mutex_unlock(&queue->send_mutex);
1277 else if (unlikely(result < 0))
1281 result = nvme_tcp_try_recv(queue);
1284 else if (unlikely(result < 0))
1287 if (!pending || !queue->rd_enabled)
1290 } while (!time_after(jiffies, deadline)); /* quota is exhausted */
1292 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
// Free both hash requests and the transform they share. The transform is
// looked up from rcv_hash before the requests are freed.
1295 static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1297 struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1299 ahash_request_free(queue->rcv_hash);
1300 ahash_request_free(queue->snd_hash);
1301 crypto_free_ahash(tfm);
// Allocate a "crc32c" ahash transform and one hash request each for the send
// and receive sides, both without a completion callback. The tail of the
// function frees snd_hash and the transform on the failure paths.
1304 static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1306 struct crypto_ahash *tfm;
1308 tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1310 return PTR_ERR(tfm);
1312 queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1313 if (!queue->snd_hash)
1315 ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1317 queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1318 if (!queue->rcv_hash)
1320 ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1324 ahash_request_free(queue->snd_hash);
1326 crypto_free_ahash(tfm);
// Release the PDU buffer of the controller's async request.
1330 static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1332 struct nvme_tcp_request *async = &ctrl->async_req;
1334 page_frag_free(async->pdu);
// Allocate a zeroed PDU buffer (command PDU plus optional header digest) for
// the async request from queue 0's page-frag cache and bind the async request
// to queue 0.
1337 static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1339 struct nvme_tcp_queue *queue = &ctrl->queues[0];
1340 struct nvme_tcp_request *async = &ctrl->async_req;
1341 u8 hdgst = nvme_tcp_hdgst_len(queue);
1343 async->pdu = page_frag_alloc(&queue->pf_cache,
1344 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1345 GFP_KERNEL | __GFP_ZERO);
1349 async->queue = &ctrl->queues[0];
// Release the resources of queue qid: digest state (if either digest was
// enabled), the page-frag cache, the socket file reference and both mutexes.
// The NVME_TCP_Q_ALLOCATED bit is tested and cleared first.
// NOTE(review): presumably nothing is freed when that bit was not set - confirm.
1353 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1355 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1356 struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1357 unsigned int noreclaim_flag;
1359 if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1362 if (queue->hdr_digest || queue->data_digest)
1363 nvme_tcp_free_crypto(queue);
1365 page_frag_cache_drain(&queue->pf_cache);
// The fput() runs between memalloc_noreclaim_save() and
// memalloc_noreclaim_restore().
1367 noreclaim_flag = memalloc_noreclaim_save();
1368 /* ->sock will be released by fput() */
1369 fput(queue->sock->file);
1371 memalloc_noreclaim_restore(noreclaim_flag);
1374 mutex_destroy(&queue->send_mutex);
1375 mutex_destroy(&queue->queue_lock);
/*
 * nvme_tcp_init_connection() - exchange ICReq/ICResp on the queue's socket.
 *
 * Builds an ICReq PDU (PFV 1.0, maxr2t 0, hpda 0, digest bits taken from the
 * queue settings), sends it with kernel_sendmsg(), receives the ICResp with
 * kernel_recvmsg() and checks the response: PDU type, PDU length, PFV, data
 * and header digest agreement, cpda and maxdata. The accepted maxdata value
 * is stored in queue->maxh2cdata.
 *
 * NOTE(review): several declarations, all error-path jumps, the cleanup of
 * icreq/icresp and the return statements are not visible in this listing.
 */
1378 static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1380 struct nvme_tcp_icreq_pdu *icreq;
1381 struct nvme_tcp_icresp_pdu *icresp;
1382 char cbuf[CMSG_LEN(sizeof(char))] = {};
1384 struct msghdr msg = {};
1386 bool ctrl_hdgst, ctrl_ddgst;
1390 icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1394 icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
/* Fill in the ICReq; hlen and plen both equal the size of the PDU. */
1400 icreq->hdr.type = nvme_tcp_icreq;
1401 icreq->hdr.hlen = sizeof(*icreq);
1403 icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1404 icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1405 icreq->maxr2t = 0; /* single inflight r2t supported */
1406 icreq->hpda = 0; /* no alignment constraint */
1407 if (queue->hdr_digest)
1408 icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1409 if (queue->data_digest)
1410 icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
/* Send the ICReq as a single kvec. */
1412 iov.iov_base = icreq;
1413 iov.iov_len = sizeof(*icreq);
1414 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1416 pr_warn("queue %d: failed to send icreq, error %d\n",
1417 nvme_tcp_queue_id(queue), ret);
/* Receive the ICResp; with TLS a control buffer is supplied as well. */
1421 memset(&msg, 0, sizeof(msg));
1422 iov.iov_base = icresp;
1423 iov.iov_len = sizeof(*icresp);
1424 if (nvme_tcp_tls(&queue->ctrl->ctrl)) {
1425 msg.msg_control = cbuf;
1426 msg.msg_controllen = sizeof(cbuf);
1428 ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1429 iov.iov_len, msg.msg_flags);
1431 pr_warn("queue %d: failed to receive icresp, error %d\n",
1432 nvme_tcp_queue_id(queue), ret);
/* With TLS, only a TLS_RECORD_TYPE_DATA record is accepted here. */
1436 if (nvme_tcp_tls(&queue->ctrl->ctrl)) {
1437 ctype = tls_get_record_type(queue->sock->sk,
1438 (struct cmsghdr *)cbuf);
1439 if (ctype != TLS_RECORD_TYPE_DATA) {
1440 pr_err("queue %d: unhandled TLS record %d\n",
1441 nvme_tcp_queue_id(queue), ctype);
1446 if (icresp->hdr.type != nvme_tcp_icresp) {
1447 pr_err("queue %d: bad type returned %d\n",
1448 nvme_tcp_queue_id(queue), icresp->hdr.type);
/*
 * NOTE(review): plen is compared after le32_to_cpu() but logged without
 * conversion; pfv is likewise logged as stored - confirm whether the log
 * values should be converted to CPU byte order.
 */
1452 if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1453 pr_err("queue %d: bad pdu length returned %d\n",
1454 nvme_tcp_queue_id(queue), icresp->hdr.plen);
1458 if (icresp->pfv != NVME_TCP_PFV_1_0) {
1459 pr_err("queue %d: bad pfv returned %d\n",
1460 nvme_tcp_queue_id(queue), icresp->pfv);
/* Host and controller must agree on the data digest setting. */
1464 ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1465 if ((queue->data_digest && !ctrl_ddgst) ||
1466 (!queue->data_digest && ctrl_ddgst)) {
1467 pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1468 nvme_tcp_queue_id(queue),
1469 queue->data_digest ? "enabled" : "disabled",
1470 ctrl_ddgst ? "enabled" : "disabled");
/* Same agreement check for the header digest setting. */
1474 ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1475 if ((queue->hdr_digest && !ctrl_hdgst) ||
1476 (!queue->hdr_digest && ctrl_hdgst)) {
1477 pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1478 nvme_tcp_queue_id(queue),
1479 queue->hdr_digest ? "enabled" : "disabled",
1480 ctrl_hdgst ? "enabled" : "disabled");
1484 if (icresp->cpda != 0) {
1485 pr_err("queue %d: unsupported cpda returned %d\n",
1486 nvme_tcp_queue_id(queue), icresp->cpda);
/* maxdata must be a multiple of 4 and at least NVME_TCP_MIN_MAXH2CDATA. */
1490 maxh2cdata = le32_to_cpu(icresp->maxdata);
1491 if ((maxh2cdata % 4) || (maxh2cdata < NVME_TCP_MIN_MAXH2CDATA)) {
1492 pr_err("queue %d: invalid maxh2cdata returned %u\n",
1493 nvme_tcp_queue_id(queue), maxh2cdata);
1496 queue->maxh2cdata = maxh2cdata;
/* nvme_tcp_admin_queue() - true when @queue has queue id 0. */
1506 static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
1508 return nvme_tcp_queue_id(queue) == 0;
/*
 * nvme_tcp_default_queue() - true when @queue is not the admin queue and its
 * id is below 1 + io_queues[HCTX_TYPE_DEFAULT].
 */
1511 static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
1513 struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1514 int qid = nvme_tcp_queue_id(queue);
1516 return !nvme_tcp_admin_queue(queue) &&
1517 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
/*
 * nvme_tcp_read_queue() - true when @queue is neither the admin queue nor a
 * default queue and its id is below
 * 1 + io_queues[HCTX_TYPE_DEFAULT] + io_queues[HCTX_TYPE_READ].
 */
1520 static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
1522 struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1523 int qid = nvme_tcp_queue_id(queue);
1525 return !nvme_tcp_admin_queue(queue) &&
1526 !nvme_tcp_default_queue(queue) &&
1527 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1528 ctrl->io_queues[HCTX_TYPE_READ];
/*
 * nvme_tcp_poll_queue() - true when @queue is not an admin, default or read
 * queue and its id is below 1 + the sum of the DEFAULT, READ and POLL
 * io_queues counts.
 */
1531 static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
1533 struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1534 int qid = nvme_tcp_queue_id(queue);
1536 return !nvme_tcp_admin_queue(queue) &&
1537 !nvme_tcp_default_queue(queue) &&
1538 !nvme_tcp_read_queue(queue) &&
1539 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1540 ctrl->io_queues[HCTX_TYPE_READ] +
1541 ctrl->io_queues[HCTX_TYPE_POLL];
/*
 * nvme_tcp_set_queue_io_cpu() - choose queue->io_cpu for @queue.
 *
 * An index n is derived from the queue id according to the queue class
 * (default, read or poll); for read and poll queues the counts of the
 * preceding classes are subtracted. io_cpu is then set either to
 * WORK_CPU_UNBOUND or to the CPU returned by cpumask_next_wrap() over
 * cpu_online_mask, starting after n - 1.
 *
 * NOTE(review): the declaration of n, its value for default queues and the
 * condition selecting between the two io_cpu assignments are not visible in
 * this listing.
 */
1544 static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
1546 struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1547 int qid = nvme_tcp_queue_id(queue);
1550 if (nvme_tcp_default_queue(queue))
1552 else if (nvme_tcp_read_queue(queue))
1553 n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
1554 else if (nvme_tcp_poll_queue(queue))
1555 n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
1556 ctrl->io_queues[HCTX_TYPE_READ] - 1;
1558 queue->io_cpu = WORK_CPU_UNBOUND;
1560 queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
/*
 * nvme_tcp_tls_done() - TLS handshake completion callback (installed as
 * args.ta_done by nvme_tcp_start_tls()).
 *
 * @data is the nvme_tcp_queue. The visible statements record -status in
 * queue->tls_err, look the negotiated key up with key_lookup(pskid), warn and
 * set -ENOKEY when the lookup returns an error pointer, store the key in
 * ctrl->ctrl.tls_key, and finally signal queue->tls_complete.
 *
 * NOTE(review): the conditions and branches joining these statements are not
 * visible in this listing. It is also not visible whether a key already held
 * in ctrl->ctrl.tls_key is released before being overwritten - confirm.
 */
1563 static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
1565 struct nvme_tcp_queue *queue = data;
1566 struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1567 int qid = nvme_tcp_queue_id(queue);
1568 struct key *tls_key;
1570 dev_dbg(ctrl->ctrl.device, "queue %d: TLS handshake done, key %x, status %d\n",
1571 qid, pskid, status);
1574 queue->tls_err = -status;
1578 tls_key = key_lookup(pskid);
1579 if (IS_ERR(tls_key)) {
1580 dev_warn(ctrl->ctrl.device, "queue %d: Invalid key %x\n",
1582 queue->tls_err = -ENOKEY;
1584 ctrl->ctrl.tls_key = tls_key;
/* Wake the waiter in nvme_tcp_start_tls(). */
1589 complete(&queue->tls_complete);
/*
 * nvme_tcp_start_tls() - start a TLS PSK client handshake on the queue socket
 * and wait for it to finish.
 *
 * Fills a tls_handshake_args with the socket, the completion callback
 * nvme_tcp_tls_done(), a single peer id (pskid) and a keyring: the one from
 * nctrl->opts when set, otherwise nvme_keyring_id(). queue->tls_err is preset
 * to -EOPNOTSUPP before tls_client_hello_psk() is called. The wait is
 * interruptible and bounded by tls_handshake_timeout seconds (tmo is in
 * jiffies, ta_timeout_ms in milliseconds).
 *
 * NOTE(review): the remaining parameter(s), the declaration of ret, the
 * branch conditions and the final return are not visible in this listing.
 */
1592 static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl,
1593 struct nvme_tcp_queue *queue,
1596 int qid = nvme_tcp_queue_id(queue);
1598 struct tls_handshake_args args;
1599 unsigned long tmo = tls_handshake_timeout * HZ;
1600 key_serial_t keyring = nvme_keyring_id();
1602 dev_dbg(nctrl->device, "queue %d: start TLS with key %x\n",
1604 memset(&args, 0, sizeof(args));
1605 args.ta_sock = queue->sock;
1606 args.ta_done = nvme_tcp_tls_done;
1607 args.ta_data = queue;
1608 args.ta_my_peerids[0] = pskid;
1609 args.ta_num_peerids = 1;
1610 if (nctrl->opts->keyring)
1611 keyring = key_serial(nctrl->opts->keyring);
1612 args.ta_keyring = keyring;
1613 args.ta_timeout_ms = tls_handshake_timeout * 1000;
1614 queue->tls_err = -EOPNOTSUPP;
1615 init_completion(&queue->tls_complete);
1616 ret = tls_client_hello_psk(&args, GFP_KERNEL);
1618 dev_err(nctrl->device, "queue %d: failed to start TLS: %d\n",
1622 ret = wait_for_completion_interruptible_timeout(&queue->tls_complete, tmo);
/* Failure branch: log and cancel the pending handshake on the socket. */
1627 dev_err(nctrl->device,
1628 "queue %d: TLS handshake failed, error %d\n",
1630 tls_handshake_cancel(queue->sock->sk);
/* Completion branch: the result is whatever the callback left in tls_err. */
1632 dev_dbg(nctrl->device,
1633 "queue %d: TLS handshake complete, error %d\n",
1634 qid, queue->tls_err);
1635 ret = queue->tls_err;
/*
 * nvme_tcp_alloc_queue() - create, configure and connect the socket of queue
 * @qid and run the NVMe/TCP connection initialization on it.
 *
 * Visible steps, in order: initialise the queue locks, lists and io_work;
 * pick the command capsule length; create the TCP socket and attach a file to
 * it; apply socket options (syn retries, nodelay, no linger, optional
 * priority and TOS, receive timeout); pick the io cpu; reset the receive
 * state; optionally bind to a source address and/or interface; allocate the
 * digest contexts and the receive PDU; connect; optionally start TLS; call
 * nvme_tcp_init_connection(); set NVME_TCP_Q_ALLOCATED.
 *
 * NOTE(review): the last parameter (pskid, used below), most branch
 * conditions, the unwind labels and the return statements are not visible in
 * this listing.
 */
1640 static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
1643 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1644 struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1645 int ret, rcv_pdu_size;
1646 struct file *sock_file;
1648 mutex_init(&queue->queue_lock);
1650 init_llist_head(&queue->req_list);
1651 INIT_LIST_HEAD(&queue->send_list);
1652 mutex_init(&queue->send_mutex);
1653 INIT_WORK(&queue->io_work, nvme_tcp_io_work);
/*
 * Two capsule sizes: one from the controller's ioccsz, one fixed admin size.
 * The selecting condition is not visible here.
 */
1656 queue->cmnd_capsule_len = nctrl->ioccsz * 16;
1658 queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1659 NVME_TCP_ADMIN_CCSZ;
1661 ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1662 IPPROTO_TCP, &queue->sock);
1664 dev_err(nctrl->device,
1665 "failed to create socket: %d\n", ret);
1666 goto err_destroy_mutex;
1669 sock_file = sock_alloc_file(queue->sock, O_CLOEXEC, NULL);
1670 if (IS_ERR(sock_file)) {
1671 ret = PTR_ERR(sock_file);
1672 goto err_destroy_mutex;
1674 nvme_tcp_reclassify_socket(queue->sock);
1676 /* Single syn retry */
1677 tcp_sock_set_syncnt(queue->sock->sk, 1);
1679 /* Set TCP no delay */
1680 tcp_sock_set_nodelay(queue->sock->sk);
1683 * Cleanup whatever is sitting in the TCP transmit queue on socket
1684 * close. This is done to prevent stale data from being sent should
1685 * the network connection be restored before TCP times out.
1687 sock_no_linger(queue->sock->sk);
1689 if (so_priority > 0)
1690 sock_set_priority(queue->sock->sk, so_priority);
1692 /* Set socket type of service */
1693 if (nctrl->opts->tos >= 0)
1694 ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
1696 /* Set 10 seconds timeout for icresp recvmsg */
1697 queue->sock->sk->sk_rcvtimeo = 10 * HZ;
1699 queue->sock->sk->sk_allocation = GFP_ATOMIC;
1700 queue->sock->sk->sk_use_task_frag = false;
1701 nvme_tcp_set_queue_io_cpu(queue);
/* Start with a clean receive state. */
1702 queue->request = NULL;
1703 queue->data_remaining = 0;
1704 queue->ddgst_remaining = 0;
1705 queue->pdu_remaining = 0;
1706 queue->pdu_offset = 0;
1707 sk_set_memalloc(queue->sock->sk);
/* Optional: bind to the source address given with host_traddr. */
1709 if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
1710 ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1711 sizeof(ctrl->src_addr));
1713 dev_err(nctrl->device,
1714 "failed to bind queue %d socket %d\n",
/* Optional: bind to the interface given with host_iface. */
1720 if (nctrl->opts->mask & NVMF_OPT_HOST_IFACE) {
1721 char *iface = nctrl->opts->host_iface;
1722 sockptr_t optval = KERNEL_SOCKPTR(iface);
1724 ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE,
1725 optval, strlen(iface));
1727 dev_err(nctrl->device,
1728 "failed to bind to interface %s queue %d err %d\n",
/* Digest contexts are only needed when a digest is enabled. */
1734 queue->hdr_digest = nctrl->opts->hdr_digest;
1735 queue->data_digest = nctrl->opts->data_digest;
1736 if (queue->hdr_digest || queue->data_digest) {
1737 ret = nvme_tcp_alloc_crypto(queue);
1739 dev_err(nctrl->device,
1740 "failed to allocate queue %d crypto\n", qid);
/* Receive PDU buffer: a response PDU plus the header digest length. */
1745 rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1746 nvme_tcp_hdgst_len(queue);
1747 queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1753 dev_dbg(nctrl->device, "connecting queue %d\n",
1754 nvme_tcp_queue_id(queue));
1756 ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1757 sizeof(ctrl->addr), 0);
1759 dev_err(nctrl->device,
1760 "failed to connect socket: %d\n", ret);
1764 /* If PSKs are configured try to start TLS */
1765 if (IS_ENABLED(CONFIG_NVME_TCP_TLS) && pskid) {
1766 ret = nvme_tcp_start_tls(nctrl, queue, pskid);
1768 goto err_init_connect;
1771 ret = nvme_tcp_init_connection(queue);
1773 goto err_init_connect;
1775 set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
/* Error unwind; the labels separating these steps are not visible here. */
1780 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1784 if (queue->hdr_digest || queue->data_digest)
1785 nvme_tcp_free_crypto(queue);
1787 /* ->sock will be released by fput() */
1788 fput(queue->sock->file);
1791 mutex_destroy(&queue->send_mutex);
1792 mutex_destroy(&queue->queue_lock);
/*
 * nvme_tcp_restore_sock_ops() - put back the socket callbacks saved in the
 * queue (data_ready, state_change, write_space) and clear sk_user_data, all
 * under the write side of sk_callback_lock with bottom halves disabled.
 */
1796 static void nvme_tcp_restore_sock_ops(struct nvme_tcp_queue *queue)
1798 struct socket *sock = queue->sock;
1800 write_lock_bh(&sock->sk->sk_callback_lock);
1801 sock->sk->sk_user_data = NULL;
1802 sock->sk->sk_data_ready = queue->data_ready;
1803 sock->sk->sk_state_change = queue->state_change;
1804 sock->sk->sk_write_space = queue->write_space;
1805 write_unlock_bh(&sock->sk->sk_callback_lock);
/*
 * __nvme_tcp_stop_queue() - shut the socket down in both directions, restore
 * the original socket callbacks and wait for io_work to finish
 * (cancel_work_sync()).
 */
1808 static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1810 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1811 nvme_tcp_restore_sock_ops(queue);
1812 cancel_work_sync(&queue->io_work);
/*
 * nvme_tcp_stop_queue() - stop queue @qid if it is live.
 *
 * Under queue_lock, NVME_TCP_Q_LIVE is cleared with test_and_clear_bit() and
 * __nvme_tcp_stop_queue() runs only when the bit was set.
 *
 * NOTE(review): the statement taken when NVME_TCP_Q_ALLOCATED is not set is
 * not visible in this listing (presumably an early return - confirm).
 */
1815 static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1817 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1818 struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1820 if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1823 mutex_lock(&queue->queue_lock);
1824 if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1825 __nvme_tcp_stop_queue(queue);
1826 mutex_unlock(&queue->queue_lock);
/*
 * nvme_tcp_setup_sock_ops() - hook the queue into its socket.
 *
 * Under the write side of sk_callback_lock: point sk_user_data at the queue,
 * save the socket's current state_change/data_ready/write_space callbacks in
 * the queue, then install the nvme_tcp_* callbacks. With
 * CONFIG_NET_RX_BUSY_POLL, sk_ll_usec is set to 1.
 */
1829 static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
1831 write_lock_bh(&queue->sock->sk->sk_callback_lock);
1832 queue->sock->sk->sk_user_data = queue;
1833 queue->state_change = queue->sock->sk->sk_state_change;
1834 queue->data_ready = queue->sock->sk->sk_data_ready;
1835 queue->write_space = queue->sock->sk->sk_write_space;
1836 queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1837 queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1838 queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1839 #ifdef CONFIG_NET_RX_BUSY_POLL
1840 queue->sock->sk->sk_ll_usec = 1;
1842 write_unlock_bh(&queue->sock->sk->sk_callback_lock);
/*
 * nvme_tcp_start_queue() - bring queue @idx up.
 *
 * Enables receive, resets the receive context, installs the socket callbacks
 * and issues the fabrics connect (nvmf_connect_io_queue() or
 * nvmf_connect_admin_queue()). NVME_TCP_Q_LIVE is then set; a further branch
 * stops the queue again if it is ALLOCATED and logs the connect failure.
 *
 * NOTE(review): the declaration of ret, the condition choosing the I/O or
 * admin connect, the success/failure test and the return are not visible in
 * this listing.
 */
1845 static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1847 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1848 struct nvme_tcp_queue *queue = &ctrl->queues[idx];
1851 queue->rd_enabled = true;
1852 nvme_tcp_init_recv_ctx(queue);
1853 nvme_tcp_setup_sock_ops(queue);
1856 ret = nvmf_connect_io_queue(nctrl, idx);
1858 ret = nvmf_connect_admin_queue(nctrl);
1861 set_bit(NVME_TCP_Q_LIVE, &queue->flags);
1863 if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1864 __nvme_tcp_stop_queue(queue);
1865 dev_err(nctrl->device,
1866 "failed to connect queue: %d ret=%d\n", idx, ret);
/*
 * nvme_tcp_free_admin_queue() - free the async request PDU (if one is
 * allocated) and then queue 0.
 *
 * async_event_work is cancelled synchronously before the PDU is freed, and
 * the pdu pointer is reset to NULL afterwards.
 */
1871 static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1873 if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1874 cancel_work_sync(&ctrl->async_event_work);
1875 nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1876 to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1879 nvme_tcp_free_queue(ctrl, 0);
/* nvme_tcp_free_io_queues() - free queues 1 .. queue_count - 1. */
1882 static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1886 for (i = 1; i < ctrl->queue_count; i++)
1887 nvme_tcp_free_queue(ctrl, i);
/* nvme_tcp_stop_io_queues() - stop queues 1 .. queue_count - 1. */
1890 static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1894 for (i = 1; i < ctrl->queue_count; i++)
1895 nvme_tcp_stop_queue(ctrl, i);
/*
 * nvme_tcp_start_io_queues() - start queues in the half-open range
 * [first, last).
 *
 * On the out_stop_queues path the queues started so far are stopped again in
 * reverse order, down to @first.
 *
 * NOTE(review): declarations, the failure test, the label and the return
 * statements are not visible in this listing.
 */
1898 static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl,
1899 int first, int last)
1903 for (i = first; i < last; i++) {
1904 ret = nvme_tcp_start_queue(ctrl, i);
1906 goto out_stop_queues;
1912 for (i--; i >= first; i--)
1913 nvme_tcp_stop_queue(ctrl, i);
/*
 * nvme_tcp_alloc_admin_queue() - allocate queue 0 and the async request.
 *
 * With TLS enabled, the PSK id is taken from opts->tls_key when set;
 * nvme_tls_psk_default() is the other visible source, and "no valid PSK
 * found" is logged on a failure path. Queue 0 is then allocated with that
 * pskid (0 without TLS) and the async request PDU is allocated; on the
 * out_free_queue path queue 0 is freed again.
 *
 * NOTE(review): the else/if lines around the default-PSK lookup, the error
 * tests, the label and the return statements are not visible in this listing.
 */
1917 static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1920 key_serial_t pskid = 0;
1922 if (nvme_tcp_tls(ctrl)) {
1923 if (ctrl->opts->tls_key)
1924 pskid = key_serial(ctrl->opts->tls_key);
1926 pskid = nvme_tls_psk_default(ctrl->opts->keyring,
1927 ctrl->opts->host->nqn,
1928 ctrl->opts->subsysnqn);
1930 dev_err(ctrl->device, "no valid PSK found\n");
1935 ret = nvme_tcp_alloc_queue(ctrl, 0, pskid);
1939 ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1941 goto out_free_queue;
1946 nvme_tcp_free_queue(ctrl, 0);
/*
 * __nvme_tcp_alloc_io_queues() - allocate queues 1 .. queue_count - 1.
 *
 * With TLS enabled and no ctrl->tls_key present, "no PSK negotiated" is
 * logged. Each I/O queue is allocated with key_serial(ctrl->tls_key) as its
 * PSK id. On the out_free_queues path the queues allocated so far are freed
 * in reverse order.
 *
 * NOTE(review): declarations, the statement following the "no PSK" message,
 * the failure test, the label and the return statements are not visible in
 * this listing.
 */
1950 static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1954 if (nvme_tcp_tls(ctrl) && !ctrl->tls_key) {
1955 dev_err(ctrl->device, "no PSK negotiated\n");
1958 for (i = 1; i < ctrl->queue_count; i++) {
1959 ret = nvme_tcp_alloc_queue(ctrl, i,
1960 key_serial(ctrl->tls_key));
1962 goto out_free_queues;
1968 for (i--; i >= 1; i--)
1969 nvme_tcp_free_queue(ctrl, i);
/*
 * nvme_tcp_alloc_io_queues() - negotiate the I/O queue count and allocate the
 * I/O queues.
 *
 * The requested count comes from nvmf_nr_io_queues() and is passed by pointer
 * to nvme_set_queue_count(). A resulting count of 0 is logged as an error.
 * Otherwise queue_count becomes nr_io_queues + 1, the per-type split is
 * written to io_queues by nvmf_set_io_queues(), and the result of
 * __nvme_tcp_alloc_io_queues() is returned.
 *
 * NOTE(review): the declaration of ret and the error returns are not visible
 * in this listing.
 */
1974 static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1976 unsigned int nr_io_queues;
1979 nr_io_queues = nvmf_nr_io_queues(ctrl->opts);
1980 ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1984 if (nr_io_queues == 0) {
1985 dev_err(ctrl->device,
1986 "unable to set any I/O queues\n");
1990 ctrl->queue_count = nr_io_queues + 1;
1991 dev_info(ctrl->device,
1992 "creating %d I/O queues.\n", nr_io_queues);
1994 nvmf_set_io_queues(ctrl->opts, nr_io_queues,
1995 to_tcp_ctrl(ctrl)->io_queues);
1996 return __nvme_tcp_alloc_io_queues(ctrl);
/*
 * nvme_tcp_destroy_io_queues() - stop the I/O queues, remove the I/O tag set
 * and free the I/O queues.
 *
 * NOTE(review): how @remove gates nvme_remove_io_tag_set() is not visible in
 * this listing.
 */
1999 static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
2001 nvme_tcp_stop_io_queues(ctrl);
2003 nvme_remove_io_tag_set(ctrl);
2004 nvme_tcp_free_io_queues(ctrl);
/*
 * nvme_tcp_configure_io_queues() - allocate and start the I/O queues.
 *
 * Visible sequence: allocate the I/O queues; allocate the I/O tag set (map
 * count HCTX_MAX_TYPES when poll queues are configured, otherwise 2); start
 * the queues covered by both the tag set and queue_count; freeze, unquiesce
 * and wait for the freeze up to NVME_IO_TIMEOUT; update the number of
 * hardware queues to queue_count - 1 and unfreeze; start any additional
 * queues. The error labels unwind in reverse order.
 *
 * NOTE(review): how @new gates these steps, the declarations, the failure
 * tests, some labels and the return statements are not visible in this
 * listing.
 */
2007 static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
2011 ret = nvme_tcp_alloc_io_queues(ctrl);
2016 ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set,
2018 ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2,
2019 sizeof(struct nvme_tcp_request));
2021 goto out_free_io_queues;
2025 * Only start IO queues for which we have allocated the tagset
2026 * and limitted it to the available queues. On reconnects, the
2027 * queue number might have changed.
2029 nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count);
2030 ret = nvme_tcp_start_io_queues(ctrl, 1, nr_queues);
2032 goto out_cleanup_connect_q;
2035 nvme_start_freeze(ctrl);
2036 nvme_unquiesce_io_queues(ctrl);
2037 if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
2039 * If we timed out waiting for freeze we are likely to
2040 * be stuck. Fail the controller initialization just
2044 nvme_unfreeze(ctrl);
2045 goto out_wait_freeze_timed_out;
2047 blk_mq_update_nr_hw_queues(ctrl->tagset,
2048 ctrl->queue_count - 1);
2049 nvme_unfreeze(ctrl);
2053 * If the number of queues has increased (reconnect case)
2054 * start all new queues now.
2056 ret = nvme_tcp_start_io_queues(ctrl, nr_queues,
2057 ctrl->tagset->nr_hw_queues + 1);
2059 goto out_wait_freeze_timed_out;
2063 out_wait_freeze_timed_out:
2064 nvme_quiesce_io_queues(ctrl);
2065 nvme_sync_io_queues(ctrl);
2066 nvme_tcp_stop_io_queues(ctrl);
2067 out_cleanup_connect_q:
2068 nvme_cancel_tagset(ctrl);
2070 nvme_remove_io_tag_set(ctrl);
2072 nvme_tcp_free_io_queues(ctrl);
/*
 * nvme_tcp_destroy_admin_queue() - stop queue 0, remove the admin tag set and
 * free the admin queue.
 *
 * NOTE(review): how @remove gates nvme_remove_admin_tag_set() is not visible
 * in this listing.
 */
2076 static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
2078 nvme_tcp_stop_queue(ctrl, 0);
2080 nvme_remove_admin_tag_set(ctrl);
2081 nvme_tcp_free_admin_queue(ctrl);
/*
 * nvme_tcp_configure_admin_queue() - allocate and start the admin queue and
 * finish controller initialization.
 *
 * Visible sequence: allocate the admin queue; allocate the admin tag set;
 * start queue 0; enable the controller; unquiesce the admin queue; run
 * nvme_init_ctrl_finish(). The trailing statements unwind those steps in
 * reverse order.
 *
 * NOTE(review): how @new gates these steps, the declaration of error, the
 * failure tests, the labels and the return statements are not visible in this
 * listing.
 */
2084 static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
2088 error = nvme_tcp_alloc_admin_queue(ctrl);
2093 error = nvme_alloc_admin_tag_set(ctrl,
2094 &to_tcp_ctrl(ctrl)->admin_tag_set,
2095 &nvme_tcp_admin_mq_ops,
2096 sizeof(struct nvme_tcp_request));
2098 goto out_free_queue;
2101 error = nvme_tcp_start_queue(ctrl, 0);
2103 goto out_cleanup_tagset;
2105 error = nvme_enable_ctrl(ctrl);
2107 goto out_stop_queue;
2109 nvme_unquiesce_admin_queue(ctrl);
2111 error = nvme_init_ctrl_finish(ctrl, false);
2113 goto out_quiesce_queue;
/* Error unwind, in reverse order of setup. */
2118 nvme_quiesce_admin_queue(ctrl);
2119 blk_sync_queue(ctrl->admin_q);
2121 nvme_tcp_stop_queue(ctrl, 0);
2122 nvme_cancel_admin_tagset(ctrl);
2125 nvme_remove_admin_tag_set(ctrl);
2127 nvme_tcp_free_admin_queue(ctrl);
/*
 * nvme_tcp_teardown_admin_queue() - quiesce and sync the admin queue, stop
 * queue 0, cancel outstanding admin requests, unquiesce and destroy the admin
 * queue.
 *
 * NOTE(review): the second parameter (remove, used below) and any condition
 * on the unquiesce call are not visible in this listing.
 */
2131 static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
2134 nvme_quiesce_admin_queue(ctrl);
2135 blk_sync_queue(ctrl->admin_q);
2136 nvme_tcp_stop_queue(ctrl, 0);
2137 nvme_cancel_admin_tagset(ctrl);
2139 nvme_unquiesce_admin_queue(ctrl);
2140 nvme_tcp_destroy_admin_queue(ctrl, remove);
/*
 * nvme_tcp_teardown_io_queues() - quiesce the admin and I/O queues, sync and
 * stop the I/O queues, cancel outstanding I/O requests, unquiesce and destroy
 * the I/O queues.
 *
 * NOTE(review): the second parameter (remove, used below), the statement
 * taken when queue_count <= 1 (presumably an early return) and any condition
 * on the unquiesce call are not visible in this listing.
 */
2143 static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
2146 if (ctrl->queue_count <= 1)
2148 nvme_quiesce_admin_queue(ctrl);
2149 nvme_quiesce_io_queues(ctrl);
2150 nvme_sync_io_queues(ctrl);
2151 nvme_tcp_stop_io_queues(ctrl);
2152 nvme_cancel_tagset(ctrl);
2154 nvme_unquiesce_io_queues(ctrl);
2155 nvme_tcp_destroy_io_queues(ctrl, remove);
/*
 * nvme_tcp_reconnect_or_remove() - schedule another reconnect attempt or
 * delete the controller.
 *
 * Acts only in the CONNECTING state; in any other state it warns once if the
 * state is NEW or LIVE. When nvmf_should_reconnect() allows it, connect_work
 * is queued on nvme_wq after opts->reconnect_delay seconds; the other visible
 * branch logs "Removing controller..." and calls nvme_delete_ctrl().
 *
 * NOTE(review): the return after the state check and the else joining the two
 * branches are not visible in this listing.
 */
2158 static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
2160 enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
2162 /* If we are resetting/deleting then do nothing */
2163 if (state != NVME_CTRL_CONNECTING) {
2164 WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE);
2168 if (nvmf_should_reconnect(ctrl)) {
2169 dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
2170 ctrl->opts->reconnect_delay);
2171 queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
2172 ctrl->opts->reconnect_delay * HZ);
2174 dev_info(ctrl->device, "Removing controller...\n");
2175 nvme_delete_ctrl(ctrl);
/*
 * nvme_tcp_setup_ctrl() - bring a controller from CONNECTING to LIVE.
 *
 * Visible sequence: configure the admin queue; report unsupported icdoff and
 * missing SGL support; warn when opts->queue_size exceeds sqsize + 1; clamp
 * sqsize to maxcmd - 1; configure the I/O queues when queue_count > 1; switch
 * the controller state to LIVE (a failure is only expected while deleting);
 * start the controller. The trailing statements tear the I/O and admin queues
 * down again on failure.
 *
 * NOTE(review): the declaration of ret, the failure tests, the icdoff
 * condition, the labels and the return statements are not visible in this
 * listing.
 */
2179 static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
2181 struct nvmf_ctrl_options *opts = ctrl->opts;
2184 ret = nvme_tcp_configure_admin_queue(ctrl, new);
2190 dev_err(ctrl->device, "icdoff is not supported!\n");
2194 if (!nvme_ctrl_sgl_supported(ctrl)) {
2196 dev_err(ctrl->device, "Mandatory sgls are not supported!\n");
2200 if (opts->queue_size > ctrl->sqsize + 1)
2201 dev_warn(ctrl->device,
2202 "queue_size %zu > ctrl sqsize %u, clamping down\n",
2203 opts->queue_size, ctrl->sqsize + 1);
2205 if (ctrl->sqsize + 1 > ctrl->maxcmd) {
2206 dev_warn(ctrl->device,
2207 "sqsize %u > ctrl maxcmd %u, clamping down\n",
2208 ctrl->sqsize + 1, ctrl->maxcmd);
2209 ctrl->sqsize = ctrl->maxcmd - 1;
2212 if (ctrl->queue_count > 1) {
2213 ret = nvme_tcp_configure_io_queues(ctrl, new);
2218 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
2220 * state change failure is ok if we started ctrl delete,
2221 * unless we're during creation of a new controller to
2222 * avoid races with teardown flow.
2224 enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
2226 WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
2227 state != NVME_CTRL_DELETING_NOIO);
2233 nvme_start_ctrl(ctrl);
/* Failure unwind: I/O queues first, then keep-alive and the admin queue. */
2237 if (ctrl->queue_count > 1) {
2238 nvme_quiesce_io_queues(ctrl);
2239 nvme_sync_io_queues(ctrl);
2240 nvme_tcp_stop_io_queues(ctrl);
2241 nvme_cancel_tagset(ctrl);
2242 nvme_tcp_destroy_io_queues(ctrl, new);
2245 nvme_stop_keep_alive(ctrl);
2246 nvme_tcp_teardown_admin_queue(ctrl, false);
/*
 * nvme_tcp_reconnect_ctrl_work() - delayed-work handler for one reconnect
 * attempt (connect_work).
 *
 * Increments nr_reconnects and calls nvme_tcp_setup_ctrl(). On success the
 * attempt count is logged and reset to 0; on failure the attempt is logged
 * and nvme_tcp_reconnect_or_remove() decides what happens next.
 *
 * NOTE(review): the jump taken on setup failure, its label and the return on
 * success are not visible in this listing.
 */
2250 static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
2252 struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
2253 struct nvme_tcp_ctrl, connect_work);
2254 struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2256 ++ctrl->nr_reconnects;
2258 if (nvme_tcp_setup_ctrl(ctrl, false))
2261 dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
2262 ctrl->nr_reconnects);
2264 ctrl->nr_reconnects = 0;
2269 dev_info(ctrl->device, "Failed reconnect attempt %d\n",
2270 ctrl->nr_reconnects);
2271 nvme_tcp_reconnect_or_remove(ctrl);
/*
 * nvme_tcp_error_recovery_work() - work handler for err_work.
 *
 * Stops keep-alive, flushes async_event_work, tears down the I/O and admin
 * queues (unquiescing each afterwards), stops authentication, moves the
 * controller to CONNECTING and hands over to nvme_tcp_reconnect_or_remove().
 * A failed state change is only expected while the controller is being
 * deleted.
 *
 * NOTE(review): the return inside the failed-state-change branch is not
 * visible in this listing.
 */
2274 static void nvme_tcp_error_recovery_work(struct work_struct *work)
2276 struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
2277 struct nvme_tcp_ctrl, err_work);
2278 struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2280 nvme_stop_keep_alive(ctrl);
2281 flush_work(&ctrl->async_event_work);
2282 nvme_tcp_teardown_io_queues(ctrl, false);
2283 /* unquiesce to fail fast pending requests */
2284 nvme_unquiesce_io_queues(ctrl);
2285 nvme_tcp_teardown_admin_queue(ctrl, false);
2286 nvme_unquiesce_admin_queue(ctrl);
2287 nvme_auth_stop(ctrl);
2289 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2290 /* state change failure is ok if we started ctrl delete */
2291 enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
2293 WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
2294 state != NVME_CTRL_DELETING_NOIO);
2298 nvme_tcp_reconnect_or_remove(ctrl);
/*
 * nvme_tcp_teardown_ctrl() - tear down the I/O queues, quiesce the admin
 * queue, disable the controller and tear down the admin queue. @shutdown is
 * forwarded to each of those steps that takes it.
 */
2301 static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
2303 nvme_tcp_teardown_io_queues(ctrl, shutdown);
2304 nvme_quiesce_admin_queue(ctrl);
2305 nvme_disable_ctrl(ctrl, shutdown);
2306 nvme_tcp_teardown_admin_queue(ctrl, shutdown);
/* nvme_tcp_delete_ctrl() - tear the controller down with shutdown = true. */
2309 static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
2311 nvme_tcp_teardown_ctrl(ctrl, true);
/*
 * nvme_reset_ctrl_work() - work handler for ctrl->reset_work.
 *
 * Stops the controller, tears it down without shutdown, moves it to
 * CONNECTING and sets it up again. On the failure path nr_reconnects is
 * incremented and nvme_tcp_reconnect_or_remove() takes over. A failed state
 * change is only expected while the controller is being deleted.
 *
 * NOTE(review): the returns and the jump/label on setup failure are not
 * visible in this listing.
 */
2314 static void nvme_reset_ctrl_work(struct work_struct *work)
2316 struct nvme_ctrl *ctrl =
2317 container_of(work, struct nvme_ctrl, reset_work);
2319 nvme_stop_ctrl(ctrl);
2320 nvme_tcp_teardown_ctrl(ctrl, false);
2322 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2323 /* state change failure is ok if we started ctrl delete */
2324 enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
2326 WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
2327 state != NVME_CTRL_DELETING_NOIO);
2331 if (nvme_tcp_setup_ctrl(ctrl, false))
2337 ++ctrl->nr_reconnects;
2338 nvme_tcp_reconnect_or_remove(ctrl);
/*
 * nvme_tcp_stop_ctrl() - wait for err_work to finish and synchronously cancel
 * the delayed connect_work.
 */
2341 static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
2343 flush_work(&to_tcp_ctrl(ctrl)->err_work);
2344 cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
/*
 * nvme_tcp_free_ctrl() - final release of a TCP controller.
 *
 * Removes the controller from the list it is linked on under
 * nvme_tcp_ctrl_mutex, frees the fabrics options and frees the queues array.
 *
 * NOTE(review): the statement taken when ctrl->list is empty and the code
 * after kfree(ctrl->queues) are not visible in this listing.
 */
2347 static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
2349 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
2351 if (list_empty(&ctrl->list))
2354 mutex_lock(&nvme_tcp_ctrl_mutex);
2355 list_del(&ctrl->list);
2356 mutex_unlock(&nvme_tcp_ctrl_mutex);
2358 nvmf_free_options(nctrl->opts);
2360 kfree(ctrl->queues);
/*
 * nvme_tcp_set_sg_null() - set the command's SGL descriptor type to a
 * transport data descriptor (NVME_TRANSPORT_SGL_DATA_DESC, subtype
 * NVME_SGL_FMT_TRANSPORT_A).
 *
 * NOTE(review): the assignments to the other descriptor fields are not
 * visible in this listing.
 */
2364 static void nvme_tcp_set_sg_null(struct nvme_command *c)
2366 struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2370 sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2371 NVME_SGL_FMT_TRANSPORT_A;
/*
 * nvme_tcp_set_sg_inline() - fill the command's SGL descriptor with an
 * offset-format data descriptor: addr is the controller's icdoff, length is
 * @data_len (both little-endian).
 */
2374 static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
2375 struct nvme_command *c, u32 data_len)
2377 struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2379 sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
2380 sg->length = cpu_to_le32(data_len);
2381 sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
/*
 * nvme_tcp_set_sg_host_data() - fill the command's SGL descriptor with a
 * transport data descriptor of length data_len (little-endian).
 *
 * NOTE(review): the second parameter (data_len, used below) and the
 * assignment to sg->addr are not visible in this listing.
 */
2384 static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
2387 struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2390 sg->length = cpu_to_le32(data_len);
2391 sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2392 NVME_SGL_FMT_TRANSPORT_A;
/*
 * nvme_tcp_submit_async_event() - build and queue the async event request on
 * queue 0.
 *
 * The preallocated async PDU is zeroed and filled as a command PDU (header
 * digest flag and length included when enabled) carrying an
 * nvme_admin_async_event command with command id NVME_AQ_BLK_MQ_DEPTH and a
 * null SGL. The request state is reset to NVME_TCP_SEND_CMD_PDU with no data
 * before it is handed to nvme_tcp_queue_request().
 */
2395 static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
2397 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
2398 struct nvme_tcp_queue *queue = &ctrl->queues[0];
2399 struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
2400 struct nvme_command *cmd = &pdu->cmd;
2401 u8 hdgst = nvme_tcp_hdgst_len(queue);
2403 memset(pdu, 0, sizeof(*pdu));
2404 pdu->hdr.type = nvme_tcp_cmd;
2405 if (queue->hdr_digest)
2406 pdu->hdr.flags |= NVME_TCP_F_HDGST;
2407 pdu->hdr.hlen = sizeof(*pdu);
2408 pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
2410 cmd->common.opcode = nvme_admin_async_event;
2411 cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
2412 cmd->common.flags |= NVME_CMD_SGL_METABUF;
2413 nvme_tcp_set_sg_null(cmd);
2415 ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
2416 ctrl->async_req.offset = 0;
2417 ctrl->async_req.curr_bio = NULL;
2418 ctrl->async_req.data_len = 0;
2420 nvme_tcp_queue_request(&ctrl->async_req, true, true);
/*
 * nvme_tcp_complete_timed_out() - stop the queue the request belongs to, then
 * complete the request via nvmf_complete_timed_out_request().
 */
2423 static void nvme_tcp_complete_timed_out(struct request *rq)
2425 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2426 struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2428 nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
2429 nvmf_complete_timed_out_request(rq);
/*
 * nvme_tcp_timeout() - blk-mq timeout handler (.timeout in both ops tables of
 * this file).
 *
 * Logs the timed-out request. When the controller is not LIVE the request is
 * completed right away through nvme_tcp_complete_timed_out(); otherwise error
 * recovery is started and BLK_EH_RESET_TIMER is returned.
 *
 * NOTE(review): the return value of the non-LIVE branch is not visible in
 * this listing.
 */
2432 static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
2434 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2435 struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2436 struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
2437 struct nvme_command *cmd = &pdu->cmd;
2438 int qid = nvme_tcp_queue_id(req->queue);
2440 dev_warn(ctrl->device,
2441 "I/O tag %d (%04x) type %d opcode %#x (%s) QID %d timeout\n",
2442 rq->tag, nvme_cid(rq), pdu->hdr.type, cmd->common.opcode,
2443 nvme_fabrics_opcode_str(qid, cmd), qid);
2445 if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) {
2447 * If we are resetting, connecting or deleting we should
2448 * complete immediately because we may block controller
2449 * teardown or setup sequence
2450 * - ctrl disable/shutdown fabrics requests
2451 * - connect requests
2452 * - initialization admin requests
2453 * - I/O requests that entered after unquiescing and
2454 * the controller stopped responding
2456 * All other requests should be cancelled by the error
2457 * recovery work, so it's fine that we fail it here.
2459 nvme_tcp_complete_timed_out(rq);
2464 * LIVE state should trigger the normal error recovery which will
2465 * handle completing this request.
2467 nvme_tcp_error_recovery(ctrl);
2468 return BLK_EH_RESET_TIMER;
/*
 * nvme_tcp_map_data() - choose the SGL descriptor for a request.
 *
 * No physical segments: null SGL. A write whose data length fits in the
 * inline data size: inline (offset) SGL. Host-data SGL is the remaining
 * visible case.
 *
 * NOTE(review): the second parameter (rq, used below), the else before the
 * host-data call and the return statement are not visible in this listing.
 */
2471 static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
2474 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2475 struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
2476 struct nvme_command *c = &pdu->cmd;
2478 c->common.flags |= NVME_CMD_SGL_METABUF;
2480 if (!blk_rq_nr_phys_segments(rq))
2481 nvme_tcp_set_sg_null(c);
2482 else if (rq_data_dir(rq) == WRITE &&
2483 req->data_len <= nvme_tcp_inline_data_size(req))
2484 nvme_tcp_set_sg_inline(queue, c, req->data_len);
2486 nvme_tcp_set_sg_host_data(c, req->data_len);
/*
 * nvme_tcp_setup_cmd_pdu() - prepare the request state and command PDU for a
 * block request.
 *
 * Calls nvme_setup_cmd(), resets the per-request send state, derives data_len
 * from the request payload (0 without physical segments), initialises the
 * data iterator when there is a bio with data, and marks writes that fit the
 * inline data size as carrying their data in the command PDU (pdu_len). The
 * PDU header gets the digest flags, hlen, pdo and a plen covering header,
 * header digest, inline data and data digest. Finally the SGL is mapped; a
 * mapping failure cleans the command up and is logged.
 *
 * NOTE(review): the second parameter (rq), the declaration of ret, several
 * field resets, the left-hand side of the plen assignment and the return
 * statements are not visible in this listing.
 */
2491 static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
2494 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2495 struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
2496 struct nvme_tcp_queue *queue = req->queue;
2497 u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
2500 ret = nvme_setup_cmd(ns, rq);
2504 req->state = NVME_TCP_SEND_CMD_PDU;
2505 req->status = cpu_to_le16(NVME_SC_SUCCESS);
2510 req->h2cdata_left = 0;
2511 req->data_len = blk_rq_nr_phys_segments(rq) ?
2512 blk_rq_payload_bytes(rq) : 0;
2513 req->curr_bio = rq->bio;
2514 if (req->curr_bio && req->data_len)
2515 nvme_tcp_init_iter(req, rq_data_dir(rq));
/* Small writes travel inline with the command PDU. */
2517 if (rq_data_dir(rq) == WRITE &&
2518 req->data_len <= nvme_tcp_inline_data_size(req))
2519 req->pdu_len = req->data_len;
2521 pdu->hdr.type = nvme_tcp_cmd;
2523 if (queue->hdr_digest)
2524 pdu->hdr.flags |= NVME_TCP_F_HDGST;
/* A data digest is only added when the PDU actually carries data. */
2525 if (queue->data_digest && req->pdu_len) {
2526 pdu->hdr.flags |= NVME_TCP_F_DDGST;
2527 ddgst = nvme_tcp_ddgst_len(queue);
2529 pdu->hdr.hlen = sizeof(*pdu);
2530 pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
2532 cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
2534 ret = nvme_tcp_map_data(queue, rq);
2535 if (unlikely(ret)) {
2536 nvme_cleanup_cmd(rq);
2537 dev_err(queue->ctrl->ctrl.device,
2538 "Failed to map data (%d)\n", ret);
/*
 * nvme_tcp_commit_rqs() - blk-mq .commit_rqs hook: if the queue's request
 * list is not empty, schedule io_work on the queue's io_cpu.
 */
2545 static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
2547 struct nvme_tcp_queue *queue = hctx->driver_data;
2549 if (!llist_empty(&queue->req_list))
2550 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
/*
 * nvme_tcp_queue_rq() - blk-mq .queue_rq hook.
 *
 * Fails the request through nvme_fail_nonready_command() when
 * nvme_check_ready() rejects it (queue readiness is the NVME_TCP_Q_LIVE bit).
 * Otherwise the command PDU is set up, the request is started and handed to
 * nvme_tcp_queue_request(), passing bd->last through.
 *
 * NOTE(review): the declaration of ret, the check of the setup result and the
 * final return are not visible in this listing.
 */
2553 static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
2554 const struct blk_mq_queue_data *bd)
2556 struct nvme_ns *ns = hctx->queue->queuedata;
2557 struct nvme_tcp_queue *queue = hctx->driver_data;
2558 struct request *rq = bd->rq;
2559 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2560 bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
2563 if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2564 return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
2566 ret = nvme_tcp_setup_cmd_pdu(ns, rq);
2570 nvme_start_request(rq);
2572 nvme_tcp_queue_request(req, true, bd->last);
/*
 * nvme_tcp_map_queues() - blk-mq .map_queues hook: delegate to
 * nvmf_map_queues() with the controller's io_queues split.
 */
2577 static void nvme_tcp_map_queues(struct blk_mq_tag_set *set)
2579 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);
2581 nvmf_map_queues(set, &ctrl->ctrl, ctrl->io_queues);
/*
 * nvme_tcp_poll() - blk-mq .poll hook.
 *
 * With NVME_TCP_Q_POLLING set for the duration: busy-loops on the socket when
 * that is possible and its receive queue is empty, then runs
 * nvme_tcp_try_recv(). Returns queue->nr_cqe.
 *
 * NOTE(review): the return value used when the queue is not LIVE is not
 * visible in this listing.
 */
2584 static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
2586 struct nvme_tcp_queue *queue = hctx->driver_data;
2587 struct sock *sk = queue->sock->sk;
2589 if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
2592 set_bit(NVME_TCP_Q_POLLING, &queue->flags);
2593 if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
2594 sk_busy_loop(sk, true);
2595 nvme_tcp_try_recv(queue);
2596 clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
2597 return queue->nr_cqe;
/*
 * nvme_tcp_get_address() - .get_address hook: format the controller address
 * into @buf and append the local socket address of queue 0.
 *
 * nvmf_get_address() writes the base string. Under queue_lock, and only for a
 * LIVE queue, the socket name is fetched and ",src_addr=<addr>\n" is appended
 * in place of the trailing newline (the comma is omitted when nothing
 * precedes it).
 *
 * NOTE(review): the declarations of len and ret, the jumps/label for the
 * not-LIVE and getsockname-failure cases and the return statement are not
 * visible in this listing.
 */
2600 static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
2602 struct nvme_tcp_queue *queue = &to_tcp_ctrl(ctrl)->queues[0];
2603 struct sockaddr_storage src_addr;
2606 len = nvmf_get_address(ctrl, buf, size);
2608 mutex_lock(&queue->queue_lock);
2610 if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
2612 ret = kernel_getsockname(queue->sock, (struct sockaddr *)&src_addr);
2615 len--; /* strip trailing newline */
2616 len += scnprintf(buf + len, size - len, "%ssrc_addr=%pISc\n",
2617 (len) ? "," : "", &src_addr);
2620 mutex_unlock(&queue->queue_lock);
/*
 * blk-mq operations table wired to the nvme_tcp_* handlers; besides the
 * common hooks it provides commit_rqs, map_queues and poll.
 * NOTE(review): presumably the table for I/O queues - confirm at the tag-set
 * allocation call.
 */
2625 static const struct blk_mq_ops nvme_tcp_mq_ops = {
2626 .queue_rq = nvme_tcp_queue_rq,
2627 .commit_rqs = nvme_tcp_commit_rqs,
2628 .complete = nvme_complete_rq,
2629 .init_request = nvme_tcp_init_request,
2630 .exit_request = nvme_tcp_exit_request,
2631 .init_hctx = nvme_tcp_init_hctx,
2632 .timeout = nvme_tcp_timeout,
2633 .map_queues = nvme_tcp_map_queues,
2634 .poll = nvme_tcp_poll,
/*
 * blk-mq operations table passed to nvme_alloc_admin_tag_set() for the admin
 * queue; it uses nvme_tcp_init_admin_hctx and has no commit_rqs, map_queues
 * or poll hook.
 */
2637 static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
2638 .queue_rq = nvme_tcp_queue_rq,
2639 .complete = nvme_complete_rq,
2640 .init_request = nvme_tcp_init_request,
2641 .exit_request = nvme_tcp_exit_request,
2642 .init_hctx = nvme_tcp_init_admin_hctx,
2643 .timeout = nvme_tcp_timeout,
/*
 * Controller operations passed to nvme_init_ctrl(): fabrics register
 * accessors plus the TCP-specific free/delete/stop, async-event and
 * get-address hooks. Flags mark the controller as fabrics and blocking.
 * NOTE(review): one initializer line (presumably .name) is not visible in
 * this listing.
 */
2646 static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
2648 .module = THIS_MODULE,
2649 .flags = NVME_F_FABRICS | NVME_F_BLOCKING,
2650 .reg_read32 = nvmf_reg_read32,
2651 .reg_read64 = nvmf_reg_read64,
2652 .reg_write32 = nvmf_reg_write32,
2653 .free_ctrl = nvme_tcp_free_ctrl,
2654 .submit_async_event = nvme_tcp_submit_async_event,
2655 .delete_ctrl = nvme_tcp_delete_ctrl,
2656 .get_address = nvme_tcp_get_address,
2657 .stop_ctrl = nvme_tcp_stop_ctrl,
/*
 * nvme_tcp_existing_controller() - walk nvme_tcp_ctrl_list under
 * nvme_tcp_ctrl_mutex and evaluate nvmf_ip_options_match() for each
 * controller against @opts.
 *
 * NOTE(review): the return type line, the declaration of found, the loop exit
 * on a match and the return statement are not visible in this listing.
 */
2661 nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
2663 struct nvme_tcp_ctrl *ctrl;
2666 mutex_lock(&nvme_tcp_ctrl_mutex);
2667 list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
2668 found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2672 mutex_unlock(&nvme_tcp_ctrl_mutex);
/*
 * Allocate and bring up a new nvme-tcp controller described by @opts.
 *
 * @dev:  device passed through to nvme_init_ctrl()
 * @opts: fabrics connect options; stored in ctrl->ctrl.opts and may be
 *        updated here (a default trsvcid is filled in when none was given)
 *
 * Failure paths return ERR_PTR(): -ENOMEM when the controller allocation
 * fails, otherwise ERR_PTR(ret).
 * NOTE(review): presumably returns &ctrl->ctrl on success - confirm against
 * the full function body.
 */
2677 static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
2678 struct nvmf_ctrl_options *opts)
2680 struct nvme_tcp_ctrl *ctrl;
2683 ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2685 return ERR_PTR(-ENOMEM);
2687 INIT_LIST_HEAD(&ctrl->list);
2688 ctrl->ctrl.opts = opts;
/*
 * Total queues: default I/O, write and poll queues, plus one more
 * (NOTE(review): presumably the admin queue - confirm).
 */
2689 ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
2690 opts->nr_poll_queues + 1;
/* sqsize is stored as queue_size - 1. */
2691 ctrl->ctrl.sqsize = opts->queue_size - 1;
2692 ctrl->ctrl.kato = opts->kato;
/* Work items: reconnect (delayed), error recovery and controller reset. */
2694 INIT_DELAYED_WORK(&ctrl->connect_work,
2695 nvme_tcp_reconnect_ctrl_work);
2696 INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
2697 INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
/*
 * No trsvcid supplied: duplicate the stringified NVME_TCP_DISC_PORT as
 * the default and mark the option as present in opts->mask.
 */
2699 if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2701 kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
2702 if (!opts->trsvcid) {
2706 opts->mask |= NVMF_OPT_TRSVCID;
/* Parse the target address and service id into ctrl->addr. */
2709 ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2710 opts->traddr, opts->trsvcid, &ctrl->addr);
2712 pr_err("malformed address passed: %s:%s\n",
2713 opts->traddr, opts->trsvcid);
/* Optional host source address, parsed without a port into ctrl->src_addr. */
2717 if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2718 ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2719 opts->host_traddr, NULL, &ctrl->src_addr);
2721 pr_err("malformed src address passed: %s\n",
/* Optional host interface: the name must resolve to a device in init_net. */
2727 if (opts->mask & NVMF_OPT_HOST_IFACE) {
2728 if (!__dev_get_by_name(&init_net, opts->host_iface)) {
2729 pr_err("invalid interface passed: %s\n",
/* Unless duplicate connections are allowed, check for a matching controller. */
2736 if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
/* One zero-initialized queue structure per entry in queue_count. */
2741 ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
2743 if (!ctrl->queues) {
2748 ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
2750 goto out_kfree_queues;
/* The controller must enter NVME_CTRL_CONNECTING before setup is attempted. */
2752 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2755 goto out_uninit_ctrl;
2758 ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
2760 goto out_uninit_ctrl;
2762 dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp, hostnqn: %s\n",
2763 nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr, opts->host->nqn);
/* Add the controller to the global list under nvme_tcp_ctrl_mutex. */
2765 mutex_lock(&nvme_tcp_ctrl_mutex);
2766 list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
2767 mutex_unlock(&nvme_tcp_ctrl_mutex);
/*
 * Error unwinding: undo nvme_init_ctrl() and drop a controller reference
 * before returning ERR_PTR(ret).
 */
2772 nvme_uninit_ctrl(&ctrl->ctrl);
2773 nvme_put_ctrl(&ctrl->ctrl);
2776 return ERR_PTR(ret);
/* Failures from nvme_init_ctrl() jump to out_kfree_queues; the queue array is freed here. */
2778 kfree(ctrl->queues);
2781 return ERR_PTR(ret);
/*
 * Fabrics transport descriptor, registered in nvme_tcp_init_module() and
 * unregistered in nvme_tcp_cleanup_module(). NVMF_OPT_TRADDR is the only
 * required option; allowed_opts lists the optional ones accepted.
 * Controllers are created through nvme_tcp_create_ctrl().
 */
2784 static struct nvmf_transport_ops nvme_tcp_transport = {
2786 .module = THIS_MODULE,
2787 .required_opts = NVMF_OPT_TRADDR,
2788 .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2789 NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2790 NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
2791 NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
2792 NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE | NVMF_OPT_TLS |
2793 NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY,
2794 .create_ctrl = nvme_tcp_create_ctrl,
/*
 * Module init: check the PDU structure sizes at build time, allocate the
 * nvme_tcp_wq workqueue and register the TCP transport with the fabrics
 * layer.
 */
2797 static int __init nvme_tcp_init_module(void)
/* Base workqueue flags; WQ_UNBOUND may be OR-ed in below. */
2799 unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
/* Build-time checks that each PDU structure has the expected size in bytes. */
2801 BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
2802 BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
2803 BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24);
2804 BUILD_BUG_ON(sizeof(struct nvme_tcp_rsp_pdu) != 24);
2805 BUILD_BUG_ON(sizeof(struct nvme_tcp_r2t_pdu) != 24);
2806 BUILD_BUG_ON(sizeof(struct nvme_tcp_icreq_pdu) != 128);
2807 BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128);
2808 BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24);
/* NOTE(review): presumably applied only when the wq_unbound module parameter is set - confirm. */
2811 wq_flags |= WQ_UNBOUND;
2813 nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0);
2817 nvmf_register_transport(&nvme_tcp_transport);
/*
 * Module exit: unregister the transport, call nvme_delete_ctrl() on every
 * controller on nvme_tcp_ctrl_list, flush nvme_delete_wq and finally
 * destroy nvme_tcp_wq.
 */
2821 static void __exit nvme_tcp_cleanup_module(void)
2823 struct nvme_tcp_ctrl *ctrl;
/* Unregister first, before the existing controllers are torn down. */
2825 nvmf_unregister_transport(&nvme_tcp_transport);
/* Walk the controller list under nvme_tcp_ctrl_mutex. */
2827 mutex_lock(&nvme_tcp_ctrl_mutex);
2828 list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
2829 nvme_delete_ctrl(&ctrl->ctrl);
2830 mutex_unlock(&nvme_tcp_ctrl_mutex);
/* NOTE(review): presumably waits for the deletions requested above to finish - confirm. */
2831 flush_workqueue(nvme_delete_wq);
2833 destroy_workqueue(nvme_tcp_wq);
/* Module entry/exit hooks and module metadata. */
2836 module_init(nvme_tcp_init_module);
2837 module_exit(nvme_tcp_cleanup_module);
2839 MODULE_DESCRIPTION("NVMe host TCP transport driver");
2840 MODULE_LICENSE("GPL v2");