1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVMe over Fabrics TCP host.
4  * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 #include <linux/module.h>
8 #include <linux/init.h>
9 #include <linux/slab.h>
10 #include <linux/err.h>
11 #include <linux/nvme-tcp.h>
12 #include <net/sock.h>
13 #include <net/tcp.h>
14 #include <linux/blk-mq.h>
15 #include <crypto/hash.h>
16 #include <net/busy_poll.h>
17
18 #include "nvme.h"
19 #include "fabrics.h"
20
21 struct nvme_tcp_queue;
22
23 /* Define the socket priority to use for connections where it is desirable
24  * that the NIC consider performing optimized packet processing or filtering.
25  * A non-zero value is sufficient to indicate general consideration of any
26  * possible optimization.  Making it a module param allows for alternative
27  * values that may be unique for some NIC implementations.
28  */
29 static int so_priority;
30 module_param(so_priority, int, 0644);
31 MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
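/*
 * Example usage: the priority can be set at module load time, e.g.
 * "modprobe nvme_tcp so_priority=6", or adjusted at runtime through
 * /sys/module/nvme_tcp/parameters/so_priority (the parameter is 0644).
 */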
32
33 enum nvme_tcp_send_state {
34         NVME_TCP_SEND_CMD_PDU = 0,
35         NVME_TCP_SEND_H2C_PDU,
36         NVME_TCP_SEND_DATA,
37         NVME_TCP_SEND_DDGST,
38 };
39
40 struct nvme_tcp_request {
41         struct nvme_request     req;
42         void                    *pdu;
43         struct nvme_tcp_queue   *queue;
44         u32                     data_len;
45         u32                     pdu_len;
46         u32                     pdu_sent;
47         u16                     ttag;
48         struct list_head        entry;
49         __le32                  ddgst;
50
51         struct bio              *curr_bio;
52         struct iov_iter         iter;
53
54         /* send state */
55         size_t                  offset;
56         size_t                  data_sent;
57         enum nvme_tcp_send_state state;
58 };
59
60 enum nvme_tcp_queue_flags {
61         NVME_TCP_Q_ALLOCATED    = 0,
62         NVME_TCP_Q_LIVE         = 1,
63         NVME_TCP_Q_POLLING      = 2,
64 };
65
66 enum nvme_tcp_recv_state {
67         NVME_TCP_RECV_PDU = 0,
68         NVME_TCP_RECV_DATA,
69         NVME_TCP_RECV_DDGST,
70 };
71
72 struct nvme_tcp_ctrl;
73 struct nvme_tcp_queue {
74         struct socket           *sock;
75         struct work_struct      io_work;
76         int                     io_cpu;
77
78         spinlock_t              lock;
79         struct mutex            send_mutex;
80         struct list_head        send_list;
81
82         /* recv state */
83         void                    *pdu;
84         int                     pdu_remaining;
85         int                     pdu_offset;
86         size_t                  data_remaining;
87         size_t                  ddgst_remaining;
88         unsigned int            nr_cqe;
89
90         /* send state */
91         struct nvme_tcp_request *request;
92
93         int                     queue_size;
94         size_t                  cmnd_capsule_len;
95         struct nvme_tcp_ctrl    *ctrl;
96         unsigned long           flags;
97         bool                    rd_enabled;
98
99         bool                    hdr_digest;
100         bool                    data_digest;
101         struct ahash_request    *rcv_hash;
102         struct ahash_request    *snd_hash;
103         __le32                  exp_ddgst;
104         __le32                  recv_ddgst;
105
106         struct page_frag_cache  pf_cache;
107
108         void (*state_change)(struct sock *);
109         void (*data_ready)(struct sock *);
110         void (*write_space)(struct sock *);
111 };
112
113 struct nvme_tcp_ctrl {
114         /* read only in the hot path */
115         struct nvme_tcp_queue   *queues;
116         struct blk_mq_tag_set   tag_set;
117
118         /* other member variables */
119         struct list_head        list;
120         struct blk_mq_tag_set   admin_tag_set;
121         struct sockaddr_storage addr;
122         struct sockaddr_storage src_addr;
123         struct nvme_ctrl        ctrl;
124
125         struct work_struct      err_work;
126         struct delayed_work     connect_work;
127         struct nvme_tcp_request async_req;
128         u32                     io_queues[HCTX_MAX_TYPES];
129 };
130
131 static LIST_HEAD(nvme_tcp_ctrl_list);
132 static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
133 static struct workqueue_struct *nvme_tcp_wq;
134 static const struct blk_mq_ops nvme_tcp_mq_ops;
135 static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
136 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
137
138 static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
139 {
140         return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
141 }
142
143 static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
144 {
145         return queue - queue->ctrl->queues;
146 }
147
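/*
 * Queue 0 is the admin queue and uses the admin tag set; I/O queue N
 * (N >= 1) maps to tag_set.tags[N - 1].
 */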
148 static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
149 {
150         u32 queue_idx = nvme_tcp_queue_id(queue);
151
152         if (queue_idx == 0)
153                 return queue->ctrl->admin_tag_set.tags[queue_idx];
154         return queue->ctrl->tag_set.tags[queue_idx - 1];
155 }
156
157 static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
158 {
159         return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
160 }
161
162 static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
163 {
164         return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
165 }
166
167 static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
168 {
169         return queue->cmnd_capsule_len - sizeof(struct nvme_command);
170 }
171
172 static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
173 {
174         return req == &req->queue->ctrl->async_req;
175 }
176
177 static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
178 {
179         struct request *rq;
180
181         if (unlikely(nvme_tcp_async_req(req)))
182                 return false; /* async events don't have a request */
183
184         rq = blk_mq_rq_from_pdu(req);
185
186         return rq_data_dir(rq) == WRITE && req->data_len &&
187                 req->data_len <= nvme_tcp_inline_data_size(req->queue);
188 }
189
190 static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
191 {
192         return req->iter.bvec->bv_page;
193 }
194
195 static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
196 {
197         return req->iter.bvec->bv_offset + req->iter.iov_offset;
198 }
199
200 static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
201 {
202         return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset,
203                         req->pdu_len - req->pdu_sent);
204 }
205
206 static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req)
207 {
208         return req->iter.iov_offset;
209 }
210
211 static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
212 {
213         return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
214                         req->pdu_len - req->pdu_sent : 0;
215 }
216
217 static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
218                 int len)
219 {
220         return nvme_tcp_pdu_data_left(req) <= len;
221 }
222
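/*
 * Set up req->iter over the request payload: either the single special
 * payload vector (e.g. a discard payload) or the bvecs of the current
 * bio, starting at whatever the bio iterator has already consumed.
 */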
223 static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
224                 unsigned int dir)
225 {
226         struct request *rq = blk_mq_rq_from_pdu(req);
227         struct bio_vec *vec;
228         unsigned int size;
229         int nsegs;
230         size_t offset;
231
232         if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
233                 vec = &rq->special_vec;
234                 nsegs = 1;
235                 size = blk_rq_payload_bytes(rq);
236                 offset = 0;
237         } else {
238                 struct bio *bio = req->curr_bio;
239
240                 vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
241                 nsegs = bio_segments(bio);
242                 size = bio->bi_iter.bi_size;
243                 offset = bio->bi_iter.bi_bvec_done;
244         }
245
246         iov_iter_bvec(&req->iter, dir, vec, nsegs, size);
247         req->iter.iov_offset = offset;
248 }
249
250 static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
251                 int len)
252 {
253         req->data_sent += len;
254         req->pdu_sent += len;
255         iov_iter_advance(&req->iter, len);
256         if (!iov_iter_count(&req->iter) &&
257             req->data_sent < req->data_len) {
258                 req->curr_bio = req->curr_bio->bi_next;
259                 nvme_tcp_init_iter(req, WRITE);
260         }
261 }
262
263 static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
264                 bool sync)
265 {
266         struct nvme_tcp_queue *queue = req->queue;
267         bool empty;
268
269         spin_lock(&queue->lock);
270         empty = list_empty(&queue->send_list) && !queue->request;
271         list_add_tail(&req->entry, &queue->send_list);
272         spin_unlock(&queue->lock);
273
274         /*
275          * If we're the first on the send_list, try to send directly;
276          * otherwise queue io_work. Also, only send directly if we are
277          * on the same cpu, so we don't introduce contention.
278          */
279         if (queue->io_cpu == smp_processor_id() &&
280             sync && empty && mutex_trylock(&queue->send_mutex)) {
281                 nvme_tcp_try_send(queue);
282                 mutex_unlock(&queue->send_mutex);
283         } else {
284                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
285         }
286 }
287
288 static inline struct nvme_tcp_request *
289 nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
290 {
291         struct nvme_tcp_request *req;
292
293         spin_lock(&queue->lock);
294         req = list_first_entry_or_null(&queue->send_list,
295                         struct nvme_tcp_request, entry);
296         if (req)
297                 list_del(&req->entry);
298         spin_unlock(&queue->lock);
299
300         return req;
301 }
302
303 static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
304                 __le32 *dgst)
305 {
306         ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
307         crypto_ahash_final(hash);
308 }
309
310 static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
311                 struct page *page, off_t off, size_t len)
312 {
313         struct scatterlist sg;
314
315         sg_init_marker(&sg, 1);
316         sg_set_page(&sg, page, len, off);
317         ahash_request_set_crypt(hash, &sg, NULL, len);
318         crypto_ahash_update(hash);
319 }
320
321 static inline void nvme_tcp_hdgst(struct ahash_request *hash,
322                 void *pdu, size_t len)
323 {
324         struct scatterlist sg;
325
326         sg_init_one(&sg, pdu, len);
327         ahash_request_set_crypt(hash, &sg, pdu + len, len);
328         crypto_ahash_digest(hash);
329 }
330
331 static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
332                 void *pdu, size_t pdu_len)
333 {
334         struct nvme_tcp_hdr *hdr = pdu;
335         __le32 recv_digest;
336         __le32 exp_digest;
337
338         if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
339                 dev_err(queue->ctrl->ctrl.device,
340                         "queue %d: header digest flag is cleared\n",
341                         nvme_tcp_queue_id(queue));
342                 return -EPROTO;
343         }
344
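        /*
         * nvme_tcp_hdgst() stores the computed digest right after the header
         * (at pdu + hlen), overwriting the received value, so the on-wire
         * digest is saved first and compared afterwards.
         */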
345         recv_digest = *(__le32 *)(pdu + hdr->hlen);
346         nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
347         exp_digest = *(__le32 *)(pdu + hdr->hlen);
348         if (recv_digest != exp_digest) {
349                 dev_err(queue->ctrl->ctrl.device,
350                         "header digest error: recv %#x expected %#x\n",
351                         le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
352                 return -EIO;
353         }
354
355         return 0;
356 }
357
358 static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
359 {
360         struct nvme_tcp_hdr *hdr = pdu;
361         u8 digest_len = nvme_tcp_hdgst_len(queue);
362         u32 len;
363
364         len = le32_to_cpu(hdr->plen) - hdr->hlen -
365                 ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
366
367         if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
368                 dev_err(queue->ctrl->ctrl.device,
369                         "queue %d: data digest flag is cleared\n",
370                         nvme_tcp_queue_id(queue));
371                 return -EPROTO;
372         }
373         crypto_ahash_init(queue->rcv_hash);
374
375         return 0;
376 }
377
378 static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
379                 struct request *rq, unsigned int hctx_idx)
380 {
381         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
382
383         page_frag_free(req->pdu);
384 }
385
386 static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
387                 struct request *rq, unsigned int hctx_idx,
388                 unsigned int numa_node)
389 {
390         struct nvme_tcp_ctrl *ctrl = set->driver_data;
391         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
392         int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
393         struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
394         u8 hdgst = nvme_tcp_hdgst_len(queue);
395
396         req->pdu = page_frag_alloc(&queue->pf_cache,
397                 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
398                 GFP_KERNEL | __GFP_ZERO);
399         if (!req->pdu)
400                 return -ENOMEM;
401
402         req->queue = queue;
403         nvme_req(rq)->ctrl = &ctrl->ctrl;
404
405         return 0;
406 }
407
408 static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
409                 unsigned int hctx_idx)
410 {
411         struct nvme_tcp_ctrl *ctrl = data;
412         struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
413
414         hctx->driver_data = queue;
415         return 0;
416 }
417
418 static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
419                 unsigned int hctx_idx)
420 {
421         struct nvme_tcp_ctrl *ctrl = data;
422         struct nvme_tcp_queue *queue = &ctrl->queues[0];
423
424         hctx->driver_data = queue;
425         return 0;
426 }
427
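/*
 * The receive state is not stored explicitly; it is derived from the
 * remaining-byte counters: an outstanding PDU header takes precedence,
 * then an outstanding data digest, otherwise payload data is expected.
 */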
428 static enum nvme_tcp_recv_state
429 nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
430 {
431         return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
432                 (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
433                 NVME_TCP_RECV_DATA;
434 }
435
436 static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
437 {
438         queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
439                                 nvme_tcp_hdgst_len(queue);
440         queue->pdu_offset = 0;
441         queue->data_remaining = -1;
442         queue->ddgst_remaining = 0;
443 }
444
445 static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
446 {
447         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
448                 return;
449
450         queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
451 }
452
453 static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
454                 struct nvme_completion *cqe)
455 {
456         struct request *rq;
457
458         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
459         if (!rq) {
460                 dev_err(queue->ctrl->ctrl.device,
461                         "queue %d tag 0x%x not found\n",
462                         nvme_tcp_queue_id(queue), cqe->command_id);
463                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
464                 return -EINVAL;
465         }
466
467         if (!nvme_end_request(rq, cqe->status, cqe->result))
468                 nvme_complete_rq(rq);
469         queue->nr_cqe++;
470
471         return 0;
472 }
473
474 static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
475                 struct nvme_tcp_data_pdu *pdu)
476 {
477         struct request *rq;
478
479         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
480         if (!rq) {
481                 dev_err(queue->ctrl->ctrl.device,
482                         "queue %d tag %#x not found\n",
483                         nvme_tcp_queue_id(queue), pdu->command_id);
484                 return -ENOENT;
485         }
486
487         if (!blk_rq_payload_bytes(rq)) {
488                 dev_err(queue->ctrl->ctrl.device,
489                         "queue %d tag %#x unexpected data\n",
490                         nvme_tcp_queue_id(queue), rq->tag);
491                 return -EIO;
492         }
493
494         queue->data_remaining = le32_to_cpu(pdu->data_length);
495
496         if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
497             unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
498                 dev_err(queue->ctrl->ctrl.device,
499                         "queue %d tag %#x SUCCESS set but not last PDU\n",
500                         nvme_tcp_queue_id(queue), rq->tag);
501                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
502                 return -EPROTO;
503         }
504
505         return 0;
506 }
507
508 static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
509                 struct nvme_tcp_rsp_pdu *pdu)
510 {
511         struct nvme_completion *cqe = &pdu->cqe;
512         int ret = 0;
513
514         /*
515          * AEN requests are special as they don't time out and can
516          * survive any kind of queue freeze and often don't respond to
517          * aborts.  We don't even bother to allocate a struct request
518          * for them but rather special case them here.
519          */
520         if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
521                                      cqe->command_id)))
522                 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
523                                 &cqe->result);
524         else
525                 ret = nvme_tcp_process_nvme_cqe(queue, cqe);
526
527         return ret;
528 }
529
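/*
 * Validate the controller's R2T against what has already been sent for
 * this request, then build the H2C data PDU header that will carry the
 * requested range (plen covers the header, optional header digest, the
 * data and an optional data digest).
 */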
530 static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
531                 struct nvme_tcp_r2t_pdu *pdu)
532 {
533         struct nvme_tcp_data_pdu *data = req->pdu;
534         struct nvme_tcp_queue *queue = req->queue;
535         struct request *rq = blk_mq_rq_from_pdu(req);
536         u8 hdgst = nvme_tcp_hdgst_len(queue);
537         u8 ddgst = nvme_tcp_ddgst_len(queue);
538
539         req->pdu_len = le32_to_cpu(pdu->r2t_length);
540         req->pdu_sent = 0;
541
542         if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
543                 dev_err(queue->ctrl->ctrl.device,
544                         "req %d r2t len %u exceeded data len %u (%zu sent)\n",
545                         rq->tag, req->pdu_len, req->data_len,
546                         req->data_sent);
547                 return -EPROTO;
548         }
549
550         if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
551                 dev_err(queue->ctrl->ctrl.device,
552                         "req %d unexpected r2t offset %u (expected %zu)\n",
553                         rq->tag, le32_to_cpu(pdu->r2t_offset),
554                         req->data_sent);
555                 return -EPROTO;
556         }
557
558         memset(data, 0, sizeof(*data));
559         data->hdr.type = nvme_tcp_h2c_data;
560         data->hdr.flags = NVME_TCP_F_DATA_LAST;
561         if (queue->hdr_digest)
562                 data->hdr.flags |= NVME_TCP_F_HDGST;
563         if (queue->data_digest)
564                 data->hdr.flags |= NVME_TCP_F_DDGST;
565         data->hdr.hlen = sizeof(*data);
566         data->hdr.pdo = data->hdr.hlen + hdgst;
567         data->hdr.plen =
568                 cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
569         data->ttag = pdu->ttag;
570         data->command_id = rq->tag;
571         data->data_offset = cpu_to_le32(req->data_sent);
572         data->data_length = cpu_to_le32(req->pdu_len);
573         return 0;
574 }
575
576 static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
577                 struct nvme_tcp_r2t_pdu *pdu)
578 {
579         struct nvme_tcp_request *req;
580         struct request *rq;
581         int ret;
582
583         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
584         if (!rq) {
585                 dev_err(queue->ctrl->ctrl.device,
586                         "queue %d tag %#x not found\n",
587                         nvme_tcp_queue_id(queue), pdu->command_id);
588                 return -ENOENT;
589         }
590         req = blk_mq_rq_to_pdu(rq);
591
592         ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
593         if (unlikely(ret))
594                 return ret;
595
596         req->state = NVME_TCP_SEND_H2C_PDU;
597         req->offset = 0;
598
599         nvme_tcp_queue_request(req, false);
600
601         return 0;
602 }
603
604 static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
605                 unsigned int *offset, size_t *len)
606 {
607         struct nvme_tcp_hdr *hdr;
608         char *pdu = queue->pdu;
609         size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
610         int ret;
611
612         ret = skb_copy_bits(skb, *offset,
613                 &pdu[queue->pdu_offset], rcv_len);
614         if (unlikely(ret))
615                 return ret;
616
617         queue->pdu_remaining -= rcv_len;
618         queue->pdu_offset += rcv_len;
619         *offset += rcv_len;
620         *len -= rcv_len;
621         if (queue->pdu_remaining)
622                 return 0;
623
624         hdr = queue->pdu;
625         if (queue->hdr_digest) {
626                 ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
627                 if (unlikely(ret))
628                         return ret;
629         }
630
631
632         if (queue->data_digest) {
633                 ret = nvme_tcp_check_ddgst(queue, queue->pdu);
634                 if (unlikely(ret))
635                         return ret;
636         }
637
638         switch (hdr->type) {
639         case nvme_tcp_c2h_data:
640                 return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
641         case nvme_tcp_rsp:
642                 nvme_tcp_init_recv_ctx(queue);
643                 return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
644         case nvme_tcp_r2t:
645                 nvme_tcp_init_recv_ctx(queue);
646                 return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
647         default:
648                 dev_err(queue->ctrl->ctrl.device,
649                         "unsupported pdu type (%d)\n", hdr->type);
650                 return -EINVAL;
651         }
652 }
653
654 static inline void nvme_tcp_end_request(struct request *rq, u16 status)
655 {
656         union nvme_result res = {};
657
658         if (!nvme_end_request(rq, cpu_to_le16(status << 1), res))
659                 nvme_complete_rq(rq);
660 }
661
662 static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
663                               unsigned int *offset, size_t *len)
664 {
665         struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
666         struct nvme_tcp_request *req;
667         struct request *rq;
668
669         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
670         if (!rq) {
671                 dev_err(queue->ctrl->ctrl.device,
672                         "queue %d tag %#x not found\n",
673                         nvme_tcp_queue_id(queue), pdu->command_id);
674                 return -ENOENT;
675         }
676         req = blk_mq_rq_to_pdu(rq);
677
678         while (true) {
679                 int recv_len, ret;
680
681                 recv_len = min_t(size_t, *len, queue->data_remaining);
682                 if (!recv_len)
683                         break;
684
685                 if (!iov_iter_count(&req->iter)) {
686                         req->curr_bio = req->curr_bio->bi_next;
687
688                         /*
689                          * If we don't have any bios it means that the controller
690                          * sent more data than we requested, hence error
691                          */
692                         if (!req->curr_bio) {
693                                 dev_err(queue->ctrl->ctrl.device,
694                                         "queue %d no space in request %#x",
695                                         nvme_tcp_queue_id(queue), rq->tag);
696                                 nvme_tcp_init_recv_ctx(queue);
697                                 return -EIO;
698                         }
699                         nvme_tcp_init_iter(req, READ);
700                 }
701
702                 /* we can read only from what is left in this bio */
703                 recv_len = min_t(size_t, recv_len,
704                                 iov_iter_count(&req->iter));
705
706                 if (queue->data_digest)
707                         ret = skb_copy_and_hash_datagram_iter(skb, *offset,
708                                 &req->iter, recv_len, queue->rcv_hash);
709                 else
710                         ret = skb_copy_datagram_iter(skb, *offset,
711                                         &req->iter, recv_len);
712                 if (ret) {
713                         dev_err(queue->ctrl->ctrl.device,
714                                 "queue %d failed to copy request %#x data",
715                                 nvme_tcp_queue_id(queue), rq->tag);
716                         return ret;
717                 }
718
719                 *len -= recv_len;
720                 *offset += recv_len;
721                 queue->data_remaining -= recv_len;
722         }
723
724         if (!queue->data_remaining) {
725                 if (queue->data_digest) {
726                         nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
727                         queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
728                 } else {
729                         if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
730                                 nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
731                                 queue->nr_cqe++;
732                         }
733                         nvme_tcp_init_recv_ctx(queue);
734                 }
735         }
736
737         return 0;
738 }
739
740 static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
741                 struct sk_buff *skb, unsigned int *offset, size_t *len)
742 {
743         struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
744         char *ddgst = (char *)&queue->recv_ddgst;
745         size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
746         off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
747         int ret;
748
749         ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
750         if (unlikely(ret))
751                 return ret;
752
753         queue->ddgst_remaining -= recv_len;
754         *offset += recv_len;
755         *len -= recv_len;
756         if (queue->ddgst_remaining)
757                 return 0;
758
759         if (queue->recv_ddgst != queue->exp_ddgst) {
760                 dev_err(queue->ctrl->ctrl.device,
761                         "data digest error: recv %#x expected %#x\n",
762                         le32_to_cpu(queue->recv_ddgst),
763                         le32_to_cpu(queue->exp_ddgst));
764                 return -EIO;
765         }
766
767         if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
768                 struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue),
769                                                 pdu->command_id);
770
771                 nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
772                 queue->nr_cqe++;
773         }
774
775         nvme_tcp_init_recv_ctx(queue);
776         return 0;
777 }
778
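/*
 * read_sock() actor, called from nvme_tcp_try_recv() with the socket
 * locked: consumes the skb according to the current receive state
 * (PDU header, data, data digest) and returns the number of bytes
 * consumed, or a negative error after triggering error recovery.
 */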
779 static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
780                              unsigned int offset, size_t len)
781 {
782         struct nvme_tcp_queue *queue = desc->arg.data;
783         size_t consumed = len;
784         int result;
785
786         while (len) {
787                 switch (nvme_tcp_recv_state(queue)) {
788                 case NVME_TCP_RECV_PDU:
789                         result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
790                         break;
791                 case NVME_TCP_RECV_DATA:
792                         result = nvme_tcp_recv_data(queue, skb, &offset, &len);
793                         break;
794                 case NVME_TCP_RECV_DDGST:
795                         result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
796                         break;
797                 default:
798                         result = -EFAULT;
799                 }
800                 if (result) {
801                         dev_err(queue->ctrl->ctrl.device,
802                                 "receive failed: %d\n", result);
803                         queue->rd_enabled = false;
804                         nvme_tcp_error_recovery(&queue->ctrl->ctrl);
805                         return result;
806                 }
807         }
808
809         return consumed;
810 }
811
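/*
 * Socket callbacks installed in nvme_tcp_alloc_queue(). They run under
 * sk_callback_lock and kick io_work on the queue's pinned CPU (state
 * changes may also trigger error recovery); the actual sending and
 * receiving happens in nvme_tcp_io_work().
 */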
812 static void nvme_tcp_data_ready(struct sock *sk)
813 {
814         struct nvme_tcp_queue *queue;
815
816         read_lock_bh(&sk->sk_callback_lock);
817         queue = sk->sk_user_data;
818         if (likely(queue && queue->rd_enabled) &&
819             !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
820                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
821         read_unlock_bh(&sk->sk_callback_lock);
822 }
823
824 static void nvme_tcp_write_space(struct sock *sk)
825 {
826         struct nvme_tcp_queue *queue;
827
828         read_lock_bh(&sk->sk_callback_lock);
829         queue = sk->sk_user_data;
830         if (likely(queue && sk_stream_is_writeable(sk))) {
831                 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
832                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
833         }
834         read_unlock_bh(&sk->sk_callback_lock);
835 }
836
837 static void nvme_tcp_state_change(struct sock *sk)
838 {
839         struct nvme_tcp_queue *queue;
840
841         read_lock(&sk->sk_callback_lock);
842         queue = sk->sk_user_data;
843         if (!queue)
844                 goto done;
845
846         switch (sk->sk_state) {
847         case TCP_CLOSE:
848         case TCP_CLOSE_WAIT:
849         case TCP_LAST_ACK:
850         case TCP_FIN_WAIT1:
851         case TCP_FIN_WAIT2:
852                 /* all of the above states trigger error recovery */
853                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
854                 break;
855         default:
856                 dev_info(queue->ctrl->ctrl.device,
857                         "queue %d socket state %d\n",
858                         nvme_tcp_queue_id(queue), sk->sk_state);
859         }
860
861         queue->state_change(sk);
862 done:
863         read_unlock(&sk->sk_callback_lock);
864 }
865
866 static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
867 {
868         queue->request = NULL;
869 }
870
871 static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
872 {
873         nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
874 }
875
876 static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
877 {
878         struct nvme_tcp_queue *queue = req->queue;
879
880         while (true) {
881                 struct page *page = nvme_tcp_req_cur_page(req);
882                 size_t offset = nvme_tcp_req_cur_offset(req);
883                 size_t len = nvme_tcp_req_cur_length(req);
884                 bool last = nvme_tcp_pdu_last_send(req, len);
885                 int ret, flags = MSG_DONTWAIT;
886
887                 if (last && !queue->data_digest)
888                         flags |= MSG_EOR;
889                 else
890                         flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
891
892                 /* can't zcopy slab pages */
893                 if (unlikely(PageSlab(page))) {
894                         ret = sock_no_sendpage(queue->sock, page, offset, len,
895                                         flags);
896                 } else {
897                         ret = kernel_sendpage(queue->sock, page, offset, len,
898                                         flags);
899                 }
900                 if (ret <= 0)
901                         return ret;
902
903                 nvme_tcp_advance_req(req, ret);
904                 if (queue->data_digest)
905                         nvme_tcp_ddgst_update(queue->snd_hash, page,
906                                         offset, ret);
907
908                 /* fully successful last write */
909                 if (last && ret == len) {
910                         if (queue->data_digest) {
911                                 nvme_tcp_ddgst_final(queue->snd_hash,
912                                         &req->ddgst);
913                                 req->state = NVME_TCP_SEND_DDGST;
914                                 req->offset = 0;
915                         } else {
916                                 nvme_tcp_done_send_req(queue);
917                         }
918                         return 1;
919                 }
920         }
921         return -EAGAIN;
922 }
923
924 static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
925 {
926         struct nvme_tcp_queue *queue = req->queue;
927         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
928         bool inline_data = nvme_tcp_has_inline_data(req);
929         u8 hdgst = nvme_tcp_hdgst_len(queue);
930         int len = sizeof(*pdu) + hdgst - req->offset;
931         int flags = MSG_DONTWAIT;
932         int ret;
933
934         if (inline_data)
935                 flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
936         else
937                 flags |= MSG_EOR;
938
939         if (queue->hdr_digest && !req->offset)
940                 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
941
942         ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
943                         offset_in_page(pdu) + req->offset, len,  flags);
944         if (unlikely(ret <= 0))
945                 return ret;
946
947         len -= ret;
948         if (!len) {
949                 if (inline_data) {
950                         req->state = NVME_TCP_SEND_DATA;
951                         if (queue->data_digest)
952                                 crypto_ahash_init(queue->snd_hash);
953                         nvme_tcp_init_iter(req, WRITE);
954                 } else {
955                         nvme_tcp_done_send_req(queue);
956                 }
957                 return 1;
958         }
959         req->offset += ret;
960
961         return -EAGAIN;
962 }
963
964 static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
965 {
966         struct nvme_tcp_queue *queue = req->queue;
967         struct nvme_tcp_data_pdu *pdu = req->pdu;
968         u8 hdgst = nvme_tcp_hdgst_len(queue);
969         int len = sizeof(*pdu) - req->offset + hdgst;
970         int ret;
971
972         if (queue->hdr_digest && !req->offset)
973                 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
974
975         ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
976                         offset_in_page(pdu) + req->offset, len,
977                         MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
978         if (unlikely(ret <= 0))
979                 return ret;
980
981         len -= ret;
982         if (!len) {
983                 req->state = NVME_TCP_SEND_DATA;
984                 if (queue->data_digest)
985                         crypto_ahash_init(queue->snd_hash);
986                 if (!req->data_sent)
987                         nvme_tcp_init_iter(req, WRITE);
988                 return 1;
989         }
990         req->offset += ret;
991
992         return -EAGAIN;
993 }
994
995 static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
996 {
997         struct nvme_tcp_queue *queue = req->queue;
998         int ret;
999         struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
1000         struct kvec iov = {
1001                 .iov_base = (u8 *)&req->ddgst + req->offset,
1002                 .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
1003         };
1004
1005         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1006         if (unlikely(ret <= 0))
1007                 return ret;
1008
1009         if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
1010                 nvme_tcp_done_send_req(queue);
1011                 return 1;
1012         }
1013
1014         req->offset += ret;
1015         return -EAGAIN;
1016 }
1017
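/*
 * Send-side state machine: a request moves through CMD_PDU, optionally
 * H2C_PDU (after an R2T), then DATA and DDGST. Returns 1 when progress
 * was made, 0 when there is nothing to send or the socket would block,
 * and a negative value on a send error.
 */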
1018 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
1019 {
1020         struct nvme_tcp_request *req;
1021         int ret = 1;
1022
1023         if (!queue->request) {
1024                 queue->request = nvme_tcp_fetch_request(queue);
1025                 if (!queue->request)
1026                         return 0;
1027         }
1028         req = queue->request;
1029
1030         if (req->state == NVME_TCP_SEND_CMD_PDU) {
1031                 ret = nvme_tcp_try_send_cmd_pdu(req);
1032                 if (ret <= 0)
1033                         goto done;
1034                 if (!nvme_tcp_has_inline_data(req))
1035                         return ret;
1036         }
1037
1038         if (req->state == NVME_TCP_SEND_H2C_PDU) {
1039                 ret = nvme_tcp_try_send_data_pdu(req);
1040                 if (ret <= 0)
1041                         goto done;
1042         }
1043
1044         if (req->state == NVME_TCP_SEND_DATA) {
1045                 ret = nvme_tcp_try_send_data(req);
1046                 if (ret <= 0)
1047                         goto done;
1048         }
1049
1050         if (req->state == NVME_TCP_SEND_DDGST)
1051                 ret = nvme_tcp_try_send_ddgst(req);
1052 done:
1053         if (ret == -EAGAIN) {
1054                 ret = 0;
1055         } else if (ret < 0) {
1056                 dev_err(queue->ctrl->ctrl.device,
1057                         "failed to send request, error %d\n", ret);
1058                 if (ret != -EPIPE && ret != -ECONNRESET)
1059                         nvme_tcp_fail_request(queue->request);
1060                 nvme_tcp_done_send_req(queue);
1061         }
1062         return ret;
1063 }
1064
1065 static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
1066 {
1067         struct socket *sock = queue->sock;
1068         struct sock *sk = sock->sk;
1069         read_descriptor_t rd_desc;
1070         int consumed;
1071
1072         rd_desc.arg.data = queue;
1073         rd_desc.count = 1;
1074         lock_sock(sk);
1075         queue->nr_cqe = 0;
1076         consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1077         release_sock(sk);
1078         return consumed;
1079 }
1080
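/*
 * Per-queue I/O worker: alternates between sending queued requests and
 * receiving for roughly one millisecond, then requeues itself if either
 * direction still had work pending when the budget expired.
 */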
1081 static void nvme_tcp_io_work(struct work_struct *w)
1082 {
1083         struct nvme_tcp_queue *queue =
1084                 container_of(w, struct nvme_tcp_queue, io_work);
1085         unsigned long deadline = jiffies + msecs_to_jiffies(1);
1086
1087         do {
1088                 bool pending = false;
1089                 int result;
1090
1091                 if (mutex_trylock(&queue->send_mutex)) {
1092                         result = nvme_tcp_try_send(queue);
1093                         mutex_unlock(&queue->send_mutex);
1094                         if (result > 0)
1095                                 pending = true;
1096                         else if (unlikely(result < 0))
1097                                 break;
1098                 }
1099
1100                 result = nvme_tcp_try_recv(queue);
1101                 if (result > 0)
1102                         pending = true;
1103                 else if (unlikely(result < 0))
1104                         return;
1105
1106                 if (!pending)
1107                         return;
1108
1109         } while (!time_after(jiffies, deadline)); /* quota is exhausted */
1110
1111         queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
1112 }
1113
1114 static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1115 {
1116         struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1117
1118         ahash_request_free(queue->rcv_hash);
1119         ahash_request_free(queue->snd_hash);
1120         crypto_free_ahash(tfm);
1121 }
1122
1123 static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1124 {
1125         struct crypto_ahash *tfm;
1126
1127         tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1128         if (IS_ERR(tfm))
1129                 return PTR_ERR(tfm);
1130
1131         queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1132         if (!queue->snd_hash)
1133                 goto free_tfm;
1134         ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1135
1136         queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1137         if (!queue->rcv_hash)
1138                 goto free_snd_hash;
1139         ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1140
1141         return 0;
1142 free_snd_hash:
1143         ahash_request_free(queue->snd_hash);
1144 free_tfm:
1145         crypto_free_ahash(tfm);
1146         return -ENOMEM;
1147 }
1148
1149 static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1150 {
1151         struct nvme_tcp_request *async = &ctrl->async_req;
1152
1153         page_frag_free(async->pdu);
1154 }
1155
1156 static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1157 {
1158         struct nvme_tcp_queue *queue = &ctrl->queues[0];
1159         struct nvme_tcp_request *async = &ctrl->async_req;
1160         u8 hdgst = nvme_tcp_hdgst_len(queue);
1161
1162         async->pdu = page_frag_alloc(&queue->pf_cache,
1163                 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1164                 GFP_KERNEL | __GFP_ZERO);
1165         if (!async->pdu)
1166                 return -ENOMEM;
1167
1168         async->queue = &ctrl->queues[0];
1169         return 0;
1170 }
1171
1172 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1173 {
1174         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1175         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1176
1177         if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1178                 return;
1179
1180         if (queue->hdr_digest || queue->data_digest)
1181                 nvme_tcp_free_crypto(queue);
1182
1183         sock_release(queue->sock);
1184         kfree(queue->pdu);
1185 }
1186
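/*
 * NVMe/TCP connection initialization: send an ICReq PDU and validate the
 * controller's ICResp (PDU type and length, PFV, digest negotiation and
 * CPDA) before the queue is used for command capsules.
 */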
1187 static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1188 {
1189         struct nvme_tcp_icreq_pdu *icreq;
1190         struct nvme_tcp_icresp_pdu *icresp;
1191         struct msghdr msg = {};
1192         struct kvec iov;
1193         bool ctrl_hdgst, ctrl_ddgst;
1194         int ret;
1195
1196         icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1197         if (!icreq)
1198                 return -ENOMEM;
1199
1200         icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
1201         if (!icresp) {
1202                 ret = -ENOMEM;
1203                 goto free_icreq;
1204         }
1205
1206         icreq->hdr.type = nvme_tcp_icreq;
1207         icreq->hdr.hlen = sizeof(*icreq);
1208         icreq->hdr.pdo = 0;
1209         icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1210         icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1211         icreq->maxr2t = 0; /* single inflight r2t supported */
1212         icreq->hpda = 0; /* no alignment constraint */
1213         if (queue->hdr_digest)
1214                 icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1215         if (queue->data_digest)
1216                 icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1217
1218         iov.iov_base = icreq;
1219         iov.iov_len = sizeof(*icreq);
1220         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1221         if (ret < 0)
1222                 goto free_icresp;
1223
1224         memset(&msg, 0, sizeof(msg));
1225         iov.iov_base = icresp;
1226         iov.iov_len = sizeof(*icresp);
1227         ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1228                         iov.iov_len, msg.msg_flags);
1229         if (ret < 0)
1230                 goto free_icresp;
1231
1232         ret = -EINVAL;
1233         if (icresp->hdr.type != nvme_tcp_icresp) {
1234                 pr_err("queue %d: bad type returned %d\n",
1235                         nvme_tcp_queue_id(queue), icresp->hdr.type);
1236                 goto free_icresp;
1237         }
1238
1239         if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1240                 pr_err("queue %d: bad pdu length returned %d\n",
1241                         nvme_tcp_queue_id(queue), icresp->hdr.plen);
1242                 goto free_icresp;
1243         }
1244
1245         if (icresp->pfv != NVME_TCP_PFV_1_0) {
1246                 pr_err("queue %d: bad pfv returned %d\n",
1247                         nvme_tcp_queue_id(queue), icresp->pfv);
1248                 goto free_icresp;
1249         }
1250
1251         ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1252         if ((queue->data_digest && !ctrl_ddgst) ||
1253             (!queue->data_digest && ctrl_ddgst)) {
1254                 pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1255                         nvme_tcp_queue_id(queue),
1256                         queue->data_digest ? "enabled" : "disabled",
1257                         ctrl_ddgst ? "enabled" : "disabled");
1258                 goto free_icresp;
1259         }
1260
1261         ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1262         if ((queue->hdr_digest && !ctrl_hdgst) ||
1263             (!queue->hdr_digest && ctrl_hdgst)) {
1264                 pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1265                         nvme_tcp_queue_id(queue),
1266                         queue->hdr_digest ? "enabled" : "disabled",
1267                         ctrl_hdgst ? "enabled" : "disabled");
1268                 goto free_icresp;
1269         }
1270
1271         if (icresp->cpda != 0) {
1272                 pr_err("queue %d: unsupported cpda returned %d\n",
1273                         nvme_tcp_queue_id(queue), icresp->cpda);
1274                 goto free_icresp;
1275         }
1276
1277         ret = 0;
1278 free_icresp:
1279         kfree(icresp);
1280 free_icreq:
1281         kfree(icreq);
1282         return ret;
1283 }
1284
1285 static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
1286 {
1287         return nvme_tcp_queue_id(queue) == 0;
1288 }
1289
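/*
 * I/O queue ids are laid out contiguously by type: default queues first,
 * then read queues, then poll queues. The helpers below classify a queue
 * by where its qid falls in that layout (qid 0 is the admin queue).
 */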
1290 static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
1291 {
1292         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1293         int qid = nvme_tcp_queue_id(queue);
1294
1295         return !nvme_tcp_admin_queue(queue) &&
1296                 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
1297 }
1298
1299 static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
1300 {
1301         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1302         int qid = nvme_tcp_queue_id(queue);
1303
1304         return !nvme_tcp_admin_queue(queue) &&
1305                 !nvme_tcp_default_queue(queue) &&
1306                 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1307                           ctrl->io_queues[HCTX_TYPE_READ];
1308 }
1309
1310 static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
1311 {
1312         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1313         int qid = nvme_tcp_queue_id(queue);
1314
1315         return !nvme_tcp_admin_queue(queue) &&
1316                 !nvme_tcp_default_queue(queue) &&
1317                 !nvme_tcp_read_queue(queue) &&
1318                 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1319                           ctrl->io_queues[HCTX_TYPE_READ] +
1320                           ctrl->io_queues[HCTX_TYPE_POLL];
1321 }
1322
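/*
 * Pin each queue's io_work to a CPU: the n-th queue of a given type
 * (default/read/poll) is bound to the n-th online CPU, wrapping around
 * when there are more queues than online CPUs.
 */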
1323 static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
1324 {
1325         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1326         int qid = nvme_tcp_queue_id(queue);
1327         int n = 0;
1328
1329         if (nvme_tcp_default_queue(queue))
1330                 n = qid - 1;
1331         else if (nvme_tcp_read_queue(queue))
1332                 n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
1333         else if (nvme_tcp_poll_queue(queue))
1334                 n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
1335                                 ctrl->io_queues[HCTX_TYPE_READ] - 1;
1336         queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
1337 }
1338
1339 static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
1340                 int qid, size_t queue_size)
1341 {
1342         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1343         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1344         int ret, rcv_pdu_size;
1345
1346         queue->ctrl = ctrl;
1347         INIT_LIST_HEAD(&queue->send_list);
1348         spin_lock_init(&queue->lock);
1349         mutex_init(&queue->send_mutex);
1350         INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1351         queue->queue_size = queue_size;
1352
1353         if (qid > 0)
1354                 queue->cmnd_capsule_len = nctrl->ioccsz * 16;
1355         else
1356                 queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1357                                                 NVME_TCP_ADMIN_CCSZ;
1358
1359         ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1360                         IPPROTO_TCP, &queue->sock);
1361         if (ret) {
1362                 dev_err(nctrl->device,
1363                         "failed to create socket: %d\n", ret);
1364                 return ret;
1365         }
1366
1367         /* Single syn retry */
1368         tcp_sock_set_syncnt(queue->sock->sk, 1);
1369
1370         /* Set TCP no delay */
1371         tcp_sock_set_nodelay(queue->sock->sk);
1372
1373         /*
1374          * Clean up whatever is sitting in the TCP transmit queue on socket
1375          * close. This is done to prevent stale data from being sent should
1376          * the network connection be restored before TCP times out.
1377          */
1378         sock_no_linger(queue->sock->sk);
1379
1380         if (so_priority > 0)
1381                 sock_set_priority(queue->sock->sk, so_priority);
1382
1383         /* Set socket type of service */
1384         if (nctrl->opts->tos >= 0)
1385                 ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
1386
1387         queue->sock->sk->sk_allocation = GFP_ATOMIC;
1388         nvme_tcp_set_queue_io_cpu(queue);
1389         queue->request = NULL;
1390         queue->data_remaining = 0;
1391         queue->ddgst_remaining = 0;
1392         queue->pdu_remaining = 0;
1393         queue->pdu_offset = 0;
1394         sk_set_memalloc(queue->sock->sk);
1395
1396         if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
1397                 ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1398                         sizeof(ctrl->src_addr));
1399                 if (ret) {
1400                         dev_err(nctrl->device,
1401                                 "failed to bind queue %d socket %d\n",
1402                                 qid, ret);
1403                         goto err_sock;
1404                 }
1405         }
1406
1407         queue->hdr_digest = nctrl->opts->hdr_digest;
1408         queue->data_digest = nctrl->opts->data_digest;
1409         if (queue->hdr_digest || queue->data_digest) {
1410                 ret = nvme_tcp_alloc_crypto(queue);
1411                 if (ret) {
1412                         dev_err(nctrl->device,
1413                                 "failed to allocate queue %d crypto\n", qid);
1414                         goto err_sock;
1415                 }
1416         }
1417
1418         rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1419                         nvme_tcp_hdgst_len(queue);
1420         queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1421         if (!queue->pdu) {
1422                 ret = -ENOMEM;
1423                 goto err_crypto;
1424         }
1425
1426         dev_dbg(nctrl->device, "connecting queue %d\n",
1427                         nvme_tcp_queue_id(queue));
1428
1429         ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1430                 sizeof(ctrl->addr), 0);
1431         if (ret) {
1432                 dev_err(nctrl->device,
1433                         "failed to connect socket: %d\n", ret);
1434                 goto err_rcv_pdu;
1435         }
1436
1437         ret = nvme_tcp_init_connection(queue);
1438         if (ret)
1439                 goto err_init_connect;
1440
1441         queue->rd_enabled = true;
1442         set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
1443         nvme_tcp_init_recv_ctx(queue);
1444
1445         write_lock_bh(&queue->sock->sk->sk_callback_lock);
1446         queue->sock->sk->sk_user_data = queue;
1447         queue->state_change = queue->sock->sk->sk_state_change;
1448         queue->data_ready = queue->sock->sk->sk_data_ready;
1449         queue->write_space = queue->sock->sk->sk_write_space;
1450         queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1451         queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1452         queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1453 #ifdef CONFIG_NET_RX_BUSY_POLL
1454         queue->sock->sk->sk_ll_usec = 1;
1455 #endif
1456         write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1457
1458         return 0;
1459
1460 err_init_connect:
1461         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1462 err_rcv_pdu:
1463         kfree(queue->pdu);
1464 err_crypto:
1465         if (queue->hdr_digest || queue->data_digest)
1466                 nvme_tcp_free_crypto(queue);
1467 err_sock:
1468         sock_release(queue->sock);
1469         queue->sock = NULL;
1470         return ret;
1471 }
1472
1473 static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
1474 {
1475         struct socket *sock = queue->sock;
1476
1477         write_lock_bh(&sock->sk->sk_callback_lock);
1478         sock->sk->sk_user_data  = NULL;
1479         sock->sk->sk_data_ready = queue->data_ready;
1480         sock->sk->sk_state_change = queue->state_change;
1481         sock->sk->sk_write_space  = queue->write_space;
1482         write_unlock_bh(&sock->sk->sk_callback_lock);
1483 }
1484
1485 static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1486 {
1487         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1488         nvme_tcp_restore_sock_calls(queue);
1489         cancel_work_sync(&queue->io_work);
1490 }
1491
1492 static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1493 {
1494         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1495         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1496
1497         if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1498                 return;
1499
1500         __nvme_tcp_stop_queue(queue);
1501 }
1502
1503 static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1504 {
1505         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1506         int ret;
1507
1508         if (idx)
1509                 ret = nvmf_connect_io_queue(nctrl, idx, false);
1510         else
1511                 ret = nvmf_connect_admin_queue(nctrl);
1512
1513         if (!ret) {
1514                 set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
1515         } else {
1516                 if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags))
1517                         __nvme_tcp_stop_queue(&ctrl->queues[idx]);
1518                 dev_err(nctrl->device,
1519                         "failed to connect queue: %d ret=%d\n", idx, ret);
1520         }
1521         return ret;
1522 }
1523
1524 static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
1525                 bool admin)
1526 {
1527         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1528         struct blk_mq_tag_set *set;
1529         int ret;
1530
1531         if (admin) {
1532                 set = &ctrl->admin_tag_set;
1533                 memset(set, 0, sizeof(*set));
1534                 set->ops = &nvme_tcp_admin_mq_ops;
1535                 set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1536                 set->reserved_tags = 2; /* connect + keep-alive */
1537                 set->numa_node = nctrl->numa_node;
1538                 set->flags = BLK_MQ_F_BLOCKING;
1539                 set->cmd_size = sizeof(struct nvme_tcp_request);
1540                 set->driver_data = ctrl;
1541                 set->nr_hw_queues = 1;
1542                 set->timeout = ADMIN_TIMEOUT;
1543         } else {
1544                 set = &ctrl->tag_set;
1545                 memset(set, 0, sizeof(*set));
1546                 set->ops = &nvme_tcp_mq_ops;
1547                 set->queue_depth = nctrl->sqsize + 1;
1548                 set->reserved_tags = 1; /* fabric connect */
1549                 set->numa_node = nctrl->numa_node;
1550                 set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
1551                 set->cmd_size = sizeof(struct nvme_tcp_request);
1552                 set->driver_data = ctrl;
1553                 set->nr_hw_queues = nctrl->queue_count - 1;
1554                 set->timeout = NVME_IO_TIMEOUT;
1555                 set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
1556         }
1557
1558         ret = blk_mq_alloc_tag_set(set);
1559         if (ret)
1560                 return ERR_PTR(ret);
1561
1562         return set;
1563 }
1564
1565 static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1566 {
1567         if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1568                 nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1569                 to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1570         }
1571
1572         nvme_tcp_free_queue(ctrl, 0);
1573 }
1574
1575 static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1576 {
1577         int i;
1578
1579         for (i = 1; i < ctrl->queue_count; i++)
1580                 nvme_tcp_free_queue(ctrl, i);
1581 }
1582
1583 static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1584 {
1585         int i;
1586
1587         for (i = 1; i < ctrl->queue_count; i++)
1588                 nvme_tcp_stop_queue(ctrl, i);
1589 }
1590
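/* Connect all I/O queues, unwinding already started queues on failure. */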
1591 static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
1592 {
1593         int i, ret = 0;
1594
1595         for (i = 1; i < ctrl->queue_count; i++) {
1596                 ret = nvme_tcp_start_queue(ctrl, i);
1597                 if (ret)
1598                         goto out_stop_queues;
1599         }
1600
1601         return 0;
1602
1603 out_stop_queues:
1604         for (i--; i >= 1; i--)
1605                 nvme_tcp_stop_queue(ctrl, i);
1606         return ret;
1607 }
1608
1609 static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1610 {
1611         int ret;
1612
1613         ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
1614         if (ret)
1615                 return ret;
1616
1617         ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1618         if (ret)
1619                 goto out_free_queue;
1620
1621         return 0;
1622
1623 out_free_queue:
1624         nvme_tcp_free_queue(ctrl, 0);
1625         return ret;
1626 }
1627
1628 static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1629 {
1630         int i, ret;
1631
1632         for (i = 1; i < ctrl->queue_count; i++) {
1633                 ret = nvme_tcp_alloc_queue(ctrl, i,
1634                                 ctrl->sqsize + 1);
1635                 if (ret)
1636                         goto out_free_queues;
1637         }
1638
1639         return 0;
1640
1641 out_free_queues:
1642         for (i--; i >= 1; i--)
1643                 nvme_tcp_free_queue(ctrl, i);
1644
1645         return ret;
1646 }
1647
1648 static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
1649 {
1650         unsigned int nr_io_queues;
1651
1652         nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
1653         nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
1654         nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
1655
1656         return nr_io_queues;
1657 }
1658
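/*
 * Distribute the granted I/O queues across the HCTX_TYPE_DEFAULT,
 * HCTX_TYPE_READ and HCTX_TYPE_POLL groups according to the
 * nr_io_queues/nr_write_queues/nr_poll_queues options.
 */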
1659 static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
1660                 unsigned int nr_io_queues)
1661 {
1662         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1663         struct nvmf_ctrl_options *opts = nctrl->opts;
1664
1665         if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
1666                 /*
1667                  * separate read/write queues
1668                  * hand out dedicated default queues only after we have
1669                  * sufficient read queues.
1670                  */
1671                 ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
1672                 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
1673                 ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1674                         min(opts->nr_write_queues, nr_io_queues);
1675                 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1676         } else {
1677                 /*
1678                  * shared read/write queues
1679                  * either no write queues were requested, or we don't have
1680                  * sufficient queue count to have dedicated default queues.
1681                  */
1682                 ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1683                         min(opts->nr_io_queues, nr_io_queues);
1684                 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1685         }
1686
1687         if (opts->nr_poll_queues && nr_io_queues) {
1688                 /* map dedicated poll queues only if we have queues left */
1689                 ctrl->io_queues[HCTX_TYPE_POLL] =
1690                         min(opts->nr_poll_queues, nr_io_queues);
1691         }
1692 }
1693
1694 static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1695 {
1696         unsigned int nr_io_queues;
1697         int ret;
1698
1699         nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
1700         ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1701         if (ret)
1702                 return ret;
1703
1704         ctrl->queue_count = nr_io_queues + 1;
1705         if (ctrl->queue_count < 2)
1706                 return 0;
1707
1708         dev_info(ctrl->device,
1709                 "creating %d I/O queues.\n", nr_io_queues);
1710
1711         nvme_tcp_set_io_queues(ctrl, nr_io_queues);
1712
1713         return __nvme_tcp_alloc_io_queues(ctrl);
1714 }
1715
1716 static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
1717 {
1718         nvme_tcp_stop_io_queues(ctrl);
1719         if (remove) {
1720                 blk_cleanup_queue(ctrl->connect_q);
1721                 blk_mq_free_tag_set(ctrl->tagset);
1722         }
1723         nvme_tcp_free_io_queues(ctrl);
1724 }
1725
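/*
 * Bring up the I/O queues.  For a new controller this also allocates the
 * I/O tag set and the connect_q request queue; on reconnect the existing
 * tag set is reused and only nr_hw_queues is updated.
 */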
1726 static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
1727 {
1728         int ret;
1729
1730         ret = nvme_tcp_alloc_io_queues(ctrl);
1731         if (ret)
1732                 return ret;
1733
1734         if (new) {
1735                 ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
1736                 if (IS_ERR(ctrl->tagset)) {
1737                         ret = PTR_ERR(ctrl->tagset);
1738                         goto out_free_io_queues;
1739                 }
1740
1741                 ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
1742                 if (IS_ERR(ctrl->connect_q)) {
1743                         ret = PTR_ERR(ctrl->connect_q);
1744                         goto out_free_tag_set;
1745                 }
1746         } else {
1747                 blk_mq_update_nr_hw_queues(ctrl->tagset,
1748                         ctrl->queue_count - 1);
1749         }
1750
1751         ret = nvme_tcp_start_io_queues(ctrl);
1752         if (ret)
1753                 goto out_cleanup_connect_q;
1754
1755         return 0;
1756
1757 out_cleanup_connect_q:
1758         if (new)
1759                 blk_cleanup_queue(ctrl->connect_q);
1760 out_free_tag_set:
1761         if (new)
1762                 blk_mq_free_tag_set(ctrl->tagset);
1763 out_free_io_queues:
1764         nvme_tcp_free_io_queues(ctrl);
1765         return ret;
1766 }
1767
1768 static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
1769 {
1770         nvme_tcp_stop_queue(ctrl, 0);
1771         if (remove) {
1772                 blk_cleanup_queue(ctrl->admin_q);
1773                 blk_cleanup_queue(ctrl->fabrics_q);
1774                 blk_mq_free_tag_set(ctrl->admin_tagset);
1775         }
1776         nvme_tcp_free_admin_queue(ctrl);
1777 }
1778
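/*
 * Bring up the admin queue: allocate the queue (and async event request),
 * set up the admin tag set plus the fabrics_q/admin_q request queues for
 * a new controller, connect the queue, enable the controller and run the
 * identify sequence.
 */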
1779 static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
1780 {
1781         int error;
1782
1783         error = nvme_tcp_alloc_admin_queue(ctrl);
1784         if (error)
1785                 return error;
1786
1787         if (new) {
1788                 ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
1789                 if (IS_ERR(ctrl->admin_tagset)) {
1790                         error = PTR_ERR(ctrl->admin_tagset);
1791                         goto out_free_queue;
1792                 }
1793
1794                 ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
1795                 if (IS_ERR(ctrl->fabrics_q)) {
1796                         error = PTR_ERR(ctrl->fabrics_q);
1797                         goto out_free_tagset;
1798                 }
1799
1800                 ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
1801                 if (IS_ERR(ctrl->admin_q)) {
1802                         error = PTR_ERR(ctrl->admin_q);
1803                         goto out_cleanup_fabrics_q;
1804                 }
1805         }
1806
1807         error = nvme_tcp_start_queue(ctrl, 0);
1808         if (error)
1809                 goto out_cleanup_queue;
1810
1811         error = nvme_enable_ctrl(ctrl);
1812         if (error)
1813                 goto out_stop_queue;
1814
1815         blk_mq_unquiesce_queue(ctrl->admin_q);
1816
1817         error = nvme_init_identify(ctrl);
1818         if (error)
1819                 goto out_stop_queue;
1820
1821         return 0;
1822
1823 out_stop_queue:
1824         nvme_tcp_stop_queue(ctrl, 0);
1825 out_cleanup_queue:
1826         if (new)
1827                 blk_cleanup_queue(ctrl->admin_q);
1828 out_cleanup_fabrics_q:
1829         if (new)
1830                 blk_cleanup_queue(ctrl->fabrics_q);
1831 out_free_tagset:
1832         if (new)
1833                 blk_mq_free_tag_set(ctrl->admin_tagset);
1834 out_free_queue:
1835         nvme_tcp_free_admin_queue(ctrl);
1836         return error;
1837 }
1838
1839 static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
1840                 bool remove)
1841 {
1842         blk_mq_quiesce_queue(ctrl->admin_q);
1843         nvme_tcp_stop_queue(ctrl, 0);
1844         if (ctrl->admin_tagset) {
1845                 blk_mq_tagset_busy_iter(ctrl->admin_tagset,
1846                         nvme_cancel_request, ctrl);
1847                 blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
1848         }
1849         if (remove)
1850                 blk_mq_unquiesce_queue(ctrl->admin_q);
1851         nvme_tcp_destroy_admin_queue(ctrl, remove);
1852 }
1853
1854 static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
1855                 bool remove)
1856 {
1857         if (ctrl->queue_count <= 1)
1858                 return;
1859         nvme_stop_queues(ctrl);
1860         nvme_tcp_stop_io_queues(ctrl);
1861         if (ctrl->tagset) {
1862                 blk_mq_tagset_busy_iter(ctrl->tagset,
1863                         nvme_cancel_request, ctrl);
1864                 blk_mq_tagset_wait_completed_request(ctrl->tagset);
1865         }
1866         if (remove)
1867                 nvme_start_queues(ctrl);
1868         nvme_tcp_destroy_io_queues(ctrl, remove);
1869 }
1870
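/*
 * Decide what to do after losing the association: if nvmf_should_reconnect()
 * says we are still within the reconnect budget, schedule connect_work after
 * reconnect_delay seconds; otherwise remove the controller.
 */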
1871 static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
1872 {
1873         /* If we are resetting/deleting then do nothing */
1874         if (ctrl->state != NVME_CTRL_CONNECTING) {
1875                 WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
1876                         ctrl->state == NVME_CTRL_LIVE);
1877                 return;
1878         }
1879
1880         if (nvmf_should_reconnect(ctrl)) {
1881                 dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
1882                         ctrl->opts->reconnect_delay);
1883                 queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
1884                                 ctrl->opts->reconnect_delay * HZ);
1885         } else {
1886                 dev_info(ctrl->device, "Removing controller...\n");
1887                 nvme_delete_ctrl(ctrl);
1888         }
1889 }
1890
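/*
 * Establish (or re-establish) the full association: configure the admin
 * queue, validate controller capabilities (no ICDOFF support, sqsize
 * clamped to MAXCMD), configure the I/O queues and move the controller
 * to the LIVE state.
 */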
1891 static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
1892 {
1893         struct nvmf_ctrl_options *opts = ctrl->opts;
1894         int ret;
1895
1896         ret = nvme_tcp_configure_admin_queue(ctrl, new);
1897         if (ret)
1898                 return ret;
1899
1900         if (ctrl->icdoff) {
1901                 dev_err(ctrl->device, "icdoff is not supported!\n");
1902                 goto destroy_admin;
1903         }
1904
1905         if (opts->queue_size > ctrl->sqsize + 1)
1906                 dev_warn(ctrl->device,
1907                         "queue_size %zu > ctrl sqsize %u, clamping down\n",
1908                         opts->queue_size, ctrl->sqsize + 1);
1909
1910         if (ctrl->sqsize + 1 > ctrl->maxcmd) {
1911                 dev_warn(ctrl->device,
1912                         "sqsize %u > ctrl maxcmd %u, clamping down\n",
1913                         ctrl->sqsize + 1, ctrl->maxcmd);
1914                 ctrl->sqsize = ctrl->maxcmd - 1;
1915         }
1916
1917         if (ctrl->queue_count > 1) {
1918                 ret = nvme_tcp_configure_io_queues(ctrl, new);
1919                 if (ret)
1920                         goto destroy_admin;
1921         }
1922
1923         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
1924                 /*
1925                  * state change failure is OK if we're in the DELETING
1926                  * state, unless we're in the middle of creating a new
1927                  * controller, in order to avoid races with the teardown flow.
1928                  */
1929                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1930                 WARN_ON_ONCE(new);
1931                 ret = -EINVAL;
1932                 goto destroy_io;
1933         }
1934
1935         nvme_start_ctrl(ctrl);
1936         return 0;
1937
1938 destroy_io:
1939         if (ctrl->queue_count > 1)
1940                 nvme_tcp_destroy_io_queues(ctrl, new);
1941 destroy_admin:
1942         nvme_tcp_stop_queue(ctrl, 0);
1943         nvme_tcp_destroy_admin_queue(ctrl, new);
1944         return ret;
1945 }
1946
1947 static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
1948 {
1949         struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
1950                         struct nvme_tcp_ctrl, connect_work);
1951         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
1952
1953         ++ctrl->nr_reconnects;
1954
1955         if (nvme_tcp_setup_ctrl(ctrl, false))
1956                 goto requeue;
1957
1958         dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
1959                         ctrl->nr_reconnects);
1960
1961         ctrl->nr_reconnects = 0;
1962
1963         return;
1964
1965 requeue:
1966         dev_info(ctrl->device, "Failed reconnect attempt %d\n",
1967                         ctrl->nr_reconnects);
1968         nvme_tcp_reconnect_or_remove(ctrl);
1969 }
1970
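/*
 * Error recovery: stop keep-alive, tear down the I/O and admin queues while
 * fast-failing pending requests, then move to the CONNECTING state and let
 * nvme_tcp_reconnect_or_remove() decide whether to reconnect or delete the
 * controller.
 */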
1971 static void nvme_tcp_error_recovery_work(struct work_struct *work)
1972 {
1973         struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
1974                                 struct nvme_tcp_ctrl, err_work);
1975         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
1976
1977         nvme_stop_keep_alive(ctrl);
1978         nvme_tcp_teardown_io_queues(ctrl, false);
1979         /* unquiesce to fast-fail pending requests */
1980         nvme_start_queues(ctrl);
1981         nvme_tcp_teardown_admin_queue(ctrl, false);
1982         blk_mq_unquiesce_queue(ctrl->admin_q);
1983
1984         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
1985                 /* state change failure is ok if we're in DELETING state */
1986                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1987                 return;
1988         }
1989
1990         nvme_tcp_reconnect_or_remove(ctrl);
1991 }
1992
1993 static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
1994 {
1995         cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
1996         cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
1997
1998         nvme_tcp_teardown_io_queues(ctrl, shutdown);
1999         blk_mq_quiesce_queue(ctrl->admin_q);
2000         if (shutdown)
2001                 nvme_shutdown_ctrl(ctrl);
2002         else
2003                 nvme_disable_ctrl(ctrl);
2004         nvme_tcp_teardown_admin_queue(ctrl, shutdown);
2005 }
2006
2007 static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
2008 {
2009         nvme_tcp_teardown_ctrl(ctrl, true);
2010 }
2011
2012 static void nvme_reset_ctrl_work(struct work_struct *work)
2013 {
2014         struct nvme_ctrl *ctrl =
2015                 container_of(work, struct nvme_ctrl, reset_work);
2016
2017         nvme_stop_ctrl(ctrl);
2018         nvme_tcp_teardown_ctrl(ctrl, false);
2019
2020         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2021                 /* state change failure is ok if we're in DELETING state */
2022                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
2023                 return;
2024         }
2025
2026         if (nvme_tcp_setup_ctrl(ctrl, false))
2027                 goto out_fail;
2028
2029         return;
2030
2031 out_fail:
2032         ++ctrl->nr_reconnects;
2033         nvme_tcp_reconnect_or_remove(ctrl);
2034 }
2035
2036 static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
2037 {
2038         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
2039
2040         if (list_empty(&ctrl->list))
2041                 goto free_ctrl;
2042
2043         mutex_lock(&nvme_tcp_ctrl_mutex);
2044         list_del(&ctrl->list);
2045         mutex_unlock(&nvme_tcp_ctrl_mutex);
2046
2047         nvmf_free_options(nctrl->opts);
2048 free_ctrl:
2049         kfree(ctrl->queues);
2050         kfree(ctrl);
2051 }
2052
2053 static void nvme_tcp_set_sg_null(struct nvme_command *c)
2054 {
2055         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2056
2057         sg->addr = 0;
2058         sg->length = 0;
2059         sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2060                         NVME_SGL_FMT_TRANSPORT_A;
2061 }
2062
2063 static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
2064                 struct nvme_command *c, u32 data_len)
2065 {
2066         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2067
2068         sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
2069         sg->length = cpu_to_le32(data_len);
2070         sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
2071 }
2072
2073 static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
2074                 u32 data_len)
2075 {
2076         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2077
2078         sg->addr = 0;
2079         sg->length = cpu_to_le32(data_len);
2080         sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2081                         NVME_SGL_FMT_TRANSPORT_A;
2082 }
2083
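/*
 * Send the Asynchronous Event Request on the admin queue.  The AER uses the
 * pre-allocated async_req and its PDU rather than a block layer request, so
 * the command PDU is built here by hand and queued directly.
 */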
2084 static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
2085 {
2086         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
2087         struct nvme_tcp_queue *queue = &ctrl->queues[0];
2088         struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
2089         struct nvme_command *cmd = &pdu->cmd;
2090         u8 hdgst = nvme_tcp_hdgst_len(queue);
2091
2092         memset(pdu, 0, sizeof(*pdu));
2093         pdu->hdr.type = nvme_tcp_cmd;
2094         if (queue->hdr_digest)
2095                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
2096         pdu->hdr.hlen = sizeof(*pdu);
2097         pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
2098
2099         cmd->common.opcode = nvme_admin_async_event;
2100         cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
2101         cmd->common.flags |= NVME_CMD_SGL_METABUF;
2102         nvme_tcp_set_sg_null(cmd);
2103
2104         ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
2105         ctrl->async_req.offset = 0;
2106         ctrl->async_req.curr_bio = NULL;
2107         ctrl->async_req.data_len = 0;
2108
2109         nvme_tcp_queue_request(&ctrl->async_req, true);
2110 }
2111
2112 static enum blk_eh_timer_return
2113 nvme_tcp_timeout(struct request *rq, bool reserved)
2114 {
2115         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2116         struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
2117         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2118
2119         /*
2120          * Restart the timer if a controller reset is already scheduled. Any
2121          * timed out commands would be handled before entering the connecting
2122          * state.
2123          */
2124         if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
2125                 return BLK_EH_RESET_TIMER;
2126
2127         dev_warn(ctrl->ctrl.device,
2128                 "queue %d: timeout request %#x type %d\n",
2129                 nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
2130
2131         if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
2132                 /*
2133           * Teardown immediately if the controller times out while starting
2134           * or if we have already started error recovery. All outstanding
2135           * requests are completed on shutdown, so we return BLK_EH_DONE.
2136                  */
2137                 flush_work(&ctrl->err_work);
2138                 nvme_tcp_teardown_io_queues(&ctrl->ctrl, false);
2139                 nvme_tcp_teardown_admin_queue(&ctrl->ctrl, false);
2140                 return BLK_EH_DONE;
2141         }
2142
2143         dev_warn(ctrl->ctrl.device, "starting error recovery\n");
2144         nvme_tcp_error_recovery(&ctrl->ctrl);
2145
2146         return BLK_EH_RESET_TIMER;
2147 }
2148
2149 static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
2150                         struct request *rq)
2151 {
2152         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2153         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2154         struct nvme_command *c = &pdu->cmd;
2155
2156         c->common.flags |= NVME_CMD_SGL_METABUF;
2157
2158         if (!blk_rq_nr_phys_segments(rq))
2159                 nvme_tcp_set_sg_null(c);
2160         else if (rq_data_dir(rq) == WRITE &&
2161             req->data_len <= nvme_tcp_inline_data_size(queue))
2162                 nvme_tcp_set_sg_inline(queue, c, req->data_len);
2163         else
2164                 nvme_tcp_set_sg_host_data(c, req->data_len);
2165
2166         return 0;
2167 }
2168
2169 static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
2170                 struct request *rq)
2171 {
2172         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2173         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2174         struct nvme_tcp_queue *queue = req->queue;
2175         u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
2176         blk_status_t ret;
2177
2178         ret = nvme_setup_cmd(ns, rq, &pdu->cmd);
2179         if (ret)
2180                 return ret;
2181
2182         req->state = NVME_TCP_SEND_CMD_PDU;
2183         req->offset = 0;
2184         req->data_sent = 0;
2185         req->pdu_len = 0;
2186         req->pdu_sent = 0;
2187         req->data_len = blk_rq_nr_phys_segments(rq) ?
2188                                 blk_rq_payload_bytes(rq) : 0;
2189         req->curr_bio = rq->bio;
2190
2191         if (rq_data_dir(rq) == WRITE &&
2192             req->data_len <= nvme_tcp_inline_data_size(queue))
2193                 req->pdu_len = req->data_len;
2194         else if (req->curr_bio)
2195                 nvme_tcp_init_iter(req, READ);
2196
2197         pdu->hdr.type = nvme_tcp_cmd;
2198         pdu->hdr.flags = 0;
2199         if (queue->hdr_digest)
2200                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
2201         if (queue->data_digest && req->pdu_len) {
2202                 pdu->hdr.flags |= NVME_TCP_F_DDGST;
2203                 ddgst = nvme_tcp_ddgst_len(queue);
2204         }
2205         pdu->hdr.hlen = sizeof(*pdu);
2206         pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
2207         pdu->hdr.plen =
2208                 cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
2209
2210         ret = nvme_tcp_map_data(queue, rq);
2211         if (unlikely(ret)) {
2212                 nvme_cleanup_cmd(rq);
2213                 dev_err(queue->ctrl->ctrl.device,
2214                         "Failed to map data (%d)\n", ret);
2215                 return ret;
2216         }
2217
2218         return 0;
2219 }
2220
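/*
 * blk-mq ->queue_rq handler: fail or requeue commands sent to a queue that
 * is not live yet, otherwise build the command PDU and hand the request to
 * the queue for transmission.
 */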
2221 static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
2222                 const struct blk_mq_queue_data *bd)
2223 {
2224         struct nvme_ns *ns = hctx->queue->queuedata;
2225         struct nvme_tcp_queue *queue = hctx->driver_data;
2226         struct request *rq = bd->rq;
2227         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2228         bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
2229         blk_status_t ret;
2230
2231         if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2232                 return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
2233
2234         ret = nvme_tcp_setup_cmd_pdu(ns, rq);
2235         if (unlikely(ret))
2236                 return ret;
2237
2238         blk_mq_start_request(rq);
2239
2240         nvme_tcp_queue_request(req, true);
2241
2242         return BLK_STS_OK;
2243 }
2244
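/*
 * Map blk-mq hardware contexts onto the default/read/poll queue groups
 * sized in nvme_tcp_set_io_queues().  With dedicated write queues the read
 * queues follow the default queues; otherwise both types share the same
 * queues.  Poll queues, if any, are mapped last.
 */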
2245 static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
2246 {
2247         struct nvme_tcp_ctrl *ctrl = set->driver_data;
2248         struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
2249
2250         if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
2251                 /* separate read/write queues */
2252                 set->map[HCTX_TYPE_DEFAULT].nr_queues =
2253                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2254                 set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2255                 set->map[HCTX_TYPE_READ].nr_queues =
2256                         ctrl->io_queues[HCTX_TYPE_READ];
2257                 set->map[HCTX_TYPE_READ].queue_offset =
2258                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2259         } else {
2260                 /* shared read/write queues */
2261                 set->map[HCTX_TYPE_DEFAULT].nr_queues =
2262                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2263                 set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2264                 set->map[HCTX_TYPE_READ].nr_queues =
2265                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2266                 set->map[HCTX_TYPE_READ].queue_offset = 0;
2267         }
2268         blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
2269         blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
2270
2271         if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
2272                 /* map dedicated poll queues only if we have queues left */
2273                 set->map[HCTX_TYPE_POLL].nr_queues =
2274                                 ctrl->io_queues[HCTX_TYPE_POLL];
2275                 set->map[HCTX_TYPE_POLL].queue_offset =
2276                         ctrl->io_queues[HCTX_TYPE_DEFAULT] +
2277                         ctrl->io_queues[HCTX_TYPE_READ];
2278                 blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
2279         }
2280
2281         dev_info(ctrl->ctrl.device,
2282                 "mapped %d/%d/%d default/read/poll queues.\n",
2283                 ctrl->io_queues[HCTX_TYPE_DEFAULT],
2284                 ctrl->io_queues[HCTX_TYPE_READ],
2285                 ctrl->io_queues[HCTX_TYPE_POLL]);
2286
2287         return 0;
2288 }
2289
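/*
 * blk-mq ->poll handler: busy-poll the socket while its receive queue is
 * empty, then process whatever has arrived and return the number of
 * completions reaped on this queue.
 */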
2290 static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
2291 {
2292         struct nvme_tcp_queue *queue = hctx->driver_data;
2293         struct sock *sk = queue->sock->sk;
2294
2295         if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
2296                 return 0;
2297
2298         set_bit(NVME_TCP_Q_POLLING, &queue->flags);
2299         if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
2300                 sk_busy_loop(sk, true);
2301         nvme_tcp_try_recv(queue);
2302         clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
2303         return queue->nr_cqe;
2304 }
2305
2306 static const struct blk_mq_ops nvme_tcp_mq_ops = {
2307         .queue_rq       = nvme_tcp_queue_rq,
2308         .complete       = nvme_complete_rq,
2309         .init_request   = nvme_tcp_init_request,
2310         .exit_request   = nvme_tcp_exit_request,
2311         .init_hctx      = nvme_tcp_init_hctx,
2312         .timeout        = nvme_tcp_timeout,
2313         .map_queues     = nvme_tcp_map_queues,
2314         .poll           = nvme_tcp_poll,
2315 };
2316
2317 static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
2318         .queue_rq       = nvme_tcp_queue_rq,
2319         .complete       = nvme_complete_rq,
2320         .init_request   = nvme_tcp_init_request,
2321         .exit_request   = nvme_tcp_exit_request,
2322         .init_hctx      = nvme_tcp_init_admin_hctx,
2323         .timeout        = nvme_tcp_timeout,
2324 };
2325
2326 static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
2327         .name                   = "tcp",
2328         .module                 = THIS_MODULE,
2329         .flags                  = NVME_F_FABRICS,
2330         .reg_read32             = nvmf_reg_read32,
2331         .reg_read64             = nvmf_reg_read64,
2332         .reg_write32            = nvmf_reg_write32,
2333         .free_ctrl              = nvme_tcp_free_ctrl,
2334         .submit_async_event     = nvme_tcp_submit_async_event,
2335         .delete_ctrl            = nvme_tcp_delete_ctrl,
2336         .get_address            = nvmf_get_address,
2337 };
2338
2339 static bool
2340 nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
2341 {
2342         struct nvme_tcp_ctrl *ctrl;
2343         bool found = false;
2344
2345         mutex_lock(&nvme_tcp_ctrl_mutex);
2346         list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
2347                 found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2348                 if (found)
2349                         break;
2350         }
2351         mutex_unlock(&nvme_tcp_ctrl_mutex);
2352
2353         return found;
2354 }
2355
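/*
 * "create_ctrl" entry point for the fabrics layer: allocate the controller,
 * resolve the target (and optional host) address, reject duplicate
 * connections unless explicitly allowed, and perform the initial
 * association via nvme_tcp_setup_ctrl().
 */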
2356 static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
2357                 struct nvmf_ctrl_options *opts)
2358 {
2359         struct nvme_tcp_ctrl *ctrl;
2360         int ret;
2361
2362         ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2363         if (!ctrl)
2364                 return ERR_PTR(-ENOMEM);
2365
2366         INIT_LIST_HEAD(&ctrl->list);
2367         ctrl->ctrl.opts = opts;
2368         ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
2369                                 opts->nr_poll_queues + 1;
2370         ctrl->ctrl.sqsize = opts->queue_size - 1;
2371         ctrl->ctrl.kato = opts->kato;
2372
2373         INIT_DELAYED_WORK(&ctrl->connect_work,
2374                         nvme_tcp_reconnect_ctrl_work);
2375         INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
2376         INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
2377
2378         if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2379                 opts->trsvcid =
2380                         kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
2381                 if (!opts->trsvcid) {
2382                         ret = -ENOMEM;
2383                         goto out_free_ctrl;
2384                 }
2385                 opts->mask |= NVMF_OPT_TRSVCID;
2386         }
2387
2388         ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2389                         opts->traddr, opts->trsvcid, &ctrl->addr);
2390         if (ret) {
2391                 pr_err("malformed address passed: %s:%s\n",
2392                         opts->traddr, opts->trsvcid);
2393                 goto out_free_ctrl;
2394         }
2395
2396         if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2397                 ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2398                         opts->host_traddr, NULL, &ctrl->src_addr);
2399                 if (ret) {
2400                         pr_err("malformed src address passed: %s\n",
2401                                opts->host_traddr);
2402                         goto out_free_ctrl;
2403                 }
2404         }
2405
2406         if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
2407                 ret = -EALREADY;
2408                 goto out_free_ctrl;
2409         }
2410
2411         ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
2412                                 GFP_KERNEL);
2413         if (!ctrl->queues) {
2414                 ret = -ENOMEM;
2415                 goto out_free_ctrl;
2416         }
2417
2418         ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
2419         if (ret)
2420                 goto out_kfree_queues;
2421
2422         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2423                 WARN_ON_ONCE(1);
2424                 ret = -EINTR;
2425                 goto out_uninit_ctrl;
2426         }
2427
2428         ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
2429         if (ret)
2430                 goto out_uninit_ctrl;
2431
2432         dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
2433                 ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2434
2435         mutex_lock(&nvme_tcp_ctrl_mutex);
2436         list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
2437         mutex_unlock(&nvme_tcp_ctrl_mutex);
2438
2439         return &ctrl->ctrl;
2440
2441 out_uninit_ctrl:
2442         nvme_uninit_ctrl(&ctrl->ctrl);
2443         nvme_put_ctrl(&ctrl->ctrl);
2444         if (ret > 0)
2445                 ret = -EIO;
2446         return ERR_PTR(ret);
2447 out_kfree_queues:
2448         kfree(ctrl->queues);
2449 out_free_ctrl:
2450         kfree(ctrl);
2451         return ERR_PTR(ret);
2452 }
2453
2454 static struct nvmf_transport_ops nvme_tcp_transport = {
2455         .name           = "tcp",
2456         .module         = THIS_MODULE,
2457         .required_opts  = NVMF_OPT_TRADDR,
2458         .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2459                           NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2460                           NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
2461                           NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
2462                           NVMF_OPT_TOS,
2463         .create_ctrl    = nvme_tcp_create_ctrl,
2464 };
2465
static int __init nvme_tcp_init_module(void)
{
        int ret;

        nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
                        WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
        if (!nvme_tcp_wq)
                return -ENOMEM;

        /* Propagate a transport registration failure instead of ignoring it. */
        ret = nvmf_register_transport(&nvme_tcp_transport);
        if (ret)
                destroy_workqueue(nvme_tcp_wq);
        return ret;
}
2476
2477 static void __exit nvme_tcp_cleanup_module(void)
2478 {
2479         struct nvme_tcp_ctrl *ctrl;
2480
2481         nvmf_unregister_transport(&nvme_tcp_transport);
2482
2483         mutex_lock(&nvme_tcp_ctrl_mutex);
2484         list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
2485                 nvme_delete_ctrl(&ctrl->ctrl);
2486         mutex_unlock(&nvme_tcp_ctrl_mutex);
2487         flush_workqueue(nvme_delete_wq);
2488
2489         destroy_workqueue(nvme_tcp_wq);
2490 }
2491
2492 module_init(nvme_tcp_init_module);
2493 module_exit(nvme_tcp_cleanup_module);
2494
2495 MODULE_LICENSE("GPL v2");