drivers/nvme/host/tcp.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVMe over Fabrics TCP host.
4  * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 #include <linux/module.h>
8 #include <linux/init.h>
9 #include <linux/slab.h>
10 #include <linux/err.h>
11 #include <linux/nvme-tcp.h>
12 #include <net/sock.h>
13 #include <net/tcp.h>
14 #include <linux/blk-mq.h>
15 #include <crypto/hash.h>
16
17 #include "nvme.h"
18 #include "fabrics.h"
19
20 struct nvme_tcp_queue;
21
22 enum nvme_tcp_send_state {
23         NVME_TCP_SEND_CMD_PDU = 0,
24         NVME_TCP_SEND_H2C_PDU,
25         NVME_TCP_SEND_DATA,
26         NVME_TCP_SEND_DDGST,
27 };
28
29 struct nvme_tcp_request {
30         struct nvme_request     req;
31         void                    *pdu;
32         struct nvme_tcp_queue   *queue;
33         u32                     data_len;
34         u32                     pdu_len;
35         u32                     pdu_sent;
36         u16                     ttag;
37         struct list_head        entry;
38         __le32                  ddgst;
39
40         struct bio              *curr_bio;
41         struct iov_iter         iter;
42
43         /* send state */
44         size_t                  offset;
45         size_t                  data_sent;
46         enum nvme_tcp_send_state state;
47 };
48
49 enum nvme_tcp_queue_flags {
50         NVME_TCP_Q_ALLOCATED    = 0,
51         NVME_TCP_Q_LIVE         = 1,
52 };
53
54 enum nvme_tcp_recv_state {
55         NVME_TCP_RECV_PDU = 0,
56         NVME_TCP_RECV_DATA,
57         NVME_TCP_RECV_DDGST,
58 };
59
60 struct nvme_tcp_ctrl;
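/*
 * Per-queue context: the TCP socket, the list of requests queued for
 * sending, and the send/receive state machines. Queue 0 is the admin
 * queue, queues 1..N are I/O queues.
 */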
61 struct nvme_tcp_queue {
62         struct socket           *sock;
63         struct work_struct      io_work;
64         int                     io_cpu;
65
66         spinlock_t              lock;
67         struct list_head        send_list;
68
69         /* recv state */
70         void                    *pdu;
71         int                     pdu_remaining;
72         int                     pdu_offset;
73         size_t                  data_remaining;
74         size_t                  ddgst_remaining;
75
76         /* send state */
77         struct nvme_tcp_request *request;
78
79         int                     queue_size;
80         size_t                  cmnd_capsule_len;
81         struct nvme_tcp_ctrl    *ctrl;
82         unsigned long           flags;
83         bool                    rd_enabled;
84
85         bool                    hdr_digest;
86         bool                    data_digest;
87         struct ahash_request    *rcv_hash;
88         struct ahash_request    *snd_hash;
89         __le32                  exp_ddgst;
90         __le32                  recv_ddgst;
91
92         struct page_frag_cache  pf_cache;
93
94         void (*state_change)(struct sock *);
95         void (*data_ready)(struct sock *);
96         void (*write_space)(struct sock *);
97 };
98
99 struct nvme_tcp_ctrl {
100         /* read only in the hot path */
101         struct nvme_tcp_queue   *queues;
102         struct blk_mq_tag_set   tag_set;
103
104         /* other member variables */
105         struct list_head        list;
106         struct blk_mq_tag_set   admin_tag_set;
107         struct sockaddr_storage addr;
108         struct sockaddr_storage src_addr;
109         struct nvme_ctrl        ctrl;
110
111         struct work_struct      err_work;
112         struct delayed_work     connect_work;
113         struct nvme_tcp_request async_req;
114 };
115
116 static LIST_HEAD(nvme_tcp_ctrl_list);
117 static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
118 static struct workqueue_struct *nvme_tcp_wq;
119 static struct blk_mq_ops nvme_tcp_mq_ops;
120 static struct blk_mq_ops nvme_tcp_admin_mq_ops;
121
122 static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
123 {
124         return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
125 }
126
127 static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
128 {
129         return queue - queue->ctrl->queues;
130 }
131
132 static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
133 {
134         u32 queue_idx = nvme_tcp_queue_id(queue);
135
136         if (queue_idx == 0)
137                 return queue->ctrl->admin_tag_set.tags[queue_idx];
138         return queue->ctrl->tag_set.tags[queue_idx - 1];
139 }
140
141 static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
142 {
143         return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
144 }
145
146 static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
147 {
148         return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
149 }
150
151 static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
152 {
153         return queue->cmnd_capsule_len - sizeof(struct nvme_command);
154 }
155
156 static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
157 {
158         return req == &req->queue->ctrl->async_req;
159 }
160
161 static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
162 {
163         struct request *rq;
164         unsigned int bytes;
165
166         if (unlikely(nvme_tcp_async_req(req)))
167                 return false; /* async events don't have a request */
168
169         rq = blk_mq_rq_from_pdu(req);
170         bytes = blk_rq_payload_bytes(rq);
171
172         return rq_data_dir(rq) == WRITE && bytes &&
173                 bytes <= nvme_tcp_inline_data_size(req->queue);
174 }
175
176 static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
177 {
178         return req->iter.bvec->bv_page;
179 }
180
181 static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
182 {
183         return req->iter.bvec->bv_offset + req->iter.iov_offset;
184 }
185
186 static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
187 {
188         return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset,
189                         req->pdu_len - req->pdu_sent);
190 }
191
192 static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req)
193 {
194         return req->iter.iov_offset;
195 }
196
197 static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
198 {
199         return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
200                         req->pdu_len - req->pdu_sent : 0;
201 }
202
203 static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
204                 int len)
205 {
206         return nvme_tcp_pdu_data_left(req) <= len;
207 }
208
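/*
 * Point req->iter at the request payload: either the request's single
 * special_vec, or the bvecs of the current bio starting at the already
 * consumed offset (bi_bvec_done).
 */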
209 static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
210                 unsigned int dir)
211 {
212         struct request *rq = blk_mq_rq_from_pdu(req);
213         struct bio_vec *vec;
214         unsigned int size;
215         int nsegs;
216         size_t offset;
217
218         if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
219                 vec = &rq->special_vec;
220                 nsegs = 1;
221                 size = blk_rq_payload_bytes(rq);
222                 offset = 0;
223         } else {
224                 struct bio *bio = req->curr_bio;
225
226                 vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
227                 nsegs = bio_segments(bio);
228                 size = bio->bi_iter.bi_size;
229                 offset = bio->bi_iter.bi_bvec_done;
230         }
231
232         iov_iter_bvec(&req->iter, dir, vec, nsegs, size);
233         req->iter.iov_offset = offset;
234 }
235
236 static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
237                 int len)
238 {
239         req->data_sent += len;
240         req->pdu_sent += len;
241         iov_iter_advance(&req->iter, len);
242         if (!iov_iter_count(&req->iter) &&
243             req->data_sent < req->data_len) {
244                 req->curr_bio = req->curr_bio->bi_next;
245                 nvme_tcp_init_iter(req, WRITE);
246         }
247 }
248
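/* Add a request to the queue's send list and kick its io_work. */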
249 static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req)
250 {
251         struct nvme_tcp_queue *queue = req->queue;
252
253         spin_lock(&queue->lock);
254         list_add_tail(&req->entry, &queue->send_list);
255         spin_unlock(&queue->lock);
256
257         queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
258 }
259
260 static inline struct nvme_tcp_request *
261 nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
262 {
263         struct nvme_tcp_request *req;
264
265         spin_lock(&queue->lock);
266         req = list_first_entry_or_null(&queue->send_list,
267                         struct nvme_tcp_request, entry);
268         if (req)
269                 list_del(&req->entry);
270         spin_unlock(&queue->lock);
271
272         return req;
273 }
274
275 static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
276                 __le32 *dgst)
277 {
278         ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
279         crypto_ahash_final(hash);
280 }
281
282 static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
283                 struct page *page, off_t off, size_t len)
284 {
285         struct scatterlist sg;
286
287         sg_init_marker(&sg, 1);
288         sg_set_page(&sg, page, len, off);
289         ahash_request_set_crypt(hash, &sg, NULL, len);
290         crypto_ahash_update(hash);
291 }
292
293 static inline void nvme_tcp_hdgst(struct ahash_request *hash,
294                 void *pdu, size_t len)
295 {
296         struct scatterlist sg;
297
298         sg_init_one(&sg, pdu, len);
299         ahash_request_set_crypt(hash, &sg, pdu + len, len);
300         crypto_ahash_digest(hash);
301 }
302
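/*
 * Verify a received PDU header digest: recompute the digest over the
 * header and compare it against the value that followed it on the wire.
 */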
303 static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
304                 void *pdu, size_t pdu_len)
305 {
306         struct nvme_tcp_hdr *hdr = pdu;
307         __le32 recv_digest;
308         __le32 exp_digest;
309
310         if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
311                 dev_err(queue->ctrl->ctrl.device,
312                         "queue %d: header digest flag is cleared\n",
313                         nvme_tcp_queue_id(queue));
314                 return -EPROTO;
315         }
316
317         recv_digest = *(__le32 *)(pdu + hdr->hlen);
318         nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
319         exp_digest = *(__le32 *)(pdu + hdr->hlen);
320         if (recv_digest != exp_digest) {
321                 dev_err(queue->ctrl->ctrl.device,
322                         "header digest error: recv %#x expected %#x\n",
323                         le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
324                 return -EIO;
325         }
326
327         return 0;
328 }
329
330 static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
331 {
332         struct nvme_tcp_hdr *hdr = pdu;
333         u8 digest_len = nvme_tcp_hdgst_len(queue);
334         u32 len;
335
336         len = le32_to_cpu(hdr->plen) - hdr->hlen -
337                 ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
338
339         if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
340                 dev_err(queue->ctrl->ctrl.device,
341                         "queue %d: data digest flag is cleared\n",
342                         nvme_tcp_queue_id(queue));
343                 return -EPROTO;
344         }
345         crypto_ahash_init(queue->rcv_hash);
346
347         return 0;
348 }
349
350 static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
351                 struct request *rq, unsigned int hctx_idx)
352 {
353         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
354
355         page_frag_free(req->pdu);
356 }
357
358 static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
359                 struct request *rq, unsigned int hctx_idx,
360                 unsigned int numa_node)
361 {
362         struct nvme_tcp_ctrl *ctrl = set->driver_data;
363         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
364         int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
365         struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
366         u8 hdgst = nvme_tcp_hdgst_len(queue);
367
368         req->pdu = page_frag_alloc(&queue->pf_cache,
369                 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
370                 GFP_KERNEL | __GFP_ZERO);
371         if (!req->pdu)
372                 return -ENOMEM;
373
374         req->queue = queue;
375         nvme_req(rq)->ctrl = &ctrl->ctrl;
376
377         return 0;
378 }
379
380 static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
381                 unsigned int hctx_idx)
382 {
383         struct nvme_tcp_ctrl *ctrl = data;
384         struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
385
386         hctx->driver_data = queue;
387         return 0;
388 }
389
390 static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
391                 unsigned int hctx_idx)
392 {
393         struct nvme_tcp_ctrl *ctrl = data;
394         struct nvme_tcp_queue *queue = &ctrl->queues[0];
395
396         hctx->driver_data = queue;
397         return 0;
398 }
399
400 static enum nvme_tcp_recv_state
401 nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
402 {
403         return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
404                 (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
405                 NVME_TCP_RECV_DATA;
406 }
407
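/* Reset the receive state to expect the header of the next PDU. */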
408 static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
409 {
410         queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
411                                 nvme_tcp_hdgst_len(queue);
412         queue->pdu_offset = 0;
413         queue->data_remaining = -1;
414         queue->ddgst_remaining = 0;
415 }
416
417 static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
418 {
419         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
420                 return;
421
422         queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->err_work);
423 }
424
425 static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
426                 struct nvme_completion *cqe)
427 {
428         struct request *rq;
429
430         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
431         if (!rq) {
432                 dev_err(queue->ctrl->ctrl.device,
433                         "queue %d tag 0x%x not found\n",
434                         nvme_tcp_queue_id(queue), cqe->command_id);
435                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
436                 return -EINVAL;
437         }
438
439         nvme_end_request(rq, cqe->status, cqe->result);
440
441         return 0;
442 }
443
444 static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
445                 struct nvme_tcp_data_pdu *pdu)
446 {
447         struct request *rq;
448
449         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
450         if (!rq) {
451                 dev_err(queue->ctrl->ctrl.device,
452                         "queue %d tag %#x not found\n",
453                         nvme_tcp_queue_id(queue), pdu->command_id);
454                 return -ENOENT;
455         }
456
457         if (!blk_rq_payload_bytes(rq)) {
458                 dev_err(queue->ctrl->ctrl.device,
459                         "queue %d tag %#x unexpected data\n",
460                         nvme_tcp_queue_id(queue), rq->tag);
461                 return -EIO;
462         }
463
464         queue->data_remaining = le32_to_cpu(pdu->data_length);
465
466         return 0;
467
468 }
469
470 static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
471                 struct nvme_tcp_rsp_pdu *pdu)
472 {
473         struct nvme_completion *cqe = &pdu->cqe;
474         int ret = 0;
475
476         /*
477          * AEN requests are special as they don't time out and can
478          * survive any kind of queue freeze and often don't respond to
479          * aborts.  We don't even bother to allocate a struct request
480          * for them but rather special case them here.
481          */
482         if (unlikely(nvme_tcp_queue_id(queue) == 0 &&
483             cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
484                 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
485                                 &cqe->result);
486         else
487                 ret = nvme_tcp_process_nvme_cqe(queue, cqe);
488
489         return ret;
490 }
491
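/*
 * Build an H2CData PDU in response to a controller R2T, after checking
 * that the solicited length and offset fit within the request.
 */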
492 static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
493                 struct nvme_tcp_r2t_pdu *pdu)
494 {
495         struct nvme_tcp_data_pdu *data = req->pdu;
496         struct nvme_tcp_queue *queue = req->queue;
497         struct request *rq = blk_mq_rq_from_pdu(req);
498         u8 hdgst = nvme_tcp_hdgst_len(queue);
499         u8 ddgst = nvme_tcp_ddgst_len(queue);
500
501         req->pdu_len = le32_to_cpu(pdu->r2t_length);
502         req->pdu_sent = 0;
503
504         if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
505                 dev_err(queue->ctrl->ctrl.device,
506                         "req %d r2t len %u exceeded data len %u (%zu sent)\n",
507                         rq->tag, req->pdu_len, req->data_len,
508                         req->data_sent);
509                 return -EPROTO;
510         }
511
512         if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
513                 dev_err(queue->ctrl->ctrl.device,
514                         "req %d unexpected r2t offset %u (expected %zu)\n",
515                         rq->tag, le32_to_cpu(pdu->r2t_offset),
516                         req->data_sent);
517                 return -EPROTO;
518         }
519
520         memset(data, 0, sizeof(*data));
521         data->hdr.type = nvme_tcp_h2c_data;
522         data->hdr.flags = NVME_TCP_F_DATA_LAST;
523         if (queue->hdr_digest)
524                 data->hdr.flags |= NVME_TCP_F_HDGST;
525         if (queue->data_digest)
526                 data->hdr.flags |= NVME_TCP_F_DDGST;
527         data->hdr.hlen = sizeof(*data);
528         data->hdr.pdo = data->hdr.hlen + hdgst;
529         data->hdr.plen =
530                 cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
531         data->ttag = pdu->ttag;
532         data->command_id = rq->tag;
533         data->data_offset = cpu_to_le32(req->data_sent);
534         data->data_length = cpu_to_le32(req->pdu_len);
535         return 0;
536 }
537
538 static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
539                 struct nvme_tcp_r2t_pdu *pdu)
540 {
541         struct nvme_tcp_request *req;
542         struct request *rq;
543         int ret;
544
545         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
546         if (!rq) {
547                 dev_err(queue->ctrl->ctrl.device,
548                         "queue %d tag %#x not found\n",
549                         nvme_tcp_queue_id(queue), pdu->command_id);
550                 return -ENOENT;
551         }
552         req = blk_mq_rq_to_pdu(rq);
553
554         ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
555         if (unlikely(ret))
556                 return ret;
557
558         req->state = NVME_TCP_SEND_H2C_PDU;
559         req->offset = 0;
560
561         nvme_tcp_queue_request(req);
562
563         return 0;
564 }
565
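/*
 * Consume PDU header bytes from the socket. Once the header is
 * complete, verify the header digest if negotiated and dispatch on the
 * PDU type.
 */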
566 static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
567                 unsigned int *offset, size_t *len)
568 {
569         struct nvme_tcp_hdr *hdr;
570         char *pdu = queue->pdu;
571         size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
572         int ret;
573
574         ret = skb_copy_bits(skb, *offset,
575                 &pdu[queue->pdu_offset], rcv_len);
576         if (unlikely(ret))
577                 return ret;
578
579         queue->pdu_remaining -= rcv_len;
580         queue->pdu_offset += rcv_len;
581         *offset += rcv_len;
582         *len -= rcv_len;
583         if (queue->pdu_remaining)
584                 return 0;
585
586         hdr = queue->pdu;
587         if (queue->hdr_digest) {
588                 ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
589                 if (unlikely(ret))
590                         return ret;
591         }
592
593
594         if (queue->data_digest) {
595                 ret = nvme_tcp_check_ddgst(queue, queue->pdu);
596                 if (unlikely(ret))
597                         return ret;
598         }
599
600         switch (hdr->type) {
601         case nvme_tcp_c2h_data:
602                 ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
603                 break;
604         case nvme_tcp_rsp:
605                 nvme_tcp_init_recv_ctx(queue);
606                 ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu);
607                 break;
608         case nvme_tcp_r2t:
609                 nvme_tcp_init_recv_ctx(queue);
610                 ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
611                 break;
612         default:
613                 dev_err(queue->ctrl->ctrl.device,
614                         "unsupported pdu type (%d)\n", hdr->type);
615                 return -EINVAL;
616         }
617
618         return ret;
619 }
620
621 static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
622                               unsigned int *offset, size_t *len)
623 {
624         struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
625         struct nvme_tcp_request *req;
626         struct request *rq;
627
628         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
629         if (!rq) {
630                 dev_err(queue->ctrl->ctrl.device,
631                         "queue %d tag %#x not found\n",
632                         nvme_tcp_queue_id(queue), pdu->command_id);
633                 return -ENOENT;
634         }
635         req = blk_mq_rq_to_pdu(rq);
636
637         while (true) {
638                 int recv_len, ret;
639
640                 recv_len = min_t(size_t, *len, queue->data_remaining);
641                 if (!recv_len)
642                         break;
643
644                 if (!iov_iter_count(&req->iter)) {
645                         req->curr_bio = req->curr_bio->bi_next;
646
647                         /*
648                          * If we don't have any bios it means the controller
649                          * sent more data than we requested, hence error
650                          */
651                         if (!req->curr_bio) {
652                                 dev_err(queue->ctrl->ctrl.device,
653                                         "queue %d no space in request %#x",
654                                         nvme_tcp_queue_id(queue), rq->tag);
655                                 nvme_tcp_init_recv_ctx(queue);
656                                 return -EIO;
657                         }
658                         nvme_tcp_init_iter(req, READ);
659                 }
660
661                 /* we can read only from what is left in this bio */
662                 recv_len = min_t(size_t, recv_len,
663                                 iov_iter_count(&req->iter));
664
665                 if (queue->data_digest)
666                         ret = skb_copy_and_hash_datagram_iter(skb, *offset,
667                                 &req->iter, recv_len, queue->rcv_hash);
668                 else
669                         ret = skb_copy_datagram_iter(skb, *offset,
670                                         &req->iter, recv_len);
671                 if (ret) {
672                         dev_err(queue->ctrl->ctrl.device,
673                                 "queue %d failed to copy request %#x data",
674                                 nvme_tcp_queue_id(queue), rq->tag);
675                         return ret;
676                 }
677
678                 *len -= recv_len;
679                 *offset += recv_len;
680                 queue->data_remaining -= recv_len;
681         }
682
683         if (!queue->data_remaining) {
684                 if (queue->data_digest) {
685                         nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
686                         queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
687                 } else {
688                         nvme_tcp_init_recv_ctx(queue);
689                 }
690         }
691
692         return 0;
693 }
694
695 static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
696                 struct sk_buff *skb, unsigned int *offset, size_t *len)
697 {
698         char *ddgst = (char *)&queue->recv_ddgst;
699         size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
700         off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
701         int ret;
702
703         ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
704         if (unlikely(ret))
705                 return ret;
706
707         queue->ddgst_remaining -= recv_len;
708         *offset += recv_len;
709         *len -= recv_len;
710         if (queue->ddgst_remaining)
711                 return 0;
712
713         if (queue->recv_ddgst != queue->exp_ddgst) {
714                 dev_err(queue->ctrl->ctrl.device,
715                         "data digest error: recv %#x expected %#x\n",
716                         le32_to_cpu(queue->recv_ddgst),
717                         le32_to_cpu(queue->exp_ddgst));
718                 return -EIO;
719         }
720
721         nvme_tcp_init_recv_ctx(queue);
722         return 0;
723 }
724
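/*
 * tcp_read_sock() callback: route the skb contents to the PDU header,
 * data or data-digest receive path based on the current receive state.
 */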
725 static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
726                              unsigned int offset, size_t len)
727 {
728         struct nvme_tcp_queue *queue = desc->arg.data;
729         size_t consumed = len;
730         int result;
731
732         while (len) {
733                 switch (nvme_tcp_recv_state(queue)) {
734                 case NVME_TCP_RECV_PDU:
735                         result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
736                         break;
737                 case NVME_TCP_RECV_DATA:
738                         result = nvme_tcp_recv_data(queue, skb, &offset, &len);
739                         break;
740                 case NVME_TCP_RECV_DDGST:
741                         result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
742                         break;
743                 default:
744                         result = -EFAULT;
745                 }
746                 if (result) {
747                         dev_err(queue->ctrl->ctrl.device,
748                                 "receive failed: %d\n", result);
749                         queue->rd_enabled = false;
750                         nvme_tcp_error_recovery(&queue->ctrl->ctrl);
751                         return result;
752                 }
753         }
754
755         return consumed;
756 }
757
758 static void nvme_tcp_data_ready(struct sock *sk)
759 {
760         struct nvme_tcp_queue *queue;
761
762         read_lock(&sk->sk_callback_lock);
763         queue = sk->sk_user_data;
764         if (likely(queue && queue->rd_enabled))
765                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
766         read_unlock(&sk->sk_callback_lock);
767 }
768
769 static void nvme_tcp_write_space(struct sock *sk)
770 {
771         struct nvme_tcp_queue *queue;
772
773         read_lock_bh(&sk->sk_callback_lock);
774         queue = sk->sk_user_data;
775         if (likely(queue && sk_stream_is_writeable(sk))) {
776                 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
777                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
778         }
779         read_unlock_bh(&sk->sk_callback_lock);
780 }
781
782 static void nvme_tcp_state_change(struct sock *sk)
783 {
784         struct nvme_tcp_queue *queue;
785
786         read_lock(&sk->sk_callback_lock);
787         queue = sk->sk_user_data;
788         if (!queue)
789                 goto done;
790
791         switch (sk->sk_state) {
792         case TCP_CLOSE:
793         case TCP_CLOSE_WAIT:
794         case TCP_LAST_ACK:
795         case TCP_FIN_WAIT1:
796         case TCP_FIN_WAIT2:
797                 /* all these states trigger error recovery */
798                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
799                 break;
800         default:
801                 dev_info(queue->ctrl->ctrl.device,
802                         "queue %d socket state %d\n",
803                         nvme_tcp_queue_id(queue), sk->sk_state);
804         }
805
806         queue->state_change(sk);
807 done:
808         read_unlock(&sk->sk_callback_lock);
809 }
810
811 static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
812 {
813         queue->request = NULL;
814 }
815
816 static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
817 {
818         union nvme_result res = {};
819
820         nvme_end_request(blk_mq_rq_from_pdu(req),
821                 cpu_to_le16(NVME_SC_DATA_XFER_ERROR), res);
822 }
823
824 static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
825 {
826         struct nvme_tcp_queue *queue = req->queue;
827
828         while (true) {
829                 struct page *page = nvme_tcp_req_cur_page(req);
830                 size_t offset = nvme_tcp_req_cur_offset(req);
831                 size_t len = nvme_tcp_req_cur_length(req);
832                 bool last = nvme_tcp_pdu_last_send(req, len);
833                 int ret, flags = MSG_DONTWAIT;
834
835                 if (last && !queue->data_digest)
836                         flags |= MSG_EOR;
837                 else
838                         flags |= MSG_MORE;
839
840                 ret = kernel_sendpage(queue->sock, page, offset, len, flags);
841                 if (ret <= 0)
842                         return ret;
843
844                 nvme_tcp_advance_req(req, ret);
845                 if (queue->data_digest)
846                         nvme_tcp_ddgst_update(queue->snd_hash, page,
847                                         offset, ret);
848
849                 /* fully successful last write */
850                 if (last && ret == len) {
851                         if (queue->data_digest) {
852                                 nvme_tcp_ddgst_final(queue->snd_hash,
853                                         &req->ddgst);
854                                 req->state = NVME_TCP_SEND_DDGST;
855                                 req->offset = 0;
856                         } else {
857                                 nvme_tcp_done_send_req(queue);
858                         }
859                         return 1;
860                 }
861         }
862         return -EAGAIN;
863 }
864
865 static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
866 {
867         struct nvme_tcp_queue *queue = req->queue;
868         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
869         bool inline_data = nvme_tcp_has_inline_data(req);
870         int flags = MSG_DONTWAIT | (inline_data ? MSG_MORE : MSG_EOR);
871         u8 hdgst = nvme_tcp_hdgst_len(queue);
872         int len = sizeof(*pdu) + hdgst - req->offset;
873         int ret;
874
875         if (queue->hdr_digest && !req->offset)
876                 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
877
878         ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
879                         offset_in_page(pdu) + req->offset, len, flags);
880         if (unlikely(ret <= 0))
881                 return ret;
882
883         len -= ret;
884         if (!len) {
885                 if (inline_data) {
886                         req->state = NVME_TCP_SEND_DATA;
887                         if (queue->data_digest)
888                                 crypto_ahash_init(queue->snd_hash);
889                         nvme_tcp_init_iter(req, WRITE);
890                 } else {
891                         nvme_tcp_done_send_req(queue);
892                 }
893                 return 1;
894         }
895         req->offset += ret;
896
897         return -EAGAIN;
898 }
899
900 static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
901 {
902         struct nvme_tcp_queue *queue = req->queue;
903         struct nvme_tcp_data_pdu *pdu = req->pdu;
904         u8 hdgst = nvme_tcp_hdgst_len(queue);
905         int len = sizeof(*pdu) - req->offset + hdgst;
906         int ret;
907
908         if (queue->hdr_digest && !req->offset)
909                 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
910
911         ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
912                         offset_in_page(pdu) + req->offset, len,
913                         MSG_DONTWAIT | MSG_MORE);
914         if (unlikely(ret <= 0))
915                 return ret;
916
917         len -= ret;
918         if (!len) {
919                 req->state = NVME_TCP_SEND_DATA;
920                 if (queue->data_digest)
921                         crypto_ahash_init(queue->snd_hash);
922                 if (!req->data_sent)
923                         nvme_tcp_init_iter(req, WRITE);
924                 return 1;
925         }
926         req->offset += ret;
927
928         return -EAGAIN;
929 }
930
931 static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
932 {
933         struct nvme_tcp_queue *queue = req->queue;
934         int ret;
935         struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
936         struct kvec iov = {
937                 .iov_base = (u8 *)&req->ddgst + req->offset,
938                 .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
939         };
940
941         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
942         if (unlikely(ret <= 0))
943                 return ret;
944
945         if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
946                 nvme_tcp_done_send_req(queue);
947                 return 1;
948         }
949
950         req->offset += ret;
951         return -EAGAIN;
952 }
953
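/*
 * Drive the current request through its send states (command PDU,
 * H2CData PDU, data, data digest). Returns 1 when a stage completed,
 * 0 if there is nothing to send or the socket is full, or a negative
 * error.
 */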
954 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
955 {
956         struct nvme_tcp_request *req;
957         int ret = 1;
958
959         if (!queue->request) {
960                 queue->request = nvme_tcp_fetch_request(queue);
961                 if (!queue->request)
962                         return 0;
963         }
964         req = queue->request;
965
966         if (req->state == NVME_TCP_SEND_CMD_PDU) {
967                 ret = nvme_tcp_try_send_cmd_pdu(req);
968                 if (ret <= 0)
969                         goto done;
970                 if (!nvme_tcp_has_inline_data(req))
971                         return ret;
972         }
973
974         if (req->state == NVME_TCP_SEND_H2C_PDU) {
975                 ret = nvme_tcp_try_send_data_pdu(req);
976                 if (ret <= 0)
977                         goto done;
978         }
979
980         if (req->state == NVME_TCP_SEND_DATA) {
981                 ret = nvme_tcp_try_send_data(req);
982                 if (ret <= 0)
983                         goto done;
984         }
985
986         if (req->state == NVME_TCP_SEND_DDGST)
987                 ret = nvme_tcp_try_send_ddgst(req);
988 done:
989         if (ret == -EAGAIN)
990                 ret = 0;
991         return ret;
992 }
993
994 static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
995 {
996         struct sock *sk = queue->sock->sk;
997         read_descriptor_t rd_desc;
998         int consumed;
999
1000         rd_desc.arg.data = queue;
1001         rd_desc.count = 1;
1002         lock_sock(sk);
1003         consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1004         release_sock(sk);
1005         return consumed;
1006 }
1007
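/*
 * Per-queue I/O work: alternate between sending queued requests and
 * receiving from the socket, and requeue itself as long as either
 * direction made progress.
 */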
1008 static void nvme_tcp_io_work(struct work_struct *w)
1009 {
1010         struct nvme_tcp_queue *queue =
1011                 container_of(w, struct nvme_tcp_queue, io_work);
1012         unsigned long start = jiffies + msecs_to_jiffies(1);
1013
1014         do {
1015                 bool pending = false;
1016                 int result;
1017
1018                 result = nvme_tcp_try_send(queue);
1019                 if (result > 0) {
1020                         pending = true;
1021                 } else if (unlikely(result < 0)) {
1022                         dev_err(queue->ctrl->ctrl.device,
1023                                 "failed to send request %d\n", result);
1024                         if (result != -EPIPE)
1025                                 nvme_tcp_fail_request(queue->request);
1026                         nvme_tcp_done_send_req(queue);
1027                         return;
1028                 }
1029
1030                 result = nvme_tcp_try_recv(queue);
1031                 if (result > 0)
1032                         pending = true;
1033
1034                 if (!pending)
1035                         return;
1036
1037         } while (!time_after(jiffies, start)); /* quota is exhausted */
1038
1039         queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
1040 }
1041
1042 static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1043 {
1044         struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1045
1046         ahash_request_free(queue->rcv_hash);
1047         ahash_request_free(queue->snd_hash);
1048         crypto_free_ahash(tfm);
1049 }
1050
1051 static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1052 {
1053         struct crypto_ahash *tfm;
1054
1055         tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1056         if (IS_ERR(tfm))
1057                 return PTR_ERR(tfm);
1058
1059         queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1060         if (!queue->snd_hash)
1061                 goto free_tfm;
1062         ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1063
1064         queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1065         if (!queue->rcv_hash)
1066                 goto free_snd_hash;
1067         ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1068
1069         return 0;
1070 free_snd_hash:
1071         ahash_request_free(queue->snd_hash);
1072 free_tfm:
1073         crypto_free_ahash(tfm);
1074         return -ENOMEM;
1075 }
1076
1077 static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1078 {
1079         struct nvme_tcp_request *async = &ctrl->async_req;
1080
1081         page_frag_free(async->pdu);
1082 }
1083
1084 static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1085 {
1086         struct nvme_tcp_queue *queue = &ctrl->queues[0];
1087         struct nvme_tcp_request *async = &ctrl->async_req;
1088         u8 hdgst = nvme_tcp_hdgst_len(queue);
1089
1090         async->pdu = page_frag_alloc(&queue->pf_cache,
1091                 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1092                 GFP_KERNEL | __GFP_ZERO);
1093         if (!async->pdu)
1094                 return -ENOMEM;
1095
1096         async->queue = &ctrl->queues[0];
1097         return 0;
1098 }
1099
1100 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1101 {
1102         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1103         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1104
1105         if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1106                 return;
1107
1108         if (queue->hdr_digest || queue->data_digest)
1109                 nvme_tcp_free_crypto(queue);
1110
1111         sock_release(queue->sock);
1112         kfree(queue->pdu);
1113 }
1114
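/*
 * NVMe/TCP connection initialization: send an ICReq PDU, receive the
 * ICResp and verify that the PDU format version, digest settings and
 * data alignment match what the host asked for.
 */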
1115 static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1116 {
1117         struct nvme_tcp_icreq_pdu *icreq;
1118         struct nvme_tcp_icresp_pdu *icresp;
1119         struct msghdr msg = {};
1120         struct kvec iov;
1121         bool ctrl_hdgst, ctrl_ddgst;
1122         int ret;
1123
1124         icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1125         if (!icreq)
1126                 return -ENOMEM;
1127
1128         icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
1129         if (!icresp) {
1130                 ret = -ENOMEM;
1131                 goto free_icreq;
1132         }
1133
1134         icreq->hdr.type = nvme_tcp_icreq;
1135         icreq->hdr.hlen = sizeof(*icreq);
1136         icreq->hdr.pdo = 0;
1137         icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1138         icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1139         icreq->maxr2t = 0; /* single inflight r2t supported */
1140         icreq->hpda = 0; /* no alignment constraint */
1141         if (queue->hdr_digest)
1142                 icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1143         if (queue->data_digest)
1144                 icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1145
1146         iov.iov_base = icreq;
1147         iov.iov_len = sizeof(*icreq);
1148         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1149         if (ret < 0)
1150                 goto free_icresp;
1151
1152         memset(&msg, 0, sizeof(msg));
1153         iov.iov_base = icresp;
1154         iov.iov_len = sizeof(*icresp);
1155         ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1156                         iov.iov_len, msg.msg_flags);
1157         if (ret < 0)
1158                 goto free_icresp;
1159
1160         ret = -EINVAL;
1161         if (icresp->hdr.type != nvme_tcp_icresp) {
1162                 pr_err("queue %d: bad type returned %d\n",
1163                         nvme_tcp_queue_id(queue), icresp->hdr.type);
1164                 goto free_icresp;
1165         }
1166
1167         if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1168                 pr_err("queue %d: bad pdu length returned %d\n",
1169                         nvme_tcp_queue_id(queue), icresp->hdr.plen);
1170                 goto free_icresp;
1171         }
1172
1173         if (icresp->pfv != NVME_TCP_PFV_1_0) {
1174                 pr_err("queue %d: bad pfv returned %d\n",
1175                         nvme_tcp_queue_id(queue), icresp->pfv);
1176                 goto free_icresp;
1177         }
1178
1179         ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1180         if ((queue->data_digest && !ctrl_ddgst) ||
1181             (!queue->data_digest && ctrl_ddgst)) {
1182                 pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1183                         nvme_tcp_queue_id(queue),
1184                         queue->data_digest ? "enabled" : "disabled",
1185                         ctrl_ddgst ? "enabled" : "disabled");
1186                 goto free_icresp;
1187         }
1188
1189         ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1190         if ((queue->hdr_digest && !ctrl_hdgst) ||
1191             (!queue->hdr_digest && ctrl_hdgst)) {
1192                 pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1193                         nvme_tcp_queue_id(queue),
1194                         queue->hdr_digest ? "enabled" : "disabled",
1195                         ctrl_hdgst ? "enabled" : "disabled");
1196                 goto free_icresp;
1197         }
1198
1199         if (icresp->cpda != 0) {
1200                 pr_err("queue %d: unsupported cpda returned %d\n",
1201                         nvme_tcp_queue_id(queue), icresp->cpda);
1202                 goto free_icresp;
1203         }
1204
1205         ret = 0;
1206 free_icresp:
1207         kfree(icresp);
1208 free_icreq:
1209         kfree(icreq);
1210         return ret;
1211 }
1212
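/*
 * Create a queue's TCP socket, set its options, optionally bind to the
 * host traddr, allocate digest contexts and the receive PDU buffer,
 * connect to the controller, run the initialization handshake and
 * install the socket callbacks.
 */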
1213 static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
1214                 int qid, size_t queue_size)
1215 {
1216         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1217         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1218         struct linger sol = { .l_onoff = 1, .l_linger = 0 };
1219         int ret, opt, rcv_pdu_size, n;
1220
1221         queue->ctrl = ctrl;
1222         INIT_LIST_HEAD(&queue->send_list);
1223         spin_lock_init(&queue->lock);
1224         INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1225         queue->queue_size = queue_size;
1226
1227         if (qid > 0)
1228                 queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
1229         else
1230                 queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1231                                                 NVME_TCP_ADMIN_CCSZ;
1232
1233         ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1234                         IPPROTO_TCP, &queue->sock);
1235         if (ret) {
1236                 dev_err(ctrl->ctrl.device,
1237                         "failed to create socket: %d\n", ret);
1238                 return ret;
1239         }
1240
1241         /* Single syn retry */
1242         opt = 1;
1243         ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
1244                         (char *)&opt, sizeof(opt));
1245         if (ret) {
1246                 dev_err(ctrl->ctrl.device,
1247                         "failed to set TCP_SYNCNT sock opt %d\n", ret);
1248                 goto err_sock;
1249         }
1250
1251         /* Set TCP no delay */
1252         opt = 1;
1253         ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
1254                         TCP_NODELAY, (char *)&opt, sizeof(opt));
1255         if (ret) {
1256                 dev_err(ctrl->ctrl.device,
1257                         "failed to set TCP_NODELAY sock opt %d\n", ret);
1258                 goto err_sock;
1259         }
1260
1261         /*
1262          * Cleanup whatever is sitting in the TCP transmit queue on socket
1263          * close. This is done to prevent stale data from being sent should
1264          * the network connection be restored before TCP times out.
1265          */
1266         ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
1267                         (char *)&sol, sizeof(sol));
1268         if (ret) {
1269                 dev_err(ctrl->ctrl.device,
1270                         "failed to set SO_LINGER sock opt %d\n", ret);
1271                 goto err_sock;
1272         }
1273
1274         queue->sock->sk->sk_allocation = GFP_ATOMIC;
1275         if (!qid)
1276                 n = 0;
1277         else
1278                 n = (qid - 1) % num_online_cpus();
1279         queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
1280         queue->request = NULL;
1281         queue->data_remaining = 0;
1282         queue->ddgst_remaining = 0;
1283         queue->pdu_remaining = 0;
1284         queue->pdu_offset = 0;
1285         sk_set_memalloc(queue->sock->sk);
1286
1287         if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
1288                 ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1289                         sizeof(ctrl->src_addr));
1290                 if (ret) {
1291                         dev_err(ctrl->ctrl.device,
1292                                 "failed to bind queue %d socket %d\n",
1293                                 qid, ret);
1294                         goto err_sock;
1295                 }
1296         }
1297
1298         queue->hdr_digest = nctrl->opts->hdr_digest;
1299         queue->data_digest = nctrl->opts->data_digest;
1300         if (queue->hdr_digest || queue->data_digest) {
1301                 ret = nvme_tcp_alloc_crypto(queue);
1302                 if (ret) {
1303                         dev_err(ctrl->ctrl.device,
1304                                 "failed to allocate queue %d crypto\n", qid);
1305                         goto err_sock;
1306                 }
1307         }
1308
1309         rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1310                         nvme_tcp_hdgst_len(queue);
1311         queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1312         if (!queue->pdu) {
1313                 ret = -ENOMEM;
1314                 goto err_crypto;
1315         }
1316
1317         dev_dbg(ctrl->ctrl.device, "connecting queue %d\n",
1318                         nvme_tcp_queue_id(queue));
1319
1320         ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1321                 sizeof(ctrl->addr), 0);
1322         if (ret) {
1323                 dev_err(ctrl->ctrl.device,
1324                         "failed to connect socket: %d\n", ret);
1325                 goto err_rcv_pdu;
1326         }
1327
1328         ret = nvme_tcp_init_connection(queue);
1329         if (ret)
1330                 goto err_init_connect;
1331
1332         queue->rd_enabled = true;
1333         set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
1334         nvme_tcp_init_recv_ctx(queue);
1335
1336         write_lock_bh(&queue->sock->sk->sk_callback_lock);
1337         queue->sock->sk->sk_user_data = queue;
1338         queue->state_change = queue->sock->sk->sk_state_change;
1339         queue->data_ready = queue->sock->sk->sk_data_ready;
1340         queue->write_space = queue->sock->sk->sk_write_space;
1341         queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1342         queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1343         queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1344         write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1345
1346         return 0;
1347
1348 err_init_connect:
1349         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1350 err_rcv_pdu:
1351         kfree(queue->pdu);
1352 err_crypto:
1353         if (queue->hdr_digest || queue->data_digest)
1354                 nvme_tcp_free_crypto(queue);
1355 err_sock:
1356         sock_release(queue->sock);
1357         queue->sock = NULL;
1358         return ret;
1359 }
1360
1361 static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
1362 {
1363         struct socket *sock = queue->sock;
1364
1365         write_lock_bh(&sock->sk->sk_callback_lock);
1366         sock->sk->sk_user_data  = NULL;
1367         sock->sk->sk_data_ready = queue->data_ready;
1368         sock->sk->sk_state_change = queue->state_change;
1369         sock->sk->sk_write_space  = queue->write_space;
1370         write_unlock_bh(&sock->sk->sk_callback_lock);
1371 }
1372
1373 static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1374 {
1375         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1376         nvme_tcp_restore_sock_calls(queue);
1377         cancel_work_sync(&queue->io_work);
1378 }
1379
1380 static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1381 {
1382         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1383         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1384
1385         if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1386                 return;
1387
1388         __nvme_tcp_stop_queue(queue);
1389 }
1390
1391 static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1392 {
1393         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1394         int ret;
1395
1396         if (idx)
1397                 ret = nvmf_connect_io_queue(nctrl, idx, false);
1398         else
1399                 ret = nvmf_connect_admin_queue(nctrl);
1400
1401         if (!ret) {
1402                 set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
1403         } else {
1404                 __nvme_tcp_stop_queue(&ctrl->queues[idx]);
1405                 dev_err(nctrl->device,
1406                         "failed to connect queue: %d ret=%d\n", idx, ret);
1407         }
1408         return ret;
1409 }
1410
1411 static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
1412                 bool admin)
1413 {
1414         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1415         struct blk_mq_tag_set *set;
1416         int ret;
1417
1418         if (admin) {
1419                 set = &ctrl->admin_tag_set;
1420                 memset(set, 0, sizeof(*set));
1421                 set->ops = &nvme_tcp_admin_mq_ops;
1422                 set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1423                 set->reserved_tags = 2; /* connect + keep-alive */
1424                 set->numa_node = NUMA_NO_NODE;
1425                 set->cmd_size = sizeof(struct nvme_tcp_request);
1426                 set->driver_data = ctrl;
1427                 set->nr_hw_queues = 1;
1428                 set->timeout = ADMIN_TIMEOUT;
1429         } else {
1430                 set = &ctrl->tag_set;
1431                 memset(set, 0, sizeof(*set));
1432                 set->ops = &nvme_tcp_mq_ops;
1433                 set->queue_depth = nctrl->sqsize + 1;
1434                 set->reserved_tags = 1; /* fabric connect */
1435                 set->numa_node = NUMA_NO_NODE;
1436                 set->flags = BLK_MQ_F_SHOULD_MERGE;
1437                 set->cmd_size = sizeof(struct nvme_tcp_request);
1438                 set->driver_data = ctrl;
1439                 set->nr_hw_queues = nctrl->queue_count - 1;
1440                 set->timeout = NVME_IO_TIMEOUT;
1441                 set->nr_maps = 2 /* default + read */;
1442         }
1443
1444         ret = blk_mq_alloc_tag_set(set);
1445         if (ret)
1446                 return ERR_PTR(ret);
1447
1448         return set;
1449 }
1450
1451 static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1452 {
1453         if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1454                 nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1455                 to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1456         }
1457
1458         nvme_tcp_free_queue(ctrl, 0);
1459 }
1460
1461 static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1462 {
1463         int i;
1464
1465         for (i = 1; i < ctrl->queue_count; i++)
1466                 nvme_tcp_free_queue(ctrl, i);
1467 }
1468
1469 static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1470 {
1471         int i;
1472
1473         for (i = 1; i < ctrl->queue_count; i++)
1474                 nvme_tcp_stop_queue(ctrl, i);
1475 }
1476
1477 static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
1478 {
1479         int i, ret = 0;
1480
1481         for (i = 1; i < ctrl->queue_count; i++) {
1482                 ret = nvme_tcp_start_queue(ctrl, i);
1483                 if (ret)
1484                         goto out_stop_queues;
1485         }
1486
1487         return 0;
1488
1489 out_stop_queues:
1490         for (i--; i >= 1; i--)
1491                 nvme_tcp_stop_queue(ctrl, i);
1492         return ret;
1493 }
1494
1495 static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1496 {
1497         int ret;
1498
1499         ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
1500         if (ret)
1501                 return ret;
1502
1503         ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1504         if (ret)
1505                 goto out_free_queue;
1506
1507         return 0;
1508
1509 out_free_queue:
1510         nvme_tcp_free_queue(ctrl, 0);
1511         return ret;
1512 }
1513
1514 static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1515 {
1516         int i, ret;
1517
1518         for (i = 1; i < ctrl->queue_count; i++) {
1519                 ret = nvme_tcp_alloc_queue(ctrl, i,
1520                                 ctrl->sqsize + 1);
1521                 if (ret)
1522                         goto out_free_queues;
1523         }
1524
1525         return 0;
1526
1527 out_free_queues:
1528         for (i--; i >= 1; i--)
1529                 nvme_tcp_free_queue(ctrl, i);
1530
1531         return ret;
1532 }
1533
1534 static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
1535 {
1536         unsigned int nr_io_queues;
1537
1538         nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
1539         nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
1540
1541         return nr_io_queues;
1542 }
1543
1544 static int nvme_alloc_io_queues(struct nvme_ctrl *ctrl)
1545 {
1546         unsigned int nr_io_queues;
1547         int ret;
1548
1549         nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
1550         ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1551         if (ret)
1552                 return ret;
1553
1554         ctrl->queue_count = nr_io_queues + 1;
1555         if (ctrl->queue_count < 2)
1556                 return 0;
1557
1558         dev_info(ctrl->device,
1559                 "creating %d I/O queues.\n", nr_io_queues);
1560
1561         return nvme_tcp_alloc_io_queues(ctrl);
1562 }
1563
1564 static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
1565 {
1566         nvme_tcp_stop_io_queues(ctrl);
1567         if (remove) {
1568                 blk_cleanup_queue(ctrl->connect_q);
1569                 blk_mq_free_tag_set(ctrl->tagset);
1570         }
1571         nvme_tcp_free_io_queues(ctrl);
1572 }
1573
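     /*
      * Allocate and start the I/O queues.  For a new controller this also
      * creates the I/O tag set and the connect_q request queue; on a
      * reset/reconnect the existing tag set is reused and only its number of
      * hardware queues is updated.
      */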
1574 static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
1575 {
1576         int ret;
1577
1578         ret = nvme_alloc_io_queues(ctrl);
1579         if (ret)
1580                 return ret;
1581
1582         if (new) {
1583                 ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
1584                 if (IS_ERR(ctrl->tagset)) {
1585                         ret = PTR_ERR(ctrl->tagset);
1586                         goto out_free_io_queues;
1587                 }
1588
1589                 ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
1590                 if (IS_ERR(ctrl->connect_q)) {
1591                         ret = PTR_ERR(ctrl->connect_q);
1592                         goto out_free_tag_set;
1593                 }
1594         } else {
1595                 blk_mq_update_nr_hw_queues(ctrl->tagset,
1596                         ctrl->queue_count - 1);
1597         }
1598
1599         ret = nvme_tcp_start_io_queues(ctrl);
1600         if (ret)
1601                 goto out_cleanup_connect_q;
1602
1603         return 0;
1604
1605 out_cleanup_connect_q:
1606         if (new)
1607                 blk_cleanup_queue(ctrl->connect_q);
1608 out_free_tag_set:
1609         if (new)
1610                 blk_mq_free_tag_set(ctrl->tagset);
1611 out_free_io_queues:
1612         nvme_tcp_free_io_queues(ctrl);
1613         return ret;
1614 }
1615
1616 static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
1617 {
1618         nvme_tcp_stop_queue(ctrl, 0);
1619         if (remove) {
1620                 blk_cleanup_queue(ctrl->admin_q);
1621                 blk_mq_free_tag_set(ctrl->admin_tagset);
1622         }
1623         nvme_tcp_free_admin_queue(ctrl);
1624 }
1625
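     /*
      * Bring up the admin queue: allocate it (plus the admin tag set and
      * request queue for a new controller), start it, read the CAP register
      * to clamp sqsize, then enable the controller and run the identify
      * sequence.
      */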
1626 static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
1627 {
1628         int error;
1629
1630         error = nvme_tcp_alloc_admin_queue(ctrl);
1631         if (error)
1632                 return error;
1633
1634         if (new) {
1635                 ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
1636                 if (IS_ERR(ctrl->admin_tagset)) {
1637                         error = PTR_ERR(ctrl->admin_tagset);
1638                         goto out_free_queue;
1639                 }
1640
1641                 ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
1642                 if (IS_ERR(ctrl->admin_q)) {
1643                         error = PTR_ERR(ctrl->admin_q);
1644                         goto out_free_tagset;
1645                 }
1646         }
1647
1648         error = nvme_tcp_start_queue(ctrl, 0);
1649         if (error)
1650                 goto out_cleanup_queue;
1651
1652         error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
1653         if (error) {
1654                 dev_err(ctrl->device,
1655                         "prop_get NVME_REG_CAP failed\n");
1656                 goto out_stop_queue;
1657         }
1658
1659         ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
1660
1661         error = nvme_enable_ctrl(ctrl, ctrl->cap);
1662         if (error)
1663                 goto out_stop_queue;
1664
1665         error = nvme_init_identify(ctrl);
1666         if (error)
1667                 goto out_stop_queue;
1668
1669         return 0;
1670
1671 out_stop_queue:
1672         nvme_tcp_stop_queue(ctrl, 0);
1673 out_cleanup_queue:
1674         if (new)
1675                 blk_cleanup_queue(ctrl->admin_q);
1676 out_free_tagset:
1677         if (new)
1678                 blk_mq_free_tag_set(ctrl->admin_tagset);
1679 out_free_queue:
1680         nvme_tcp_free_admin_queue(ctrl);
1681         return error;
1682 }
1683
1684 static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
1685                 bool remove)
1686 {
1687         blk_mq_quiesce_queue(ctrl->admin_q);
1688         nvme_tcp_stop_queue(ctrl, 0);
1689         blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl);
1690         blk_mq_unquiesce_queue(ctrl->admin_q);
1691         nvme_tcp_destroy_admin_queue(ctrl, remove);
1692 }
1693
1694 static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
1695                 bool remove)
1696 {
1697         if (ctrl->queue_count <= 1)
1698                 return;
1699         nvme_stop_queues(ctrl);
1700         nvme_tcp_stop_io_queues(ctrl);
1701         blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl);
1702         if (remove)
1703                 nvme_start_queues(ctrl);
1704         nvme_tcp_destroy_io_queues(ctrl, remove);
1705 }
1706
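     /*
      * After a failed connect attempt or error recovery, either schedule
      * another reconnect in opts->reconnect_delay seconds (while
      * nvmf_should_reconnect() still allows it) or delete the controller.
      */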
1707 static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
1708 {
1709         /* If we are resetting/deleting then do nothing */
1710         if (ctrl->state != NVME_CTRL_CONNECTING) {
1711                 WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
1712                         ctrl->state == NVME_CTRL_LIVE);
1713                 return;
1714         }
1715
1716         if (nvmf_should_reconnect(ctrl)) {
1717                 dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
1718                         ctrl->opts->reconnect_delay);
1719                 queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
1720                                 ctrl->opts->reconnect_delay * HZ);
1721         } else {
1722                 dev_info(ctrl->device, "Removing controller...\n");
1723                 nvme_delete_ctrl(ctrl);
1724         }
1725 }
1726
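     /*
      * Full controller bring-up, shared by the create, reset and reconnect
      * paths: configure the admin queue, validate the negotiated parameters
      * (icdoff, queue_size, maxcmd), configure the I/O queues and move the
      * controller to the LIVE state.
      */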
1727 static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
1728 {
1729         struct nvmf_ctrl_options *opts = ctrl->opts;
1730         int ret = -EINVAL;
1731
1732         ret = nvme_tcp_configure_admin_queue(ctrl, new);
1733         if (ret)
1734                 return ret;
1735
1736         if (ctrl->icdoff) {
1737                 dev_err(ctrl->device, "icdoff is not supported!\n");
1738                 goto destroy_admin;
1739         }
1740
1741         if (opts->queue_size > ctrl->sqsize + 1)
1742                 dev_warn(ctrl->device,
1743                         "queue_size %zu > ctrl sqsize %u, clamping down\n",
1744                         opts->queue_size, ctrl->sqsize + 1);
1745
1746         if (ctrl->sqsize + 1 > ctrl->maxcmd) {
1747                 dev_warn(ctrl->device,
1748                         "sqsize %u > ctrl maxcmd %u, clamping down\n",
1749                         ctrl->sqsize + 1, ctrl->maxcmd);
1750                 ctrl->sqsize = ctrl->maxcmd - 1;
1751         }
1752
1753         if (ctrl->queue_count > 1) {
1754                 ret = nvme_tcp_configure_io_queues(ctrl, new);
1755                 if (ret)
1756                         goto destroy_admin;
1757         }
1758
1759         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
1760                 /* state change failure is ok if we're in DELETING state */
1761                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1762                 ret = -EINVAL;
1763                 goto destroy_io;
1764         }
1765
1766         nvme_start_ctrl(ctrl);
1767         return 0;
1768
1769 destroy_io:
1770         if (ctrl->queue_count > 1)
1771                 nvme_tcp_destroy_io_queues(ctrl, new);
1772 destroy_admin:
1773         nvme_tcp_stop_queue(ctrl, 0);
1774         nvme_tcp_destroy_admin_queue(ctrl, new);
1775         return ret;
1776 }
1777
1778 static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
1779 {
1780         struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
1781                         struct nvme_tcp_ctrl, connect_work);
1782         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
1783
1784         ++ctrl->nr_reconnects;
1785
1786         if (nvme_tcp_setup_ctrl(ctrl, false))
1787                 goto requeue;
1788
1789         dev_info(ctrl->device, "Successfully reconnected (%d attempts)\n",
1790                         ctrl->nr_reconnects);
1791
1792         ctrl->nr_reconnects = 0;
1793
1794         return;
1795
1796 requeue:
1797         dev_info(ctrl->device, "Failed reconnect attempt %d\n",
1798                         ctrl->nr_reconnects);
1799         nvme_tcp_reconnect_or_remove(ctrl);
1800 }
1801
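     /*
      * Error recovery: stop keep-alive, tear down the I/O and admin queues
      * while cancelling outstanding requests, then transition to CONNECTING
      * and let nvme_tcp_reconnect_or_remove() decide whether to reconnect or
      * remove the controller.
      */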
1802 static void nvme_tcp_error_recovery_work(struct work_struct *work)
1803 {
1804         struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
1805                                 struct nvme_tcp_ctrl, err_work);
1806         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
1807
1808         nvme_stop_keep_alive(ctrl);
1809         nvme_tcp_teardown_io_queues(ctrl, false);
1810         /* unquiesce to fast-fail any pending requests */
1811         nvme_start_queues(ctrl);
1812         nvme_tcp_teardown_admin_queue(ctrl, false);
1813
1814         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
1815                 /* state change failure is ok if we're in DELETING state */
1816                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1817                 return;
1818         }
1819
1820         nvme_tcp_reconnect_or_remove(ctrl);
1821 }
1822
1823 static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
1824 {
1825         cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
1826         cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
1827
1828         nvme_tcp_teardown_io_queues(ctrl, shutdown);
1829         if (shutdown)
1830                 nvme_shutdown_ctrl(ctrl);
1831         else
1832                 nvme_disable_ctrl(ctrl, ctrl->cap);
1833         nvme_tcp_teardown_admin_queue(ctrl, shutdown);
1834 }
1835
1836 static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
1837 {
1838         nvme_tcp_teardown_ctrl(ctrl, true);
1839 }
1840
1841 static void nvme_reset_ctrl_work(struct work_struct *work)
1842 {
1843         struct nvme_ctrl *ctrl =
1844                 container_of(work, struct nvme_ctrl, reset_work);
1845
1846         nvme_stop_ctrl(ctrl);
1847         nvme_tcp_teardown_ctrl(ctrl, false);
1848
1849         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
1850                 /* state change failure is ok if we're in DELETING state */
1851                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1852                 return;
1853         }
1854
1855         if (nvme_tcp_setup_ctrl(ctrl, false))
1856                 goto out_fail;
1857
1858         return;
1859
1860 out_fail:
1861         ++ctrl->nr_reconnects;
1862         nvme_tcp_reconnect_or_remove(ctrl);
1863 }
1864
1865 static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
1866 {
1867         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1868
1869         if (list_empty(&ctrl->list))
1870                 goto free_ctrl;
1871
1872         mutex_lock(&nvme_tcp_ctrl_mutex);
1873         list_del(&ctrl->list);
1874         mutex_unlock(&nvme_tcp_ctrl_mutex);
1875
1876         nvmf_free_options(nctrl->opts);
1877 free_ctrl:
1878         kfree(ctrl->queues);
1879         kfree(ctrl);
1880 }
1881
1882 static void nvme_tcp_set_sg_null(struct nvme_command *c)
1883 {
1884         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1885
1886         sg->addr = 0;
1887         sg->length = 0;
1888         sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
1889                         NVME_SGL_FMT_TRANSPORT_A;
1890 }
1891
1892 static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
1893                 struct nvme_command *c, u32 data_len)
1894 {
1895         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1896
1897         sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
1898         sg->length = cpu_to_le32(data_len);
1899         sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
1900 }
1901
1902 static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
1903                 u32 data_len)
1904 {
1905         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1906
1907         sg->addr = 0;
1908         sg->length = cpu_to_le32(data_len);
1909         sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
1910                         NVME_SGL_FMT_TRANSPORT_A;
1911 }
1912
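     /*
      * Submit the Asynchronous Event Request on the admin queue using the
      * preallocated async_req.  The command carries no data, so the PDU
      * consists of the command capsule plus an optional header digest.
      */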
1913 static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
1914 {
1915         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
1916         struct nvme_tcp_queue *queue = &ctrl->queues[0];
1917         struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
1918         struct nvme_command *cmd = &pdu->cmd;
1919         u8 hdgst = nvme_tcp_hdgst_len(queue);
1920
1921         memset(pdu, 0, sizeof(*pdu));
1922         pdu->hdr.type = nvme_tcp_cmd;
1923         if (queue->hdr_digest)
1924                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
1925         pdu->hdr.hlen = sizeof(*pdu);
1926         pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
1927
1928         cmd->common.opcode = nvme_admin_async_event;
1929         cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
1930         cmd->common.flags |= NVME_CMD_SGL_METABUF;
1931         nvme_tcp_set_sg_null(cmd);
1932
1933         ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
1934         ctrl->async_req.offset = 0;
1935         ctrl->async_req.curr_bio = NULL;
1936         ctrl->async_req.data_len = 0;
1937
1938         nvme_tcp_queue_request(&ctrl->async_req);
1939 }
1940
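     /*
      * Request timeout handler: if the controller is not LIVE, tear down the
      * queues and complete the request (BLK_EH_DONE); otherwise trigger error
      * recovery and let the block layer reset the timer.
      */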
1941 static enum blk_eh_timer_return
1942 nvme_tcp_timeout(struct request *rq, bool reserved)
1943 {
1944         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
1945         struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
1946         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
1947
1948         dev_warn(ctrl->ctrl.device,
1949                 "queue %d: timeout request %#x type %d\n",
1950                 nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
1951
1952         if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
1953                 /*
1954                  * Teardown immediately if the controller times out while starting
1955                  * or if error recovery has already started. All outstanding
1956                  * requests are completed on shutdown, so we return BLK_EH_DONE.
1957                  */
1958                 flush_work(&ctrl->err_work);
1959                 nvme_tcp_teardown_io_queues(&ctrl->ctrl, false);
1960                 nvme_tcp_teardown_admin_queue(&ctrl->ctrl, false);
1961                 return BLK_EH_DONE;
1962         }
1963
1964         dev_warn(ctrl->ctrl.device, "starting error recovery\n");
1965         nvme_tcp_error_recovery(&ctrl->ctrl);
1966
1967         return BLK_EH_RESET_TIMER;
1968 }
1969
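     /*
      * Set up the command SGL: writes that fit within the inline (in-capsule)
      * data size use an offset SGL, everything else uses a transport-specific
      * host data SGL.
      */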
1970 static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
1971                         struct request *rq)
1972 {
1973         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
1974         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
1975         struct nvme_command *c = &pdu->cmd;
1976
1977         c->common.flags |= NVME_CMD_SGL_METABUF;
1978
1979         if (rq_data_dir(rq) == WRITE && req->data_len &&
1980             req->data_len <= nvme_tcp_inline_data_size(queue))
1981                 nvme_tcp_set_sg_inline(queue, c, req->data_len);
1982         else
1983                 nvme_tcp_set_sg_host_data(c, req->data_len);
1984
1985         return 0;
1986 }
1987
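     /*
      * Build the command capsule PDU for a request: initialize the send
      * state, decide whether write data is carried inline with the command,
      * and fill in the PDU header (type, digest flags, header length, data
      * offset and total length) before mapping the data SGL.
      */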
1988 static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
1989                 struct request *rq)
1990 {
1991         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
1992         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
1993         struct nvme_tcp_queue *queue = req->queue;
1994         u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
1995         blk_status_t ret;
1996
1997         ret = nvme_setup_cmd(ns, rq, &pdu->cmd);
1998         if (ret)
1999                 return ret;
2000
2001         req->state = NVME_TCP_SEND_CMD_PDU;
2002         req->offset = 0;
2003         req->data_sent = 0;
2004         req->pdu_len = 0;
2005         req->pdu_sent = 0;
2006         req->data_len = blk_rq_payload_bytes(rq);
2007         req->curr_bio = rq->bio;
2008
2009         if (rq_data_dir(rq) == WRITE &&
2010             req->data_len <= nvme_tcp_inline_data_size(queue))
2011                 req->pdu_len = req->data_len;
2012         else if (req->curr_bio)
2013                 nvme_tcp_init_iter(req, READ);
2014
2015         pdu->hdr.type = nvme_tcp_cmd;
2016         pdu->hdr.flags = 0;
2017         if (queue->hdr_digest)
2018                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
2019         if (queue->data_digest && req->pdu_len) {
2020                 pdu->hdr.flags |= NVME_TCP_F_DDGST;
2021                 ddgst = nvme_tcp_ddgst_len(queue);
2022         }
2023         pdu->hdr.hlen = sizeof(*pdu);
2024         pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
2025         pdu->hdr.plen =
2026                 cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
2027
2028         ret = nvme_tcp_map_data(queue, rq);
2029         if (unlikely(ret)) {
2030                 dev_err(queue->ctrl->ctrl.device,
2031                         "Failed to map data (%d)\n", ret);
2032                 return ret;
2033         }
2034
2035         return 0;
2036 }
2037
2038 static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
2039                 const struct blk_mq_queue_data *bd)
2040 {
2041         struct nvme_ns *ns = hctx->queue->queuedata;
2042         struct nvme_tcp_queue *queue = hctx->driver_data;
2043         struct request *rq = bd->rq;
2044         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2045         bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
2046         blk_status_t ret;
2047
2048         if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2049                 return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
2050
2051         ret = nvme_tcp_setup_cmd_pdu(ns, rq);
2052         if (unlikely(ret))
2053                 return ret;
2054
2055         blk_mq_start_request(rq);
2056
2057         nvme_tcp_queue_request(req);
2058
2059         return BLK_STS_OK;
2060 }
2061
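     /*
      * Map blk-mq hardware contexts to TCP queues.  When nr_write_queues is
      * set, the default (write) and read hctx types get separate queue
      * ranges; otherwise both types share the same nr_io_queues queues.
      */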
2062 static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
2063 {
2064         struct nvme_tcp_ctrl *ctrl = set->driver_data;
2065
2066         set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2067         set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues;
2068         if (ctrl->ctrl.opts->nr_write_queues) {
2069                 /* separate read/write queues */
2070                 set->map[HCTX_TYPE_DEFAULT].nr_queues =
2071                                 ctrl->ctrl.opts->nr_write_queues;
2072                 set->map[HCTX_TYPE_READ].queue_offset =
2073                                 ctrl->ctrl.opts->nr_write_queues;
2074         } else {
2075                 /* mixed read/write queues */
2076                 set->map[HCTX_TYPE_DEFAULT].nr_queues =
2077                                 ctrl->ctrl.opts->nr_io_queues;
2078                 set->map[HCTX_TYPE_READ].queue_offset = 0;
2079         }
2080         blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
2081         blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
2082         return 0;
2083 }
2084
2085 static struct blk_mq_ops nvme_tcp_mq_ops = {
2086         .queue_rq       = nvme_tcp_queue_rq,
2087         .complete       = nvme_complete_rq,
2088         .init_request   = nvme_tcp_init_request,
2089         .exit_request   = nvme_tcp_exit_request,
2090         .init_hctx      = nvme_tcp_init_hctx,
2091         .timeout        = nvme_tcp_timeout,
2092         .map_queues     = nvme_tcp_map_queues,
2093 };
2094
2095 static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
2096         .queue_rq       = nvme_tcp_queue_rq,
2097         .complete       = nvme_complete_rq,
2098         .init_request   = nvme_tcp_init_request,
2099         .exit_request   = nvme_tcp_exit_request,
2100         .init_hctx      = nvme_tcp_init_admin_hctx,
2101         .timeout        = nvme_tcp_timeout,
2102 };
2103
2104 static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
2105         .name                   = "tcp",
2106         .module                 = THIS_MODULE,
2107         .flags                  = NVME_F_FABRICS,
2108         .reg_read32             = nvmf_reg_read32,
2109         .reg_read64             = nvmf_reg_read64,
2110         .reg_write32            = nvmf_reg_write32,
2111         .free_ctrl              = nvme_tcp_free_ctrl,
2112         .submit_async_event     = nvme_tcp_submit_async_event,
2113         .delete_ctrl            = nvme_tcp_delete_ctrl,
2114         .get_address            = nvmf_get_address,
2115 };
2116
2117 static bool
2118 nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
2119 {
2120         struct nvme_tcp_ctrl *ctrl;
2121         bool found = false;
2122
2123         mutex_lock(&nvme_tcp_ctrl_mutex);
2124         list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
2125                 found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2126                 if (found)
2127                         break;
2128         }
2129         mutex_unlock(&nvme_tcp_ctrl_mutex);
2130
2131         return found;
2132 }
2133
2134 static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
2135                 struct nvmf_ctrl_options *opts)
2136 {
2137         struct nvme_tcp_ctrl *ctrl;
2138         int ret;
2139
2140         ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2141         if (!ctrl)
2142                 return ERR_PTR(-ENOMEM);
2143
2144         INIT_LIST_HEAD(&ctrl->list);
2145         ctrl->ctrl.opts = opts;
2146         ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1;
2147         ctrl->ctrl.sqsize = opts->queue_size - 1;
2148         ctrl->ctrl.kato = opts->kato;
2149
2150         INIT_DELAYED_WORK(&ctrl->connect_work,
2151                         nvme_tcp_reconnect_ctrl_work);
2152         INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
2153         INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
2154
2155         if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2156                 opts->trsvcid =
2157                         kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
2158                 if (!opts->trsvcid) {
2159                         ret = -ENOMEM;
2160                         goto out_free_ctrl;
2161                 }
2162                 opts->mask |= NVMF_OPT_TRSVCID;
2163         }
2164
2165         ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2166                         opts->traddr, opts->trsvcid, &ctrl->addr);
2167         if (ret) {
2168                 pr_err("malformed address passed: %s:%s\n",
2169                         opts->traddr, opts->trsvcid);
2170                 goto out_free_ctrl;
2171         }
2172
2173         if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2174                 ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2175                         opts->host_traddr, NULL, &ctrl->src_addr);
2176                 if (ret) {
2177                         pr_err("malformed src address passed: %s\n",
2178                                opts->host_traddr);
2179                         goto out_free_ctrl;
2180                 }
2181         }
2182
2183         if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
2184                 ret = -EALREADY;
2185                 goto out_free_ctrl;
2186         }
2187
2188         ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
2189                                 GFP_KERNEL);
2190         if (!ctrl->queues) {
2191                 ret = -ENOMEM;
2192                 goto out_free_ctrl;
2193         }
2194
2195         ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
2196         if (ret)
2197                 goto out_kfree_queues;
2198
2199         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2200                 WARN_ON_ONCE(1);
2201                 ret = -EINTR;
2202                 goto out_uninit_ctrl;
2203         }
2204
2205         ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
2206         if (ret)
2207                 goto out_uninit_ctrl;
2208
2209         dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
2210                 ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2211
2212         nvme_get_ctrl(&ctrl->ctrl);
2213
2214         mutex_lock(&nvme_tcp_ctrl_mutex);
2215         list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
2216         mutex_unlock(&nvme_tcp_ctrl_mutex);
2217
2218         return &ctrl->ctrl;
2219
2220 out_uninit_ctrl:
2221         nvme_uninit_ctrl(&ctrl->ctrl);
2222         nvme_put_ctrl(&ctrl->ctrl);
2223         if (ret > 0)
2224                 ret = -EIO;
2225         return ERR_PTR(ret);
2226 out_kfree_queues:
2227         kfree(ctrl->queues);
2228 out_free_ctrl:
2229         kfree(ctrl);
2230         return ERR_PTR(ret);
2231 }
2232
2233 static struct nvmf_transport_ops nvme_tcp_transport = {
2234         .name           = "tcp",
2235         .module         = THIS_MODULE,
2236         .required_opts  = NVMF_OPT_TRADDR,
2237         .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2238                           NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2239                           NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
2240                           NVMF_OPT_NR_WRITE_QUEUES,
2241         .create_ctrl    = nvme_tcp_create_ctrl,
2242 };
2243
2244 static int __init nvme_tcp_init_module(void)
2245 {
             int ret;

2246         nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
2247                         WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2248         if (!nvme_tcp_wq)
2249                 return -ENOMEM;
2250
2251         ret = nvmf_register_transport(&nvme_tcp_transport);
             if (ret) {
                     destroy_workqueue(nvme_tcp_wq);
                     return ret;
             }

2252         return 0;
2253 }
2254
2255 static void __exit nvme_tcp_cleanup_module(void)
2256 {
2257         struct nvme_tcp_ctrl *ctrl;
2258
2259         nvmf_unregister_transport(&nvme_tcp_transport);
2260
2261         mutex_lock(&nvme_tcp_ctrl_mutex);
2262         list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
2263                 nvme_delete_ctrl(&ctrl->ctrl);
2264         mutex_unlock(&nvme_tcp_ctrl_mutex);
2265         flush_workqueue(nvme_delete_wq);
2266
2267         destroy_workqueue(nvme_tcp_wq);
2268 }
2269
2270 module_init(nvme_tcp_init_module);
2271 module_exit(nvme_tcp_cleanup_module);
2272
2273 MODULE_LICENSE("GPL v2");