1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVMe over Fabrics TCP target.
4  * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 #include <linux/module.h>
8 #include <linux/init.h>
9 #include <linux/slab.h>
10 #include <linux/err.h>
11 #include <linux/nvme-tcp.h>
12 #include <net/sock.h>
13 #include <net/tcp.h>
14 #include <linux/inet.h>
15 #include <linux/llist.h>
16 #include <crypto/hash.h>
17
18 #include "nvmet.h"
19
20 #define NVMET_TCP_DEF_INLINE_DATA_SIZE  (4 * PAGE_SIZE)
21
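/*
 * Per-invocation work budgets for nvmet_tcp_io_work(): they bound how many
 * receive and send operations are processed before the worker checks whether
 * it should requeue itself, so one busy queue does not starve other queues
 * sharing nvmet_tcp_wq.
 */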
22 #define NVMET_TCP_RECV_BUDGET           8
23 #define NVMET_TCP_SEND_BUDGET           8
24 #define NVMET_TCP_IO_WORK_BUDGET        64
25
26 enum nvmet_tcp_send_state {
27         NVMET_TCP_SEND_DATA_PDU,
28         NVMET_TCP_SEND_DATA,
29         NVMET_TCP_SEND_R2T,
30         NVMET_TCP_SEND_DDGST,
31         NVMET_TCP_SEND_RESPONSE
32 };
33
34 enum nvmet_tcp_recv_state {
35         NVMET_TCP_RECV_PDU,
36         NVMET_TCP_RECV_DATA,
37         NVMET_TCP_RECV_DDGST,
38         NVMET_TCP_RECV_ERR,
39 };
40
41 enum {
42         NVMET_TCP_F_INIT_FAILED = (1 << 0),
43 };
44
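/*
 * Per-command context: the embedded nvmet request, the preallocated PDU
 * buffers used to answer it (C2H data, response capsule, R2T), receive and
 * send progress counters, the kvec used to receive PDU data into the
 * request's scatterlist, and the expected/received data digests.
 */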
45 struct nvmet_tcp_cmd {
46         struct nvmet_tcp_queue          *queue;
47         struct nvmet_req                req;
48
49         struct nvme_tcp_cmd_pdu         *cmd_pdu;
50         struct nvme_tcp_rsp_pdu         *rsp_pdu;
51         struct nvme_tcp_data_pdu        *data_pdu;
52         struct nvme_tcp_r2t_pdu         *r2t_pdu;
53
54         u32                             rbytes_done;
55         u32                             wbytes_done;
56
57         u32                             pdu_len;
58         u32                             pdu_recv;
59         int                             sg_idx;
60         int                             nr_mapped;
61         struct msghdr                   recv_msg;
62         struct kvec                     *iov;
63         u32                             flags;
64
65         struct list_head                entry;
66         struct llist_node               lentry;
67
68         /* send state */
69         u32                             offset;
70         struct scatterlist              *cur_sg;
71         enum nvmet_tcp_send_state       state;
72
73         __le32                          exp_ddgst;
74         __le32                          recv_ddgst;
75 };
76
77 enum nvmet_tcp_queue_state {
78         NVMET_TCP_Q_CONNECTING,
79         NVMET_TCP_Q_LIVE,
80         NVMET_TCP_Q_DISCONNECTING,
81 };
82
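/*
 * Per-connection (queue) context: the socket and its saved callbacks, the
 * command array for this queue, the lists feeding the send side (resp_list
 * is filled locklessly from completion context and spliced into
 * resp_send_list by io_work), the receive state machine and the digest hash
 * requests. All I/O for a queue runs from io_work, queued on queue->cpu.
 */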
83 struct nvmet_tcp_queue {
84         struct socket           *sock;
85         struct nvmet_tcp_port   *port;
86         struct work_struct      io_work;
87         int                     cpu;
88         struct nvmet_cq         nvme_cq;
89         struct nvmet_sq         nvme_sq;
90
91         /* send state */
92         struct nvmet_tcp_cmd    *cmds;
93         unsigned int            nr_cmds;
94         struct list_head        free_list;
95         struct llist_head       resp_list;
96         struct list_head        resp_send_list;
97         int                     send_list_len;
98         struct nvmet_tcp_cmd    *snd_cmd;
99
100         /* recv state */
101         int                     offset;
102         int                     left;
103         enum nvmet_tcp_recv_state rcv_state;
104         struct nvmet_tcp_cmd    *cmd;
105         union nvme_tcp_pdu      pdu;
106
107         /* digest state */
108         bool                    hdr_digest;
109         bool                    data_digest;
110         struct ahash_request    *snd_hash;
111         struct ahash_request    *rcv_hash;
112
113         spinlock_t              state_lock;
114         enum nvmet_tcp_queue_state state;
115
116         struct sockaddr_storage sockaddr;
117         struct sockaddr_storage sockaddr_peer;
118         struct work_struct      release_work;
119
120         int                     idx;
121         struct list_head        queue_list;
122
123         struct nvmet_tcp_cmd    connect;
124
125         struct page_frag_cache  pf_cache;
126
127         void (*data_ready)(struct sock *);
128         void (*state_change)(struct sock *);
129         void (*write_space)(struct sock *);
130 };
131
132 struct nvmet_tcp_port {
133         struct socket           *sock;
134         struct work_struct      accept_work;
135         struct nvmet_port       *nport;
136         struct sockaddr_storage addr;
137         int                     last_cpu;
138         void (*data_ready)(struct sock *);
139 };
140
141 static DEFINE_IDA(nvmet_tcp_queue_ida);
142 static LIST_HEAD(nvmet_tcp_queue_list);
143 static DEFINE_MUTEX(nvmet_tcp_queue_mutex);
144
145 static struct workqueue_struct *nvmet_tcp_wq;
146 static struct nvmet_fabrics_ops nvmet_tcp_ops;
147 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
148 static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
149
150 static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
151                 struct nvmet_tcp_cmd *cmd)
152 {
153         return cmd - queue->cmds;
154 }
155
156 static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
157 {
158         return nvme_is_write(cmd->req.cmd) &&
159                 cmd->rbytes_done < cmd->req.transfer_len;
160 }
161
162 static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
163 {
164         return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status;
165 }
166
167 static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
168 {
169         return !nvme_is_write(cmd->req.cmd) &&
170                 cmd->req.transfer_len > 0 &&
171                 !cmd->req.cqe->status;
172 }
173
174 static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
175 {
176         return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
177                 !cmd->rbytes_done;
178 }
179
180 static inline struct nvmet_tcp_cmd *
181 nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
182 {
183         struct nvmet_tcp_cmd *cmd;
184
185         cmd = list_first_entry_or_null(&queue->free_list,
186                                 struct nvmet_tcp_cmd, entry);
187         if (!cmd)
188                 return NULL;
189         list_del_init(&cmd->entry);
190
191         cmd->rbytes_done = cmd->wbytes_done = 0;
192         cmd->pdu_len = 0;
193         cmd->pdu_recv = 0;
194         cmd->iov = NULL;
195         cmd->flags = 0;
196         return cmd;
197 }
198
199 static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
200 {
201         if (unlikely(cmd == &cmd->queue->connect))
202                 return;
203
204         list_add_tail(&cmd->entry, &cmd->queue->free_list);
205 }
206
207 static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
208 {
209         return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
210 }
211
212 static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
213 {
214         return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
215 }
216
217 static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
218                 void *pdu, size_t len)
219 {
220         struct scatterlist sg;
221
222         sg_init_one(&sg, pdu, len);
223         ahash_request_set_crypt(hash, &sg, pdu + len, len);
224         crypto_ahash_digest(hash);
225 }
226
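/*
 * Verify the header digest of a received PDU: save the CRC32C value the host
 * placed right after the PDU header, recompute the digest over the header in
 * place (nvmet_tcp_hdgst() stores its result at pdu + len, i.e. where the
 * received digest was) and compare the two.
 */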
227 static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
228         void *pdu, size_t len)
229 {
230         struct nvme_tcp_hdr *hdr = pdu;
231         __le32 recv_digest;
232         __le32 exp_digest;
233
234         if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
235                 pr_err("queue %d: header digest enabled but no header digest\n",
236                         queue->idx);
237                 return -EPROTO;
238         }
239
240         recv_digest = *(__le32 *)(pdu + hdr->hlen);
241         nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
242         exp_digest = *(__le32 *)(pdu + hdr->hlen);
243         if (recv_digest != exp_digest) {
244                 pr_err("queue %d: header digest error: recv %#x expected %#x\n",
245                         queue->idx, le32_to_cpu(recv_digest),
246                         le32_to_cpu(exp_digest));
247                 return -EPROTO;
248         }
249
250         return 0;
251 }
252
253 static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
254 {
255         struct nvme_tcp_hdr *hdr = pdu;
256         u8 digest_len = nvmet_tcp_hdgst_len(queue);
257         u32 len;
258
259         len = le32_to_cpu(hdr->plen) - hdr->hlen -
260                 (hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);
261
262         if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
263                 pr_err("queue %d: data digest flag is cleared\n", queue->idx);
264                 return -EPROTO;
265         }
266
267         return 0;
268 }
269
270 static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
271 {
272         struct scatterlist *sg;
273         int i;
274
275         sg = &cmd->req.sg[cmd->sg_idx];
276
277         for (i = 0; i < cmd->nr_mapped; i++)
278                 kunmap(sg_page(&sg[i]));
279 }
280
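/*
 * Build a kvec over the (kmap'ed) scatterlist pages that will hold the data
 * carried by the current PDU, starting at the offset already received for
 * this command, and point the command's receive msghdr at it so
 * sock_recvmsg() copies directly into the request pages.
 */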
281 static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
282 {
283         struct kvec *iov = cmd->iov;
284         struct scatterlist *sg;
285         u32 length, offset, sg_offset;
286
287         length = cmd->pdu_len;
288         cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE);
289         offset = cmd->rbytes_done;
290         cmd->sg_idx = offset / PAGE_SIZE;
291         sg_offset = offset % PAGE_SIZE;
292         sg = &cmd->req.sg[cmd->sg_idx];
293
294         while (length) {
295                 u32 iov_len = min_t(u32, length, sg->length - sg_offset);
296
297                 iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset;
298                 iov->iov_len = iov_len;
299
300                 length -= iov_len;
301                 sg = sg_next(sg);
302                 iov++;
                /* only the first segment can start inside a page */
                sg_offset = 0;
303         }
304
305         iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
306                 cmd->nr_mapped, cmd->pdu_len);
307 }
308
309 static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
310 {
311         queue->rcv_state = NVMET_TCP_RECV_ERR;
312         if (queue->nvme_sq.ctrl)
313                 nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
314         else
315                 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
316 }
317
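/*
 * Parse the command's SGL descriptor. An offset data block descriptor means
 * the host sends the data in-capsule, which is only allowed for writes and
 * must fit in the port's inline_data_size. Allocate the scatterlist for the
 * transfer and, for host-to-controller data, the kvec array that
 * nvmet_tcp_map_pdu_iovec() will fill.
 */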
318 static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
319 {
320         struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
321         u32 len = le32_to_cpu(sgl->length);
322
323         if (!len)
324                 return 0;
325
326         if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
327                           NVME_SGL_FMT_OFFSET)) {
328                 if (!nvme_is_write(cmd->req.cmd))
329                         return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
330
331                 if (len > cmd->req.port->inline_data_size)
332                         return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
333                 cmd->pdu_len = len;
334         }
335         cmd->req.transfer_len += len;
336
337         cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
338         if (!cmd->req.sg)
339                 return NVME_SC_INTERNAL;
340         cmd->cur_sg = cmd->req.sg;
341
342         if (nvmet_tcp_has_data_in(cmd)) {
343                 cmd->iov = kmalloc_array(cmd->req.sg_cnt,
344                                 sizeof(*cmd->iov), GFP_KERNEL);
345                 if (!cmd->iov)
346                         goto err;
347         }
348
349         return 0;
350 err:
351         sgl_free(cmd->req.sg);
352         return NVME_SC_INTERNAL;
353 }
354
355 static void nvmet_tcp_ddgst(struct ahash_request *hash,
356                 struct nvmet_tcp_cmd *cmd)
357 {
358         ahash_request_set_crypt(hash, cmd->req.sg,
359                 (void *)&cmd->exp_ddgst, cmd->req.transfer_len);
360         crypto_ahash_digest(hash);
361 }
362
363 static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
364 {
365         struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
366         struct nvmet_tcp_queue *queue = cmd->queue;
367         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
368         u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);
369
370         cmd->offset = 0;
371         cmd->state = NVMET_TCP_SEND_DATA_PDU;
372
373         pdu->hdr.type = nvme_tcp_c2h_data;
374         pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ?
375                                                 NVME_TCP_F_DATA_SUCCESS : 0);
376         pdu->hdr.hlen = sizeof(*pdu);
377         pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
378         pdu->hdr.plen =
379                 cpu_to_le32(pdu->hdr.hlen + hdgst +
380                                 cmd->req.transfer_len + ddgst);
381         pdu->command_id = cmd->req.cqe->command_id;
382         pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
383         pdu->data_offset = cpu_to_le32(cmd->wbytes_done);
384
385         if (queue->data_digest) {
386                 pdu->hdr.flags |= NVME_TCP_F_DDGST;
387                 nvmet_tcp_ddgst(queue->snd_hash, cmd);
388         }
389
390         if (cmd->queue->hdr_digest) {
391                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
392                 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
393         }
394 }
395
396 static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
397 {
398         struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
399         struct nvmet_tcp_queue *queue = cmd->queue;
400         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
401
402         cmd->offset = 0;
403         cmd->state = NVMET_TCP_SEND_R2T;
404
405         pdu->hdr.type = nvme_tcp_r2t;
406         pdu->hdr.flags = 0;
407         pdu->hdr.hlen = sizeof(*pdu);
408         pdu->hdr.pdo = 0;
409         pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
410
411         pdu->command_id = cmd->req.cmd->common.command_id;
412         pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
413         pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
414         pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
415         if (cmd->queue->hdr_digest) {
416                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
417                 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
418         }
419 }
420
421 static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
422 {
423         struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
424         struct nvmet_tcp_queue *queue = cmd->queue;
425         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
426
427         cmd->offset = 0;
428         cmd->state = NVMET_TCP_SEND_RESPONSE;
429
430         pdu->hdr.type = nvme_tcp_rsp;
431         pdu->hdr.flags = 0;
432         pdu->hdr.hlen = sizeof(*pdu);
433         pdu->hdr.pdo = 0;
434         pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
435         if (cmd->queue->hdr_digest) {
436                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
437                 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
438         }
439 }
440
441 static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
442 {
443         struct llist_node *node;
444
445         node = llist_del_all(&queue->resp_list);
446         if (!node)
447                 return;
448
449         while (node) {
450                 struct nvmet_tcp_cmd *cmd = llist_entry(node,
451                                         struct nvmet_tcp_cmd, lentry);
452
453                 list_add(&cmd->entry, &queue->resp_send_list);
454                 node = node->next;
455                 queue->send_list_len++;
456         }
457 }
458
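/*
 * Pick the next command to transmit: refill resp_send_list from the lockless
 * resp_list when it runs empty, then prepare the first PDU the command needs
 * (C2H data, R2T, or the response capsule).
 */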
459 static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
460 {
461         queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
462                                 struct nvmet_tcp_cmd, entry);
463         if (!queue->snd_cmd) {
464                 nvmet_tcp_process_resp_list(queue);
465                 queue->snd_cmd =
466                         list_first_entry_or_null(&queue->resp_send_list,
467                                         struct nvmet_tcp_cmd, entry);
468                 if (unlikely(!queue->snd_cmd))
469                         return NULL;
470         }
471
472         list_del_init(&queue->snd_cmd->entry);
473         queue->send_list_len--;
474
475         if (nvmet_tcp_need_data_out(queue->snd_cmd))
476                 nvmet_setup_c2h_data_pdu(queue->snd_cmd);
477         else if (nvmet_tcp_need_data_in(queue->snd_cmd))
478                 nvmet_setup_r2t_pdu(queue->snd_cmd);
479         else
480                 nvmet_setup_response_pdu(queue->snd_cmd);
481
482         return queue->snd_cmd;
483 }
484
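/*
 * nvmet completion callback: post the command to the queue's lockless
 * response list and kick io_work to transmit it.
 */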
485 static void nvmet_tcp_queue_response(struct nvmet_req *req)
486 {
487         struct nvmet_tcp_cmd *cmd =
488                 container_of(req, struct nvmet_tcp_cmd, req);
489         struct nvmet_tcp_queue  *queue = cmd->queue;
490
491         llist_add(&cmd->lentry, &queue->resp_list);
492         queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work);
493 }
494
495 static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
496 {
497         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
498         int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
499         int ret;
500
501         ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
502                         offset_in_page(cmd->data_pdu) + cmd->offset,
503                         left, MSG_DONTWAIT | MSG_MORE);
504         if (ret <= 0)
505                 return ret;
506
507         cmd->offset += ret;
508         left -= ret;
509
510         if (left)
511                 return -EAGAIN;
512
513         cmd->state = NVMET_TCP_SEND_DATA;
514         cmd->offset  = 0;
515         return 1;
516 }
517
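/*
 * Transmit the command's data pages with kernel_sendpage(). MSG_MORE is kept
 * set while more data or PDUs will follow on this socket; once all pages are
 * sent, continue with the data digest, the response capsule, or (when SQ
 * head pointer updates are disabled) retire the command right away.
 */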
518 static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
519 {
520         struct nvmet_tcp_queue *queue = cmd->queue;
521         int ret;
522
523         while (cmd->cur_sg) {
524                 struct page *page = sg_page(cmd->cur_sg);
525                 u32 left = cmd->cur_sg->length - cmd->offset;
526                 int flags = MSG_DONTWAIT;
527
528                 if ((!last_in_batch && cmd->queue->send_list_len) ||
529                     cmd->wbytes_done + left < cmd->req.transfer_len ||
530                     queue->data_digest || !queue->nvme_sq.sqhd_disabled)
531                         flags |= MSG_MORE;
532
533                 ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
534                                         left, flags);
535                 if (ret <= 0)
536                         return ret;
537
538                 cmd->offset += ret;
539                 cmd->wbytes_done += ret;
540
541                 /* Done with sg? */
542                 if (cmd->offset == cmd->cur_sg->length) {
543                         cmd->cur_sg = sg_next(cmd->cur_sg);
544                         cmd->offset = 0;
545                 }
546         }
547
548         if (queue->data_digest) {
549                 cmd->state = NVMET_TCP_SEND_DDGST;
550                 cmd->offset = 0;
551         } else {
552                 if (queue->nvme_sq.sqhd_disabled) {
553                         cmd->queue->snd_cmd = NULL;
554                         nvmet_tcp_put_cmd(cmd);
555                 } else {
556                         nvmet_setup_response_pdu(cmd);
557                 }
558         }
559
560         if (queue->nvme_sq.sqhd_disabled) {
561                 kfree(cmd->iov);
562                 sgl_free(cmd->req.sg);
563         }
564
565         return 1;
566
567 }
568
569 static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
570                 bool last_in_batch)
571 {
572         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
573         int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
574         int flags = MSG_DONTWAIT;
575         int ret;
576
577         if (!last_in_batch && cmd->queue->send_list_len)
578                 flags |= MSG_MORE;
579         else
580                 flags |= MSG_EOR;
581
582         ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
583                 offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
584         if (ret <= 0)
585                 return ret;
586         cmd->offset += ret;
587         left -= ret;
588
589         if (left)
590                 return -EAGAIN;
591
592         kfree(cmd->iov);
593         sgl_free(cmd->req.sg);
594         cmd->queue->snd_cmd = NULL;
595         nvmet_tcp_put_cmd(cmd);
596         return 1;
597 }
598
599 static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
600 {
601         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
602         int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
603         int flags = MSG_DONTWAIT;
604         int ret;
605
606         if (!last_in_batch && cmd->queue->send_list_len)
607                 flags |= MSG_MORE;
608         else
609                 flags |= MSG_EOR;
610
611         ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
612                 offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
613         if (ret <= 0)
614                 return ret;
615         cmd->offset += ret;
616         left -= ret;
617
618         if (left)
619                 return -EAGAIN;
620
621         cmd->queue->snd_cmd = NULL;
622         return 1;
623 }
624
625 static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd)
626 {
627         struct nvmet_tcp_queue *queue = cmd->queue;
628         struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
629         struct kvec iov = {
630                 .iov_base = (void *)&cmd->exp_ddgst + cmd->offset,
631                 .iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset
632         };
633         int ret;
634
635         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
636         if (unlikely(ret <= 0))
637                 return ret;
638
639         cmd->offset += ret;
        /* partial send: stay in SEND_DDGST and resume from the new offset */
        if (cmd->offset != NVME_TCP_DIGEST_LENGTH)
                return -EAGAIN;
640
641         if (queue->nvme_sq.sqhd_disabled) {
642                 cmd->queue->snd_cmd = NULL;
643                 nvmet_tcp_put_cmd(cmd);
644         } else {
645                 nvmet_setup_response_pdu(cmd);
646         }
647         return 1;
648 }
649
650 static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
651                 bool last_in_batch)
652 {
653         struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
654         int ret = 0;
655
656         if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
657                 cmd = nvmet_tcp_fetch_cmd(queue);
658                 if (unlikely(!cmd))
659                         return 0;
660         }
661
662         if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
663                 ret = nvmet_try_send_data_pdu(cmd);
664                 if (ret <= 0)
665                         goto done_send;
666         }
667
668         if (cmd->state == NVMET_TCP_SEND_DATA) {
669                 ret = nvmet_try_send_data(cmd, last_in_batch);
670                 if (ret <= 0)
671                         goto done_send;
672         }
673
674         if (cmd->state == NVMET_TCP_SEND_DDGST) {
675                 ret = nvmet_try_send_ddgst(cmd);
676                 if (ret <= 0)
677                         goto done_send;
678         }
679
680         if (cmd->state == NVMET_TCP_SEND_R2T) {
681                 ret = nvmet_try_send_r2t(cmd, last_in_batch);
682                 if (ret <= 0)
683                         goto done_send;
684         }
685
686         if (cmd->state == NVMET_TCP_SEND_RESPONSE)
687                 ret = nvmet_try_send_response(cmd, last_in_batch);
688
689 done_send:
690         if (ret < 0) {
691                 if (ret == -EAGAIN)
692                         return 0;
693                 return ret;
694         }
695
696         return 1;
697 }
698
699 static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
700                 int budget, int *sends)
701 {
702         int i, ret = 0;
703
704         for (i = 0; i < budget; i++) {
705                 ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
706                 if (ret <= 0)
707                         break;
708                 (*sends)++;
709         }
710
711         return ret;
712 }
713
714 static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
715 {
716         queue->offset = 0;
717         queue->left = sizeof(struct nvme_tcp_hdr);
718         queue->cmd = NULL;
719         queue->rcv_state = NVMET_TCP_RECV_PDU;
720 }
721
722 static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
723 {
724         struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
725
726         ahash_request_free(queue->rcv_hash);
727         ahash_request_free(queue->snd_hash);
728         crypto_free_ahash(tfm);
729 }
730
731 static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
732 {
733         struct crypto_ahash *tfm;
734
735         tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
736         if (IS_ERR(tfm))
737                 return PTR_ERR(tfm);
738
739         queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
740         if (!queue->snd_hash)
741                 goto free_tfm;
742         ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
743
744         queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
745         if (!queue->rcv_hash)
746                 goto free_snd_hash;
747         ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
748
749         return 0;
750 free_snd_hash:
751         ahash_request_free(queue->snd_hash);
752 free_tfm:
753         crypto_free_ahash(tfm);
754         return -ENOMEM;
755 }
756
757
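/*
 * Handle the connection initialization request (ICReq): validate it,
 * negotiate header/data digests (allocating the CRC32C hash requests when
 * either is enabled), send back the ICResp and move the queue to LIVE.
 */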
758 static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
759 {
760         struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
761         struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
762         struct msghdr msg = {};
763         struct kvec iov;
764         int ret;
765
766         if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
767                 pr_err("bad nvme-tcp pdu length (%d)\n",
768                         le32_to_cpu(icreq->hdr.plen));
769                 nvmet_tcp_fatal_error(queue);
                return -EPROTO;
770         }
771
772         if (icreq->pfv != NVME_TCP_PFV_1_0) {
773                 pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
774                 return -EPROTO;
775         }
776
777         if (icreq->hpda != 0) {
778                 pr_err("queue %d: unsupported hpda %d\n", queue->idx,
779                         icreq->hpda);
780                 return -EPROTO;
781         }
782
783         queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
784         queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
785         if (queue->hdr_digest || queue->data_digest) {
786                 ret = nvmet_tcp_alloc_crypto(queue);
787                 if (ret)
788                         return ret;
789         }
790
791         memset(icresp, 0, sizeof(*icresp));
792         icresp->hdr.type = nvme_tcp_icresp;
793         icresp->hdr.hlen = sizeof(*icresp);
794         icresp->hdr.pdo = 0;
795         icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
796         icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
797         icresp->maxdata = cpu_to_le32(0xffff); /* FIXME: support r2t */
798         icresp->cpda = 0;
799         if (queue->hdr_digest)
800                 icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
801         if (queue->data_digest)
802                 icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
803
804         iov.iov_base = icresp;
805         iov.iov_len = sizeof(*icresp);
806         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
807         if (ret < 0)
808                 goto free_crypto;
809
810         queue->state = NVMET_TCP_Q_LIVE;
811         nvmet_prepare_receive_pdu(queue);
812         return 0;
813 free_crypto:
814         if (queue->hdr_digest || queue->data_digest)
815                 nvmet_tcp_free_crypto(queue);
816         return ret;
817 }
818
819 static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
820                 struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
821 {
822         size_t data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
823         int ret;
824
825         if (!nvme_is_write(cmd->req.cmd) ||
826             data_len > cmd->req.port->inline_data_size) {
827                 nvmet_prepare_receive_pdu(queue);
828                 return;
829         }
830
831         ret = nvmet_tcp_map_data(cmd);
832         if (unlikely(ret)) {
833                 pr_err("queue %d: failed to map data\n", queue->idx);
834                 nvmet_tcp_fatal_error(queue);
835                 return;
836         }
837
838         queue->rcv_state = NVMET_TCP_RECV_DATA;
839         nvmet_tcp_map_pdu_iovec(cmd);
840         cmd->flags |= NVMET_TCP_F_INIT_FAILED;
841 }
842
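/*
 * Handle a host-to-controller data PDU: look up the command that the
 * transfer tag refers to, verify that the PDU's data offset matches what we
 * have received for that command so far, and switch the receive state
 * machine to pull the payload into the command's pages.
 */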
843 static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
844 {
845         struct nvme_tcp_data_pdu *data = &queue->pdu.data;
846         struct nvmet_tcp_cmd *cmd;
847
        /* reject a transfer tag we did not hand out in nvmet_setup_r2t_pdu() */
        if (unlikely(data->ttag >= queue->nr_cmds)) {
                pr_err("queue %d: received out of bound ttag %u, nr_cmds %u\n",
                        queue->idx, data->ttag, queue->nr_cmds);
                nvmet_tcp_fatal_error(queue);
                return -EPROTO;
        }

848         cmd = &queue->cmds[data->ttag];
849
850         if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
851                 pr_err("ttag %u unexpected data offset %u (expected %u)\n",
852                         data->ttag, le32_to_cpu(data->data_offset),
853                         cmd->rbytes_done);
854                 /* FIXME: use path and transport errors */
855                 nvmet_req_complete(&cmd->req,
856                         NVME_SC_INVALID_FIELD | NVME_SC_DNR);
857                 return -EPROTO;
858         }
859
860         cmd->pdu_len = le32_to_cpu(data->data_length);
861         cmd->pdu_recv = 0;
862         nvmet_tcp_map_pdu_iovec(cmd);
863         queue->cmd = cmd;
864         queue->rcv_state = NVMET_TCP_RECV_DATA;
865
866         return 0;
867 }
868
869 static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
870 {
871         struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
872         struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
873         struct nvmet_req *req;
874         int ret;
875
876         if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
877                 if (hdr->type != nvme_tcp_icreq) {
878                         pr_err("unexpected pdu type (%d) before icreq\n",
879                                 hdr->type);
880                         nvmet_tcp_fatal_error(queue);
881                         return -EPROTO;
882                 }
883                 return nvmet_tcp_handle_icreq(queue);
884         }
885
886         if (hdr->type == nvme_tcp_h2c_data) {
887                 ret = nvmet_tcp_handle_h2c_data_pdu(queue);
888                 if (unlikely(ret))
889                         return ret;
890                 return 0;
891         }
892
893         queue->cmd = nvmet_tcp_get_cmd(queue);
894         if (unlikely(!queue->cmd)) {
895                 /* This should never happen */
896                 pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d\n",
897                         queue->idx, queue->nr_cmds, queue->send_list_len,
898                         nvme_cmd->common.opcode);
899                 nvmet_tcp_fatal_error(queue);
900                 return -ENOMEM;
901         }
902
903         req = &queue->cmd->req;
904         memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
905
906         if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
907                         &queue->nvme_sq, &nvmet_tcp_ops))) {
908                 pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
909                         req->cmd, req->cmd->common.command_id,
910                         req->cmd->common.opcode,
911                         le32_to_cpu(req->cmd->common.dptr.sgl.length));
912
913                 nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
914                 return -EAGAIN;
915         }
916
917         ret = nvmet_tcp_map_data(queue->cmd);
918         if (unlikely(ret)) {
919                 pr_err("queue %d: failed to map data\n", queue->idx);
920                 if (nvmet_tcp_has_inline_data(queue->cmd))
921                         nvmet_tcp_fatal_error(queue);
922                 else
923                         nvmet_req_complete(req, ret);
924                 ret = -EAGAIN;
925                 goto out;
926         }
927
928         if (nvmet_tcp_need_data_in(queue->cmd)) {
929                 if (nvmet_tcp_has_inline_data(queue->cmd)) {
930                         queue->rcv_state = NVMET_TCP_RECV_DATA;
931                         nvmet_tcp_map_pdu_iovec(queue->cmd);
932                         return 0;
933                 }
934                 /* send back R2T */
935                 nvmet_tcp_queue_response(&queue->cmd->req);
936                 goto out;
937         }
938
939         queue->cmd->req.execute(&queue->cmd->req);
940 out:
941         nvmet_prepare_receive_pdu(queue);
942         return ret;
943 }
944
945 static const u8 nvme_tcp_pdu_sizes[] = {
946         [nvme_tcp_icreq]        = sizeof(struct nvme_tcp_icreq_pdu),
947         [nvme_tcp_cmd]          = sizeof(struct nvme_tcp_cmd_pdu),
948         [nvme_tcp_h2c_data]     = sizeof(struct nvme_tcp_data_pdu),
949 };
950
951 static inline u8 nvmet_tcp_pdu_size(u8 type)
952 {
953         size_t idx = type;
954
955         return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
956                 nvme_tcp_pdu_sizes[idx]) ?
957                         nvme_tcp_pdu_sizes[idx] : 0;
958 }
959
960 static inline bool nvmet_tcp_pdu_valid(u8 type)
961 {
962         switch (type) {
963         case nvme_tcp_icreq:
964         case nvme_tcp_cmd:
965         case nvme_tcp_h2c_data:
966                 /* fallthru */
967                 return true;
968         }
969
970         return false;
971 }
972
973 static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
974 {
975         struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
976         int len;
977         struct kvec iov;
978         struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
979
980 recv:
981         iov.iov_base = (void *)&queue->pdu + queue->offset;
982         iov.iov_len = queue->left;
983         len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
984                         iov.iov_len, msg.msg_flags);
985         if (unlikely(len < 0))
986                 return len;
987
988         queue->offset += len;
989         queue->left -= len;
990         if (queue->left)
991                 return -EAGAIN;
992
993         if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
994                 u8 hdgst = nvmet_tcp_hdgst_len(queue);
995
996                 if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
997                         pr_err("unexpected pdu type %d\n", hdr->type);
998                         nvmet_tcp_fatal_error(queue);
999                         return -EIO;
1000                 }
1001
1002                 if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
1003                         pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
1004                         return -EIO;
1005                 }
1006
1007                 queue->left = hdr->hlen - queue->offset + hdgst;
1008                 goto recv;
1009         }
1010
1011         if (queue->hdr_digest &&
1012             nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) {
1013                 nvmet_tcp_fatal_error(queue); /* fatal */
1014                 return -EPROTO;
1015         }
1016
1017         if (queue->data_digest &&
1018             nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
1019                 nvmet_tcp_fatal_error(queue); /* fatal */
1020                 return -EPROTO;
1021         }
1022
1023         return nvmet_tcp_done_recv_pdu(queue);
1024 }
1025
1026 static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
1027 {
1028         struct nvmet_tcp_queue *queue = cmd->queue;
1029
1030         nvmet_tcp_ddgst(queue->rcv_hash, cmd);
1031         queue->offset = 0;
1032         queue->left = NVME_TCP_DIGEST_LENGTH;
1033         queue->rcv_state = NVMET_TCP_RECV_DDGST;
1034 }
1035
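/*
 * Receive PDU payload straight into the command's mapped pages. Once the
 * whole transfer has arrived, either collect the trailing data digest or
 * execute the request, then rearm the queue for the next PDU header.
 */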
1036 static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
1037 {
1038         struct nvmet_tcp_cmd  *cmd = queue->cmd;
1039         int ret;
1040
1041         while (msg_data_left(&cmd->recv_msg)) {
1042                 ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
1043                         cmd->recv_msg.msg_flags);
1044                 if (ret <= 0)
1045                         return ret;
1046
1047                 cmd->pdu_recv += ret;
1048                 cmd->rbytes_done += ret;
1049         }
1050
1051         nvmet_tcp_unmap_pdu_iovec(cmd);
1052
1053         if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
1054             cmd->rbytes_done == cmd->req.transfer_len) {
1055                 if (queue->data_digest) {
1056                         nvmet_tcp_prep_recv_ddgst(cmd);
1057                         return 0;
1058                 }
1059                 cmd->req.execute(&cmd->req);
1060         }
1061
1062         nvmet_prepare_receive_pdu(queue);
1063         return 0;
1064 }
1065
1066 static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
1067 {
1068         struct nvmet_tcp_cmd *cmd = queue->cmd;
1069         int ret;
1070         struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1071         struct kvec iov = {
1072                 .iov_base = (void *)&cmd->recv_ddgst + queue->offset,
1073                 .iov_len = queue->left
1074         };
1075
1076         ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1077                         iov.iov_len, msg.msg_flags);
1078         if (unlikely(ret < 0))
1079                 return ret;
1080
1081         queue->offset += ret;
1082         queue->left -= ret;
1083         if (queue->left)
1084                 return -EAGAIN;
1085
1086         if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
1087                 pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
1088                         queue->idx, cmd->req.cmd->common.command_id,
1089                         queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
1090                         le32_to_cpu(cmd->exp_ddgst));
1091                 nvmet_tcp_finish_cmd(cmd);
1092                 nvmet_tcp_fatal_error(queue);
1093                 ret = -EPROTO;
1094                 goto out;
1095         }
1096
1097         if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
1098             cmd->rbytes_done == cmd->req.transfer_len)
1099                 cmd->req.execute(&cmd->req);
1100         ret = 0;
1101 out:
1102         nvmet_prepare_receive_pdu(queue);
1103         return ret;
1104 }
1105
1106 static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
1107 {
1108         int result = 0;
1109
1110         if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
1111                 return 0;
1112
1113         if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
1114                 result = nvmet_tcp_try_recv_pdu(queue);
1115                 if (result != 0)
1116                         goto done_recv;
1117         }
1118
1119         if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
1120                 result = nvmet_tcp_try_recv_data(queue);
1121                 if (result != 0)
1122                         goto done_recv;
1123         }
1124
1125         if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
1126                 result = nvmet_tcp_try_recv_ddgst(queue);
1127                 if (result != 0)
1128                         goto done_recv;
1129         }
1130
1131 done_recv:
1132         if (result < 0) {
1133                 if (result == -EAGAIN)
1134                         return 0;
1135                 return result;
1136         }
1137         return 1;
1138 }
1139
1140 static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
1141                 int budget, int *recvs)
1142 {
1143         int i, ret = 0;
1144
1145         for (i = 0; i < budget; i++) {
1146                 ret = nvmet_tcp_try_recv_one(queue);
1147                 if (ret <= 0)
1148                         break;
1149                 (*recvs)++;
1150         }
1151
1152         return ret;
1153 }
1154
1155 static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
1156 {
1157         spin_lock(&queue->state_lock);
1158         if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
1159                 queue->state = NVMET_TCP_Q_DISCONNECTING;
1160                 schedule_work(&queue->release_work);
1161         }
1162         spin_unlock(&queue->state_lock);
1163 }
1164
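/*
 * Main per-queue worker: alternate receive and send processing, each bounded
 * by its budget, until nothing is pending or the overall
 * NVMET_TCP_IO_WORK_BUDGET is consumed, then requeue if work remains.
 * -EPIPE/-ECONNRESET shut the socket down; any other error is treated as a
 * fatal transport error.
 */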
1165 static void nvmet_tcp_io_work(struct work_struct *w)
1166 {
1167         struct nvmet_tcp_queue *queue =
1168                 container_of(w, struct nvmet_tcp_queue, io_work);
1169         bool pending;
1170         int ret, ops = 0;
1171
1172         do {
1173                 pending = false;
1174
1175                 ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
1176                 if (ret > 0) {
1177                         pending = true;
1178                 } else if (ret < 0) {
1179                         if (ret == -EPIPE || ret == -ECONNRESET)
1180                                 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1181                         else
1182                                 nvmet_tcp_fatal_error(queue);
1183                         return;
1184                 }
1185
1186                 ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
1187                 if (ret > 0) {
1188                         /* transmitted message/data */
1189                         pending = true;
1190                 } else if (ret < 0) {
1191                         if (ret == -EPIPE || ret == -ECONNRESET)
1192                                 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1193                         else
1194                                 nvmet_tcp_fatal_error(queue);
1195                         return;
1196                 }
1197
1198         } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
1199
1200         /*
1201          * We exhausted our budget, requeue ourselves
1202          */
1203         if (pending)
1204                 queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1205 }
1206
1207 static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
1208                 struct nvmet_tcp_cmd *c)
1209 {
1210         u8 hdgst = nvmet_tcp_hdgst_len(queue);
1211
1212         c->queue = queue;
1213         c->req.port = queue->port->nport;
1214
1215         c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
1216                         sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1217         if (!c->cmd_pdu)
1218                 return -ENOMEM;
1219         c->req.cmd = &c->cmd_pdu->cmd;
1220
1221         c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
1222                         sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1223         if (!c->rsp_pdu)
1224                 goto out_free_cmd;
1225         c->req.cqe = &c->rsp_pdu->cqe;
1226
1227         c->data_pdu = page_frag_alloc(&queue->pf_cache,
1228                         sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1229         if (!c->data_pdu)
1230                 goto out_free_rsp;
1231
1232         c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
1233                         sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1234         if (!c->r2t_pdu)
1235                 goto out_free_data;
1236
1237         c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1238
1239         list_add_tail(&c->entry, &queue->free_list);
1240
1241         return 0;
1242 out_free_data:
1243         page_frag_free(c->data_pdu);
1244 out_free_rsp:
1245         page_frag_free(c->rsp_pdu);
1246 out_free_cmd:
1247         page_frag_free(c->cmd_pdu);
1248         return -ENOMEM;
1249 }
1250
1251 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
1252 {
1253         page_frag_free(c->r2t_pdu);
1254         page_frag_free(c->data_pdu);
1255         page_frag_free(c->rsp_pdu);
1256         page_frag_free(c->cmd_pdu);
1257 }
1258
1259 static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
1260 {
1261         struct nvmet_tcp_cmd *cmds;
1262         int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;
1263
1264         cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
1265         if (!cmds)
1266                 goto out;
1267
1268         for (i = 0; i < nr_cmds; i++) {
1269                 ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
1270                 if (ret)
1271                         goto out_free;
1272         }
1273
1274         queue->cmds = cmds;
1275
1276         return 0;
1277 out_free:
1278         while (--i >= 0)
1279                 nvmet_tcp_free_cmd(cmds + i);
1280         kfree(cmds);
1281 out:
1282         return ret;
1283 }
1284
1285 static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
1286 {
1287         struct nvmet_tcp_cmd *cmds = queue->cmds;
1288         int i;
1289
1290         for (i = 0; i < queue->nr_cmds; i++)
1291                 nvmet_tcp_free_cmd(cmds + i);
1292
1293         nvmet_tcp_free_cmd(&queue->connect);
1294         kfree(cmds);
1295 }
1296
1297 static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
1298 {
1299         struct socket *sock = queue->sock;
1300
1301         write_lock_bh(&sock->sk->sk_callback_lock);
1302         sock->sk->sk_data_ready =  queue->data_ready;
1303         sock->sk->sk_state_change = queue->state_change;
1304         sock->sk->sk_write_space = queue->write_space;
1305         sock->sk->sk_user_data = NULL;
1306         write_unlock_bh(&sock->sk->sk_callback_lock);
1307 }
1308
1309 static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
1310 {
1311         nvmet_req_uninit(&cmd->req);
1312         nvmet_tcp_unmap_pdu_iovec(cmd);
1313         kfree(cmd->iov);
1314         sgl_free(cmd->req.sg);
1315 }
1316
1317 static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
1318 {
1319         struct nvmet_tcp_cmd *cmd = queue->cmds;
1320         int i;
1321
1322         for (i = 0; i < queue->nr_cmds; i++, cmd++) {
1323                 if (nvmet_tcp_need_data_in(cmd))
1324                         nvmet_tcp_finish_cmd(cmd);
1325         }
1326
1327         if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
1328                 /* failed in connect */
1329                 nvmet_tcp_finish_cmd(&queue->connect);
1330         }
1331 }
1332
1333 static void nvmet_tcp_release_queue_work(struct work_struct *w)
1334 {
1335         struct nvmet_tcp_queue *queue =
1336                 container_of(w, struct nvmet_tcp_queue, release_work);
1337
1338         mutex_lock(&nvmet_tcp_queue_mutex);
1339         list_del_init(&queue->queue_list);
1340         mutex_unlock(&nvmet_tcp_queue_mutex);
1341
1342         nvmet_tcp_restore_socket_callbacks(queue);
1343         flush_work(&queue->io_work);
1344
1345         nvmet_tcp_uninit_data_in_cmds(queue);
1346         nvmet_sq_destroy(&queue->nvme_sq);
1347         cancel_work_sync(&queue->io_work);
1348         sock_release(queue->sock);
1349         nvmet_tcp_free_cmds(queue);
1350         if (queue->hdr_digest || queue->data_digest)
1351                 nvmet_tcp_free_crypto(queue);
1352         ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1353
1354         kfree(queue);
1355 }
1356
1357 static void nvmet_tcp_data_ready(struct sock *sk)
1358 {
1359         struct nvmet_tcp_queue *queue;
1360
1361         read_lock_bh(&sk->sk_callback_lock);
1362         queue = sk->sk_user_data;
1363         if (likely(queue))
1364                 queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1365         read_unlock_bh(&sk->sk_callback_lock);
1366 }
1367
1368 static void nvmet_tcp_write_space(struct sock *sk)
1369 {
1370         struct nvmet_tcp_queue *queue;
1371
1372         read_lock_bh(&sk->sk_callback_lock);
1373         queue = sk->sk_user_data;
1374         if (unlikely(!queue))
1375                 goto out;
1376
1377         if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
1378                 queue->write_space(sk);
1379                 goto out;
1380         }
1381
1382         if (sk_stream_is_writeable(sk)) {
1383                 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1384                 queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1385         }
1386 out:
1387         read_unlock_bh(&sk->sk_callback_lock);
1388 }
1389
1390 static void nvmet_tcp_state_change(struct sock *sk)
1391 {
1392         struct nvmet_tcp_queue *queue;
1393
1394         write_lock_bh(&sk->sk_callback_lock);
1395         queue = sk->sk_user_data;
1396         if (!queue)
1397                 goto done;
1398
1399         switch (sk->sk_state) {
1400         case TCP_FIN_WAIT1:
1401         case TCP_CLOSE_WAIT:
1402         case TCP_CLOSE:
1403                 /* FALLTHRU */
1404                 sk->sk_user_data = NULL;
1405                 nvmet_tcp_schedule_release_queue(queue);
1406                 break;
1407         default:
1408                 pr_warn("queue %d unhandled state %d\n",
1409                         queue->idx, sk->sk_state);
1410         }
1411 done:
1412         write_unlock_bh(&sk->sk_callback_lock);
1413 }
1414
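/*
 * Final socket setup for an accepted queue: record the local and peer
 * addresses, set the socket options described below, and install our
 * data_ready/state_change/write_space callbacks while saving the originals
 * so they can be restored at teardown.
 */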
1415 static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
1416 {
1417         struct socket *sock = queue->sock;
1418         struct inet_sock *inet = inet_sk(sock->sk);
1419         struct linger sol = { .l_onoff = 1, .l_linger = 0 };
1420         int ret;
1421
1422         ret = kernel_getsockname(sock,
1423                 (struct sockaddr *)&queue->sockaddr);
1424         if (ret < 0)
1425                 return ret;
1426
1427         ret = kernel_getpeername(sock,
1428                 (struct sockaddr *)&queue->sockaddr_peer);
1429         if (ret < 0)
1430                 return ret;
1431
1432         /*
1433          * Cleanup whatever is sitting in the TCP transmit queue on socket
1434          * close. This is done to prevent stale data from being sent should
1435          * the network connection be restored before TCP times out.
1436          */
1437         ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
1438                         (char *)&sol, sizeof(sol));
1439         if (ret)
1440                 return ret;
1441
1442         /* Set socket type of service */
1443         if (inet->rcv_tos > 0) {
1444                 int tos = inet->rcv_tos;
1445
1446                 ret = kernel_setsockopt(sock, SOL_IP, IP_TOS,
1447                                 (char *)&tos, sizeof(tos));
1448                 if (ret)
1449                         return ret;
1450         }
1451
1452         write_lock_bh(&sock->sk->sk_callback_lock);
1453         sock->sk->sk_user_data = queue;
1454         queue->data_ready = sock->sk->sk_data_ready;
1455         sock->sk->sk_data_ready = nvmet_tcp_data_ready;
1456         queue->state_change = sock->sk->sk_state_change;
1457         sock->sk->sk_state_change = nvmet_tcp_state_change;
1458         queue->write_space = sock->sk->sk_write_space;
1459         sock->sk->sk_write_space = nvmet_tcp_write_space;
1460         write_unlock_bh(&sock->sk->sk_callback_lock);
1461
1462         return 0;
1463 }
1464
1465 static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
1466                 struct socket *newsock)
1467 {
1468         struct nvmet_tcp_queue *queue;
1469         int ret;
1470
1471         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
1472         if (!queue)
1473                 return -ENOMEM;
1474
1475         INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
1476         INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
1477         queue->sock = newsock;
1478         queue->port = port;
1479         queue->nr_cmds = 0;
1480         spin_lock_init(&queue->state_lock);
1481         queue->state = NVMET_TCP_Q_CONNECTING;
1482         INIT_LIST_HEAD(&queue->free_list);
1483         init_llist_head(&queue->resp_list);
1484         INIT_LIST_HEAD(&queue->resp_send_list);
1485
1486         queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL);
1487         if (queue->idx < 0) {
1488                 ret = queue->idx;
1489                 goto out_free_queue;
1490         }
1491
1492         ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
1493         if (ret)
1494                 goto out_ida_remove;
1495
1496         ret = nvmet_sq_init(&queue->nvme_sq);
1497         if (ret)
1498                 goto out_free_connect;
1499
1500         port->last_cpu = cpumask_next_wrap(port->last_cpu,
1501                                 cpu_online_mask, -1, false);
1502         queue->cpu = port->last_cpu;
1503         nvmet_prepare_receive_pdu(queue);
1504
1505         mutex_lock(&nvmet_tcp_queue_mutex);
1506         list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
1507         mutex_unlock(&nvmet_tcp_queue_mutex);
1508
1509         ret = nvmet_tcp_set_queue_sock(queue);
1510         if (ret)
1511                 goto out_destroy_sq;
1512
1513         queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1514
1515         return 0;
1516 out_destroy_sq:
1517         mutex_lock(&nvmet_tcp_queue_mutex);
1518         list_del_init(&queue->queue_list);
1519         mutex_unlock(&nvmet_tcp_queue_mutex);
1520         nvmet_sq_destroy(&queue->nvme_sq);
1521 out_free_connect:
1522         nvmet_tcp_free_cmd(&queue->connect);
1523 out_ida_remove:
1524         ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1525 out_free_queue:
1526         kfree(queue);
1527         return ret;
1528 }
1529
1530 static void nvmet_tcp_accept_work(struct work_struct *w)
1531 {
1532         struct nvmet_tcp_port *port =
1533                 container_of(w, struct nvmet_tcp_port, accept_work);
1534         struct socket *newsock;
1535         int ret;
1536
1537         while (true) {
1538                 ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
1539                 if (ret < 0) {
1540                         if (ret != -EAGAIN)
1541                                 pr_warn("failed to accept err=%d\n", ret);
1542                         return;
1543                 }
1544                 ret = nvmet_tcp_alloc_queue(port, newsock);
1545                 if (ret) {
1546                         pr_err("failed to allocate queue\n");
1547                         sock_release(newsock);
1548                 }
1549         }
1550 }
1551
1552 static void nvmet_tcp_listen_data_ready(struct sock *sk)
1553 {
1554         struct nvmet_tcp_port *port;
1555
1556         read_lock_bh(&sk->sk_callback_lock);
1557         port = sk->sk_user_data;
1558         if (!port)
1559                 goto out;
1560
1561         if (sk->sk_state == TCP_LISTEN)
1562                 schedule_work(&port->accept_work);
1563 out:
1564         read_unlock_bh(&sk->sk_callback_lock);
1565 }
1566
1567 static int nvmet_tcp_add_port(struct nvmet_port *nport)
1568 {
1569         struct nvmet_tcp_port *port;
1570         __kernel_sa_family_t af;
1571         int opt, ret;
1572
1573         port = kzalloc(sizeof(*port), GFP_KERNEL);
1574         if (!port)
1575                 return -ENOMEM;
1576
1577         switch (nport->disc_addr.adrfam) {
1578         case NVMF_ADDR_FAMILY_IP4:
1579                 af = AF_INET;
1580                 break;
1581         case NVMF_ADDR_FAMILY_IP6:
1582                 af = AF_INET6;
1583                 break;
1584         default:
1585                 pr_err("address family %d not supported\n",
1586                                 nport->disc_addr.adrfam);
1587                 ret = -EINVAL;
1588                 goto err_port;
1589         }
1590
1591         ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
1592                         nport->disc_addr.trsvcid, &port->addr);
1593         if (ret) {
1594                 pr_err("malformed ip/port passed: %s:%s\n",
1595                         nport->disc_addr.traddr, nport->disc_addr.trsvcid);
1596                 goto err_port;
1597         }
1598
1599         port->nport = nport;
1600         port->last_cpu = -1;
1601         INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
1602         if (port->nport->inline_data_size < 0)
1603                 port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;
1604
1605         ret = sock_create(port->addr.ss_family, SOCK_STREAM,
1606                                 IPPROTO_TCP, &port->sock);
1607         if (ret) {
1608                 pr_err("failed to create a socket\n");
1609                 goto err_port;
1610         }
1611
1612         port->sock->sk->sk_user_data = port;
1613         port->data_ready = port->sock->sk->sk_data_ready;
1614         port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
1615
1616         opt = 1;
1617         ret = kernel_setsockopt(port->sock, IPPROTO_TCP,
1618                         TCP_NODELAY, (char *)&opt, sizeof(opt));
1619         if (ret) {
1620                 pr_err("failed to set TCP_NODELAY sock opt %d\n", ret);
1621                 goto err_sock;
1622         }
1623
1624         ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR,
1625                         (char *)&opt, sizeof(opt));
1626         if (ret) {
1627                 pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret);
1628                 goto err_sock;
1629         }
1630
1631         ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
1632                         sizeof(port->addr));
1633         if (ret) {
1634                 pr_err("failed to bind port socket %d\n", ret);
1635                 goto err_sock;
1636         }
1637
1638         ret = kernel_listen(port->sock, 128);
1639         if (ret) {
1640                 pr_err("failed to listen %d on port sock\n", ret);
1641                 goto err_sock;
1642         }
1643
1644         nport->priv = port;
1645         pr_info("enabling port %d (%pISpc)\n",
1646                 le16_to_cpu(nport->disc_addr.portid), &port->addr);
1647
1648         return 0;
1649
1650 err_sock:
1651         sock_release(port->sock);
1652 err_port:
1653         kfree(port);
1654         return ret;
1655 }
1656
1657 static void nvmet_tcp_remove_port(struct nvmet_port *nport)
1658 {
1659         struct nvmet_tcp_port *port = nport->priv;
1660
1661         write_lock_bh(&port->sock->sk->sk_callback_lock);
1662         port->sock->sk->sk_data_ready = port->data_ready;
1663         port->sock->sk->sk_user_data = NULL;
1664         write_unlock_bh(&port->sock->sk->sk_callback_lock);
1665         cancel_work_sync(&port->accept_work);
1666
1667         sock_release(port->sock);
1668         kfree(port);
1669 }
1670
1671 static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl)
1672 {
1673         struct nvmet_tcp_queue *queue;
1674
1675         mutex_lock(&nvmet_tcp_queue_mutex);
1676         list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1677                 if (queue->nvme_sq.ctrl == ctrl)
1678                         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1679         mutex_unlock(&nvmet_tcp_queue_mutex);
1680 }
1681
1682 static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
1683 {
1684         struct nvmet_tcp_queue *queue =
1685                 container_of(sq, struct nvmet_tcp_queue, nvme_sq);
1686
1687         if (sq->qid == 0) {
1688                 /* Let inflight controller teardown complete */
1689                 flush_scheduled_work();
1690         }
1691
1692         queue->nr_cmds = sq->size * 2;
1693         if (nvmet_tcp_alloc_cmds(queue))
1694                 return NVME_SC_INTERNAL;
1695         return 0;
1696 }
1697
1698 static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
1699                 struct nvmet_port *nport, char *traddr)
1700 {
1701         struct nvmet_tcp_port *port = nport->priv;
1702
1703         if (inet_addr_is_any((struct sockaddr *)&port->addr)) {
1704                 struct nvmet_tcp_cmd *cmd =
1705                         container_of(req, struct nvmet_tcp_cmd, req);
1706                 struct nvmet_tcp_queue *queue = cmd->queue;
1707
1708                 sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr);
1709         } else {
1710                 memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
1711         }
1712 }
1713
1714 static struct nvmet_fabrics_ops nvmet_tcp_ops = {
1715         .owner                  = THIS_MODULE,
1716         .type                   = NVMF_TRTYPE_TCP,
1717         .msdbd                  = 1,
1718         .has_keyed_sgls         = 0,
1719         .add_port               = nvmet_tcp_add_port,
1720         .remove_port            = nvmet_tcp_remove_port,
1721         .queue_response         = nvmet_tcp_queue_response,
1722         .delete_ctrl            = nvmet_tcp_delete_ctrl,
1723         .install_queue          = nvmet_tcp_install_queue,
1724         .disc_traddr            = nvmet_tcp_disc_port_addr,
1725 };
1726
1727 static int __init nvmet_tcp_init(void)
1728 {
1729         int ret;
1730
1731         nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0);
1732         if (!nvmet_tcp_wq)
1733                 return -ENOMEM;
1734
1735         ret = nvmet_register_transport(&nvmet_tcp_ops);
1736         if (ret)
1737                 goto err;
1738
1739         return 0;
1740 err:
1741         destroy_workqueue(nvmet_tcp_wq);
1742         return ret;
1743 }
1744
1745 static void __exit nvmet_tcp_exit(void)
1746 {
1747         struct nvmet_tcp_queue *queue;
1748
1749         nvmet_unregister_transport(&nvmet_tcp_ops);
1750
1751         flush_scheduled_work();
1752         mutex_lock(&nvmet_tcp_queue_mutex);
1753         list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1754                 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1755         mutex_unlock(&nvmet_tcp_queue_mutex);
1756         flush_scheduled_work();
1757
1758         destroy_workqueue(nvmet_tcp_wq);
1759 }
1760
1761 module_init(nvmet_tcp_init);
1762 module_exit(nvmet_tcp_exit);
1763
1764 MODULE_LICENSE("GPL v2");
1765 MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */