net/smc: multiple link support for rmb buffer registration
[linux-2.6-microblaze.git] / net / smc / af_smc.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
4  *
5  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
6  *  applies to SOCK_STREAM sockets only
7  *  offers an alternative communication option for TCP-protocol sockets
8  *  applicable with RoCE-cards only
9  *
10  *  Initial restrictions:
11  *    - support for alternate links postponed
12  *
13  *  Copyright IBM Corp. 2016, 2018
14  *
15  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
16  *              based on prototype from Frank Blaschka
17  */
18
19 #define KMSG_COMPONENT "smc"
20 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
21
22 #include <linux/module.h>
23 #include <linux/socket.h>
24 #include <linux/workqueue.h>
25 #include <linux/in.h>
26 #include <linux/sched/signal.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rcupdate_wait.h>
29
30 #include <net/sock.h>
31 #include <net/tcp.h>
32 #include <net/smc.h>
33 #include <asm/ioctls.h>
34
35 #include <net/net_namespace.h>
36 #include <net/netns/generic.h>
37 #include "smc_netns.h"
38
39 #include "smc.h"
40 #include "smc_clc.h"
41 #include "smc_llc.h"
42 #include "smc_cdc.h"
43 #include "smc_core.h"
44 #include "smc_ib.h"
45 #include "smc_ism.h"
46 #include "smc_pnet.h"
47 #include "smc_tx.h"
48 #include "smc_rx.h"
49 #include "smc_close.h"
50
51 static DEFINE_MUTEX(smc_server_lgr_pending);    /* serialize link group
52                                                  * creation on server
53                                                  */
54 static DEFINE_MUTEX(smc_client_lgr_pending);    /* serialize link group
55                                                  * creation on client
56                                                  */
57
58 static void smc_tcp_listen_work(struct work_struct *);
59 static void smc_connect_work(struct work_struct *);
60
61 static void smc_set_keepalive(struct sock *sk, int val)
62 {
63         struct smc_sock *smc = smc_sk(sk);
64
65         smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
66 }
67
/* per-address-family hash tables holding all SMC sockets,
 * each protected by its own rwlock
 */
68 static struct smc_hashinfo smc_v4_hashinfo = {
69         .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
70 };
71
72 static struct smc_hashinfo smc_v6_hashinfo = {
73         .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
74 };
75
/* add sk to the protocol's SMC socket hash list and bump the per-netns
 * protocol "inuse" counter; cannot fail, always returns 0
 */
76 int smc_hash_sk(struct sock *sk)
77 {
78         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
79         struct hlist_head *head;
80
81         head = &h->ht;
82
83         write_lock_bh(&h->lock);
84         sk_add_node(sk, head);
85         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
86         write_unlock_bh(&h->lock);
87
88         return 0;
89 }
90 EXPORT_SYMBOL_GPL(smc_hash_sk);
91
/* remove sk from the SMC socket hash list; the "inuse" counter is only
 * decremented if the socket was actually hashed
 */
92 void smc_unhash_sk(struct sock *sk)
93 {
94         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
95
96         write_lock_bh(&h->lock);
97         if (sk_del_node_init(sk))
98                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
99         write_unlock_bh(&h->lock);
100 }
101 EXPORT_SYMBOL_GPL(smc_unhash_sk);
102
/* proto definitions for the two SMC socket flavors (IPv4 and IPv6).
 * NOTE(review): SLAB_TYPESAFE_BY_RCU presumably enables RCU-safe socket
 * lookup as for TCP — confirm against callers before relying on it.
 */
103 struct proto smc_proto = {
104         .name           = "SMC",
105         .owner          = THIS_MODULE,
106         .keepalive      = smc_set_keepalive,
107         .hash           = smc_hash_sk,
108         .unhash         = smc_unhash_sk,
109         .obj_size       = sizeof(struct smc_sock),
110         .h.smc_hash     = &smc_v4_hashinfo,
111         .slab_flags     = SLAB_TYPESAFE_BY_RCU,
112 };
113 EXPORT_SYMBOL_GPL(smc_proto);
114
115 struct proto smc_proto6 = {
116         .name           = "SMC6",
117         .owner          = THIS_MODULE,
118         .keepalive      = smc_set_keepalive,
119         .hash           = smc_hash_sk,
120         .unhash         = smc_unhash_sk,
121         .obj_size       = sizeof(struct smc_sock),
122         .h.smc_hash     = &smc_v6_hashinfo,
123         .slab_flags     = SLAB_TYPESAFE_BY_RCU,
124 };
125 EXPORT_SYMBOL_GPL(smc_proto6);
126
/* undo the redirection done by smc_switch_to_fallback(): point the
 * socket file's private_data back at the smc socket and detach the
 * file from the internal clcsock
 */
127 static void smc_restore_fallback_changes(struct smc_sock *smc)
128 {
129         smc->clcsock->file->private_data = smc->sk.sk_socket;
130         smc->clcsock->file = NULL;
131 }
132
/* core of socket release; called with the smc sock lock held.
 * Non-fallback sockets go through the active SMC close protocol,
 * fallback sockets just shut down / clean up the internal clcsock.
 * Returns the close result code.
 */
133 static int __smc_release(struct smc_sock *smc)
134 {
135         struct sock *sk = &smc->sk;
136         int rc = 0;
137
138         if (!smc->use_fallback) {
139                 rc = smc_close_active(smc);
140                 sock_set_flag(sk, SOCK_DEAD);
141                 sk->sk_shutdown |= SHUTDOWN_MASK;
142         } else {
143                 if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
144                         sock_put(sk); /* passive closing */
145                 if (sk->sk_state == SMC_LISTEN) {
146                         /* wake up clcsock accept */
147                         rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
148                 }
149                 sk->sk_state = SMC_CLOSED;
150                 sk->sk_state_change(sk);
151                 smc_restore_fallback_changes(smc);
152         }
153
154         sk->sk_prot->unhash(sk);
155
156         if (sk->sk_state == SMC_CLOSED) {
157                 if (smc->clcsock) {
                        /* drop the smc sock lock around the clcsock
                         * release; it is reacquired right after
                         */
158                         release_sock(sk);
159                         smc_clcsock_release(smc);
160                         lock_sock(sk);
161                 }
162                 if (!smc->use_fallback)
163                         smc_conn_free(&smc->conn);
164         }
165
166         return rc;
167 }
168
/* proto_ops.release entry point: abort any dangling non-blocking
 * connect, run __smc_release() under the sock lock, then orphan and
 * drop the socket references
 */
169 static int smc_release(struct socket *sock)
170 {
171         struct sock *sk = sock->sk;
172         struct smc_sock *smc;
173         int rc = 0;
174
175         if (!sk)
176                 goto out;
177
178         sock_hold(sk); /* sock_put below */
179         smc = smc_sk(sk);
180
181         /* cleanup for a dangling non-blocking connect */
182         if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
183                 tcp_abort(smc->clcsock->sk, ECONNABORTED);
184         flush_work(&smc->connect_work);
185
186         if (sk->sk_state == SMC_LISTEN)
187                 /* smc_close_non_accepted() is called and acquires
188                  * sock lock for child sockets again
189                  */
190                 lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
191         else
192                 lock_sock(sk);
193
194         rc = __smc_release(smc);
195
196         /* detach socket */
197         sock_orphan(sk);
198         sock->sk = NULL;
199         release_sock(sk);
200
201         sock_put(sk); /* sock_hold above */
202         sock_put(sk); /* final sock_put */
203 out:
204         return rc;
205 }
206
/* sk_destruct callback: only do refcount debug accounting once the
 * socket is fully closed and marked dead
 */
207 static void smc_destruct(struct sock *sk)
208 {
209         if (sk->sk_state != SMC_CLOSED)
210                 return;
211         if (!sock_flag(sk, SOCK_DEAD))
212                 return;
213
214         sk_refcnt_debug_dec(sk);
215 }
216
/* allocate and initialize a new smc socket; SMCPROTO_SMC6 selects the
 * IPv6 proto, anything else the IPv4 proto. Returns NULL on allocation
 * failure.
 */
217 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
218                                    int protocol)
219 {
220         struct smc_sock *smc;
221         struct proto *prot;
222         struct sock *sk;
223
224         prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
225         sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
226         if (!sk)
227                 return NULL;
228
229         sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
230         sk->sk_state = SMC_INIT;
231         sk->sk_destruct = smc_destruct;
232         sk->sk_protocol = protocol;
233         smc = smc_sk(sk);
234         INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
235         INIT_WORK(&smc->connect_work, smc_connect_work);
236         INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
237         INIT_LIST_HEAD(&smc->accept_q);
238         spin_lock_init(&smc->accept_q_lock);
239         spin_lock_init(&smc->conn.send_lock);
240         sk->sk_prot->hash(sk);
241         sk_refcnt_debug_inc(sk);
242         mutex_init(&smc->clcsock_release_lock);
243
244         return sk;
245 }
246
/* proto_ops.bind: replicate inet_bind()'s sanity checks, then bind the
 * internal CLC (TCP) socket; only allowed while still in SMC_INIT and
 * no non-blocking connect is in flight
 */
247 static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
248                     int addr_len)
249 {
250         struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
251         struct sock *sk = sock->sk;
252         struct smc_sock *smc;
253         int rc;
254
255         smc = smc_sk(sk);
256
257         /* replicate tests from inet_bind(), to be safe wrt. future changes */
258         rc = -EINVAL;
259         if (addr_len < sizeof(struct sockaddr_in))
260                 goto out;
261
262         rc = -EAFNOSUPPORT;
263         if (addr->sin_family != AF_INET &&
264             addr->sin_family != AF_INET6 &&
265             addr->sin_family != AF_UNSPEC)
266                 goto out;
267         /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
268         if (addr->sin_family == AF_UNSPEC &&
269             addr->sin_addr.s_addr != htonl(INADDR_ANY))
270                 goto out;
271
272         lock_sock(sk);
273
274         /* Check if socket is already active */
275         rc = -EINVAL;
276         if (sk->sk_state != SMC_INIT || smc->connect_nonblock)
277                 goto out_rel;
278
279         smc->clcsock->sk->sk_reuse = sk->sk_reuse;
280         rc = kernel_bind(smc->clcsock, uaddr, addr_len);
281
282 out_rel:
283         release_sock(sk);
284 out:
285         return rc;
286 }
287
/* copy socket settings from osk to nsk; of sk_flags, only the bits set
 * in mask are transferred, all other flag bits of nsk are cleared of
 * the masked positions first
 */
288 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
289                                    unsigned long mask)
290 {
291         /* options we don't get control via setsockopt for */
292         nsk->sk_type = osk->sk_type;
293         nsk->sk_sndbuf = osk->sk_sndbuf;
294         nsk->sk_rcvbuf = osk->sk_rcvbuf;
295         nsk->sk_sndtimeo = osk->sk_sndtimeo;
296         nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
297         nsk->sk_mark = osk->sk_mark;
298         nsk->sk_priority = osk->sk_priority;
299         nsk->sk_rcvlowat = osk->sk_rcvlowat;
300         nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
301         nsk->sk_err = osk->sk_err;
302
303         nsk->sk_flags &= ~mask;
304         nsk->sk_flags |= osk->sk_flags & mask;
305 }
306
/* sk_flags bits mirrored from the smc socket down to the internal
 * clc (TCP) socket
 */
307 #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
308                              (1UL << SOCK_KEEPOPEN) | \
309                              (1UL << SOCK_LINGER) | \
310                              (1UL << SOCK_BROADCAST) | \
311                              (1UL << SOCK_TIMESTAMP) | \
312                              (1UL << SOCK_DBG) | \
313                              (1UL << SOCK_RCVTSTAMP) | \
314                              (1UL << SOCK_RCVTSTAMPNS) | \
315                              (1UL << SOCK_LOCALROUTE) | \
316                              (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
317                              (1UL << SOCK_RXQ_OVFL) | \
318                              (1UL << SOCK_WIFI_STATUS) | \
319                              (1UL << SOCK_NOFCS) | \
320                              (1UL << SOCK_FILTER_LOCKED) | \
321                              (1UL << SOCK_TSTAMP_NEW))
322 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
323  * clc socket (since smc is not called for these options from net/core)
324  */
325 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
326 {
327         smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
328 }
329
/* sk_flags bits mirrored back from the clc (TCP) socket to the smc
 * socket; a smaller set than in the other direction
 */
330 #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
331                              (1UL << SOCK_KEEPOPEN) | \
332                              (1UL << SOCK_LINGER) | \
333                              (1UL << SOCK_DBG))
334 /* copy only settings and flags relevant for smc from clc to smc socket */
335 static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
336 {
337         smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
338 }
339
340 /* register the new rmb on all links */
/* Registers rmb_desc on every active link of the link group, then
 * exchanges a CONFIRM_RKEY LLC message with the peer over the given
 * link. On success the buffer is flagged is_conf_rkey. Returns 0 or a
 * negative error (-EFAULT if the rkey confirmation fails).
 */
341 static int smcr_lgr_reg_rmbs(struct smc_link *link,
342                              struct smc_buf_desc *rmb_desc)
343 {
344         struct smc_link_group *lgr = link->lgr;
345         int i, rc = 0;
346
347         for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
348                 if (lgr->lnk[i].state != SMC_LNK_ACTIVE)
349                         continue;
350                 rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc);
351                 if (rc)
352                         goto out;
353         }
354
355         /* exchange confirm_rkey msg with peer */
356         rc = smc_llc_do_confirm_rkey(link, rmb_desc);
357         if (rc) {
358                 rc = -EFAULT;
359                 goto out;
360         }
361         rmb_desc->is_conf_rkey = true;
362 out:
363         return rc;
364 }
365
/* client side of the first-contact LLC handshake: wait for the
 * server's CONFIRM LINK, bring the QP to RTS, register the rmb,
 * respond with CONFIRM LINK, then optionally handle the server's
 * ADD LINK request. Returns 0 or an SMC_CLC_DECL_* reason code.
 */
366 static int smcr_clnt_conf_first_link(struct smc_sock *smc)
367 {
368         struct smc_link *link = smc->conn.lnk;
369         struct smc_llc_qentry *qentry;
370         int rc;
371
372         link->lgr->type = SMC_LGR_SINGLE;
373
374         /* receive CONFIRM LINK request from server over RoCE fabric */
375         qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
376                               SMC_LLC_CONFIRM_LINK);
377         if (!qentry) {
378                 struct smc_clc_msg_decline dclc;
379
380                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
381                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
382                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
383         }
384         rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ);
385         smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
386         if (rc)
387                 return SMC_CLC_DECL_RMBE_EC;
388
389         rc = smc_ib_modify_qp_rts(link);
390         if (rc)
391                 return SMC_CLC_DECL_ERR_RDYLNK;
392
393         smc_wr_remember_qp_attr(link);
394
395         if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
396                 return SMC_CLC_DECL_ERR_REGRMB;
397
398         /* confirm_rkey is implicit on 1st contact */
399         smc->conn.rmb_desc->is_conf_rkey = true;
400
401         /* send CONFIRM LINK response over RoCE fabric */
402         rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
403         if (rc < 0)
404                 return SMC_CLC_DECL_TIMEOUT_CL;
405
406         smc_llc_link_active(link);
407
408         /* optional 2nd link, receive ADD LINK request from server */
409         qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
410                               SMC_LLC_ADD_LINK);
411         if (!qentry) {
412                 struct smc_clc_msg_decline dclc;
413
414                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
415                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
416                 if (rc == -EAGAIN)
417                         rc = 0; /* no DECLINE received, go with one link */
418                 return rc;
419         }
420         smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl);
421         /* tbd: call smc_llc_cli_add_link(link, qentry); */
422         return 0;
423 }
424
/* save SMC-R specific peer parameters from the CLC accept/confirm.
 * NOTE(review): tx_off uses (peer_rmbe_idx - 1), i.e. the RMBE index
 * appears to be 1-based here, unlike the SMC-D variant — confirm
 * against the CLC spec before changing.
 */
425 static void smcr_conn_save_peer_info(struct smc_sock *smc,
426                                      struct smc_clc_msg_accept_confirm *clc)
427 {
428         int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
429
430         smc->conn.peer_rmbe_idx = clc->rmbe_idx;
431         smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
432         smc->conn.peer_rmbe_size = bufsize;
433         atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
434         smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
435 }
436
/* save SMC-D (ISM) specific peer parameters from the CLC
 * accept/confirm; the usable peer buffer size excludes the CDC message
 * header that lives at the start of the DMB
 */
437 static void smcd_conn_save_peer_info(struct smc_sock *smc,
438                                      struct smc_clc_msg_accept_confirm *clc)
439 {
440         int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
441
442         smc->conn.peer_rmbe_idx = clc->dmbe_idx;
443         smc->conn.peer_token = clc->token;
444         /* msg header takes up space in the buffer */
445         smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
446         atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
447         smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
448 }
449
450 static void smc_conn_save_peer_info(struct smc_sock *smc,
451                                     struct smc_clc_msg_accept_confirm *clc)
452 {
453         if (smc->conn.lgr->is_smcd)
454                 smcd_conn_save_peer_info(smc, clc);
455         else
456                 smcr_conn_save_peer_info(smc, clc);
457 }
458
/* save the peer's QP parameters (qpn, gid, mac, psn, mtu) from the CLC
 * accept/confirm into the link
 */
459 static void smc_link_save_peer_info(struct smc_link *link,
460                                     struct smc_clc_msg_accept_confirm *clc)
461 {
462         link->peer_qpn = ntoh24(clc->qpn);
463         memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
464         memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
465         link->peer_psn = ntoh24(clc->psn);
466         link->peer_mtu = clc->qp_mtu;
467 }
468
/* switch this socket to TCP fallback mode: redirect the socket file's
 * private_data to the internal clcsock and take over its fasync list,
 * so subsequent socket calls operate on plain TCP
 */
469 static void smc_switch_to_fallback(struct smc_sock *smc)
470 {
471         smc->use_fallback = true;
472         if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
473                 smc->clcsock->file = smc->sk.sk_socket->file;
474                 smc->clcsock->file->private_data = smc->clcsock;
475                 smc->clcsock->wq.fasync_list =
476                         smc->sk.sk_socket->wq.fasync_list;
477         }
478 }
479
480 /* fall back during connect */
/* switch to TCP fallback, record the reason code, and mark the socket
 * active; always succeeds (returns 0)
 */
481 static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
482 {
483         smc_switch_to_fallback(smc);
484         smc->fallback_rsn = reason_code;
485         smc_copy_sock_settings_to_clc(smc);
486         smc->connect_nonblock = 0;
487         if (smc->sk.sk_state == SMC_INIT)
488                 smc->sk.sk_state = SMC_ACTIVE;
489         return 0;
490 }
491
492 /* decline and fall back during connect */
/* negative reason codes are hard errors where fallback is impossible;
 * positive SMC_CLC_DECL_* codes are sent to the peer (unless the peer
 * already declined) before falling back to TCP
 */
493 static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
494 {
495         int rc;
496
497         if (reason_code < 0) { /* error, fallback is not possible */
498                 if (smc->sk.sk_state == SMC_INIT)
499                         sock_put(&smc->sk); /* passive closing */
500                 return reason_code;
501         }
502         if (reason_code != SMC_CLC_DECL_PEERDECL) {
503                 rc = smc_clc_send_decline(smc, reason_code);
504                 if (rc < 0) {
505                         if (smc->sk.sk_state == SMC_INIT)
506                                 sock_put(&smc->sk); /* passive closing */
507                         return rc;
508                 }
509         }
510         return smc_connect_fallback(smc, reason_code);
511 }
512
513 /* abort connecting */
/* free the connection (and, on first contact, the freshly created link
 * group), release the lgr_pending mutex taken by the caller, and pass
 * the reason code through
 */
514 static int smc_connect_abort(struct smc_sock *smc, int reason_code,
515                              int local_contact)
516 {
        /* fetch before the connection/lgr is freed below */
517         bool is_smcd = smc->conn.lgr->is_smcd;
518
519         if (local_contact == SMC_FIRST_CONTACT)
520                 smc_lgr_cleanup_early(&smc->conn);
521         else
522                 smc_conn_free(&smc->conn);
523         if (is_smcd)
524                 /* there is only one lgr role for SMC-D; use server lock */
525                 mutex_unlock(&smc_server_lgr_pending);
526         else
527                 mutex_unlock(&smc_client_lgr_pending);
528
529         smc->connect_nonblock = 0;
530         return reason_code;
531 }
532
533 /* check if there is a rdma device available for this connection. */
534 /* called for connect and listen */
/* returns 0 if a RoCE device/port was found via the PNET table,
 * SMC_CLC_DECL_NOSMCRDEV otherwise; result is stored in ini->ib_dev
 */
535 static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
536 {
537         /* PNET table look up: search active ib_device and port
538          * within same PNETID that also contains the ethernet device
539          * used for the internal TCP socket
540          */
541         smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
542         if (!ini->ib_dev)
543                 return SMC_CLC_DECL_NOSMCRDEV;
544         return 0;
545 }
546
547 /* check if there is an ISM device available for this connection. */
548 /* called for connect and listen */
/* returns 0 if an ISM device was found, SMC_CLC_DECL_NOSMCDDEV
 * otherwise; result is stored in ini->ism_dev
 */
549 static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
550 {
551         /* Find ISM device with same PNETID as connecting interface  */
552         smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
553         if (!ini->ism_dev)
554                 return SMC_CLC_DECL_NOSMCDDEV;
555         return 0;
556 }
557
558 /* Check for VLAN ID and register it on ISM device just for CLC handshake */
/* no-op for VLAN id 0; returns SMC_CLC_DECL_ISMVLANERR if the ISM
 * device refuses the VLAN registration
 */
559 static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
560                                       struct smc_init_info *ini)
561 {
562         if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev, ini->vlan_id))
563                 return SMC_CLC_DECL_ISMVLANERR;
564         return 0;
565 }
566
567 /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
568  * used, the VLAN ID will be registered again during the connection setup.
569  */
/* no-op unless ISM was used and a non-zero VLAN id was registered;
 * returns SMC_CLC_DECL_CNFERR if the de-registration fails
 */
570 static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
571                                         struct smc_init_info *ini)
572 {
573         if (!is_smcd)
574                 return 0;
575         if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev, ini->vlan_id))
576                 return SMC_CLC_DECL_CNFERR;
577         return 0;
578 }
579
580 /* CLC handshake during connect */
/* send the SMC proposal for the given smc_type and wait for the peer's
 * accept; the accept message is stored in *aclc. Returns 0 or an
 * error/decline code from the CLC layer.
 */
581 static int smc_connect_clc(struct smc_sock *smc, int smc_type,
582                            struct smc_clc_msg_accept_confirm *aclc,
583                            struct smc_init_info *ini)
584 {
585         int rc = 0;
586
587         /* do inband token exchange */
588         rc = smc_clc_send_proposal(smc, smc_type, ini);
589         if (rc)
590                 return rc;
591         /* receive SMC Accept CLC message */
592         return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT,
593                                 CLC_WAIT_TIME);
594 }
595
596 /* setup for RDMA connection of client */
/* create or join a link group under smc_client_lgr_pending, set up
 * buffers and rtokens, confirm to the server and - on first contact -
 * run the LLC CONFIRM LINK handshake. All error paths go through
 * smc_connect_abort(), which also drops the mutex. Returns 0 or a
 * reason/decline code.
 */
597 static int smc_connect_rdma(struct smc_sock *smc,
598                             struct smc_clc_msg_accept_confirm *aclc,
599                             struct smc_init_info *ini)
600 {
601         int i, reason_code = 0;
602         struct smc_link *link;
603
604         ini->is_smcd = false;
605         ini->ib_lcl = &aclc->lcl;
606         ini->ib_clcqpn = ntoh24(aclc->qpn);
607         ini->srv_first_contact = aclc->hdr.flag;
608
609         mutex_lock(&smc_client_lgr_pending);
610         reason_code = smc_conn_create(smc, ini);
611         if (reason_code) {
612                 mutex_unlock(&smc_client_lgr_pending);
613                 return reason_code;
614         }
615
616         smc_conn_save_peer_info(smc, aclc);
617
618         if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
619                 link = smc->conn.lnk;
620         } else {
621                 /* set link that was assigned by server */
622                 link = NULL;
623                 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
624                         struct smc_link *l = &smc->conn.lgr->lnk[i];
625
626                         if (l->peer_qpn == ntoh24(aclc->qpn)) {
627                                 link = l;
628                                 break;
629                         }
630                 }
631                 if (!link)
632                         return smc_connect_abort(smc, SMC_CLC_DECL_NOSRVLINK,
633                                                  ini->cln_first_contact);
634                 smc->conn.lnk = link;
635         }
636
637         /* create send buffer and rmb */
638         if (smc_buf_create(smc, false))
639                 return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
640                                          ini->cln_first_contact);
641
642         if (ini->cln_first_contact == SMC_FIRST_CONTACT)
643                 smc_link_save_peer_info(link, aclc);
644
645         if (smc_rmb_rtoken_handling(&smc->conn, link, aclc))
646                 return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
647                                          ini->cln_first_contact);
648
649         smc_close_init(smc);
650         smc_rx_init(smc);
651
652         if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
653                 if (smc_ib_ready_link(link))
654                         return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
655                                                  ini->cln_first_contact);
656         } else {
                /* reusing an existing lgr: register our rmb on all links */
657                 if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc))
658                         return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
659                                                  ini->cln_first_contact);
660         }
661         smc_rmb_sync_sg_for_device(&smc->conn);
662
663         reason_code = smc_clc_send_confirm(smc);
664         if (reason_code)
665                 return smc_connect_abort(smc, reason_code,
666                                          ini->cln_first_contact);
667
668         smc_tx_init(smc);
669
670         if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
671                 /* QP confirmation over RoCE fabric */
672                 smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
673                 reason_code = smcr_clnt_conf_first_link(smc);
674                 smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
675                 if (reason_code)
676                         return smc_connect_abort(smc, reason_code,
677                                                  ini->cln_first_contact);
678         }
679         mutex_unlock(&smc_client_lgr_pending);
680
681         smc_copy_sock_settings_to_clc(smc);
682         smc->connect_nonblock = 0;
683         if (smc->sk.sk_state == SMC_INIT)
684                 smc->sk.sk_state = SMC_ACTIVE;
685
686         return 0;
687 }
688
689 /* setup for ISM connection of client */
/* create or join an SMC-D link group, set up buffers and confirm the
 * connection to the server; error paths go through smc_connect_abort(),
 * which also drops the server lgr_pending mutex. Returns 0 or a
 * reason/decline code.
 */
690 static int smc_connect_ism(struct smc_sock *smc,
691                            struct smc_clc_msg_accept_confirm *aclc,
692                            struct smc_init_info *ini)
693 {
694         int rc = 0;
695
696         ini->is_smcd = true;
697         ini->ism_gid = aclc->gid;
698         ini->srv_first_contact = aclc->hdr.flag;
699
700         /* there is only one lgr role for SMC-D; use server lock */
701         mutex_lock(&smc_server_lgr_pending);
702         rc = smc_conn_create(smc, ini);
703         if (rc) {
704                 mutex_unlock(&smc_server_lgr_pending);
705                 return rc;
706         }
707
708         /* Create send and receive buffers */
709         if (smc_buf_create(smc, true))
710                 return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
711                                          ini->cln_first_contact);
712
713         smc_conn_save_peer_info(smc, aclc);
714         smc_close_init(smc);
715         smc_rx_init(smc);
716         smc_tx_init(smc);
717
718         rc = smc_clc_send_confirm(smc);
719         if (rc)
720                 return smc_connect_abort(smc, rc, ini->cln_first_contact);
721         mutex_unlock(&smc_server_lgr_pending);
722
723         smc_copy_sock_settings_to_clc(smc);
724         smc->connect_nonblock = 0;
725         if (smc->sk.sk_state == SMC_INIT)
726                 smc->sk.sk_state = SMC_ACTIVE;
727
728         return 0;
729 }
730
731 /* perform steps before actually connecting */
/* decide between SMC-D, SMC-R, or TCP fallback: probe for ISM and RDMA
 * devices, run the CLC handshake, and dispatch to the matching setup
 * routine. Returns 0 on success; failures either fall back to TCP or
 * return the (negative) error after declining.
 */
732 static int __smc_connect(struct smc_sock *smc)
733 {
734         bool ism_supported = false, rdma_supported = false;
735         struct smc_clc_msg_accept_confirm aclc;
736         struct smc_init_info ini = {0};
737         int smc_type;
738         int rc = 0;
739
740         if (smc->use_fallback)
741                 return smc_connect_fallback(smc, smc->fallback_rsn);
742
743         /* if peer has not signalled SMC-capability, fall back */
744         if (!tcp_sk(smc->clcsock->sk)->syn_smc)
745                 return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
746
747         /* IPSec connections opt out of SMC-R optimizations */
748         if (using_ipsec(smc))
749                 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
750
751         /* get vlan id from IP device */
752         if (smc_vlan_by_tcpsk(smc->clcsock, &ini))
753                 return smc_connect_decline_fallback(smc,
754                                                     SMC_CLC_DECL_GETVLANERR);
755
756         /* check if there is an ism device available */
757         if (!smc_find_ism_device(smc, &ini) &&
758             !smc_connect_ism_vlan_setup(smc, &ini)) {
759                 /* ISM is supported for this connection */
760                 ism_supported = true;
761                 smc_type = SMC_TYPE_D;
762         }
763
764         /* check if there is a rdma device available */
765         if (!smc_find_rdma_device(smc, &ini)) {
766                 /* RDMA is supported for this connection */
767                 rdma_supported = true;
768                 if (ism_supported)
769                         smc_type = SMC_TYPE_B; /* both */
770                 else
771                         smc_type = SMC_TYPE_R; /* only RDMA */
772         }
773
774         /* if neither ISM nor RDMA are supported, fallback */
775         if (!rdma_supported && !ism_supported)
776                 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
777
778         /* perform CLC handshake */
779         rc = smc_connect_clc(smc, smc_type, &aclc, &ini);
780         if (rc) {
781                 smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
782                 return smc_connect_decline_fallback(smc, rc);
783         }
784
785         /* depending on previous steps, connect using rdma or ism */
786         if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
787                 rc = smc_connect_rdma(smc, &aclc, &ini);
788         else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
789                 rc = smc_connect_ism(smc, &aclc, &ini);
790         else
791                 rc = SMC_CLC_DECL_MODEUNSUPP;
792         if (rc) {
793                 smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
794                 return smc_connect_decline_fallback(smc, rc);
795         }
796
797         smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
798         return 0;
799 }
800
801 static void smc_connect_work(struct work_struct *work)
802 {
803         struct smc_sock *smc = container_of(work, struct smc_sock,
804                                             connect_work);
805         long timeo = smc->sk.sk_sndtimeo;
806         int rc = 0;
807
808         if (!timeo)
809                 timeo = MAX_SCHEDULE_TIMEOUT;
810         lock_sock(smc->clcsock->sk);
811         if (smc->clcsock->sk->sk_err) {
812                 smc->sk.sk_err = smc->clcsock->sk->sk_err;
813         } else if ((1 << smc->clcsock->sk->sk_state) &
814                                         (TCPF_SYN_SENT | TCP_SYN_RECV)) {
815                 rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
816                 if ((rc == -EPIPE) &&
817                     ((1 << smc->clcsock->sk->sk_state) &
818                                         (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
819                         rc = 0;
820         }
821         release_sock(smc->clcsock->sk);
822         lock_sock(&smc->sk);
823         if (rc != 0 || smc->sk.sk_err) {
824                 smc->sk.sk_state = SMC_CLOSED;
825                 if (rc == -EPIPE || rc == -EAGAIN)
826                         smc->sk.sk_err = EPIPE;
827                 else if (signal_pending(current))
828                         smc->sk.sk_err = -sock_intr_errno(timeo);
829                 sock_put(&smc->sk); /* passive closing */
830                 goto out;
831         }
832
833         rc = __smc_connect(smc);
834         if (rc < 0)
835                 smc->sk.sk_err = -rc;
836
837 out:
838         if (!sock_flag(&smc->sk, SOCK_DEAD)) {
839                 if (smc->sk.sk_err) {
840                         smc->sk.sk_state_change(&smc->sk);
841                 } else { /* allow polling before and after fallback decision */
842                         smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
843                         smc->sk.sk_write_space(&smc->sk);
844                 }
845         }
846         release_sock(&smc->sk);
847 }
848
/* connect the SMC socket; the TCP three-way handshake runs on the internal
 * clcsock, the SMC handshake itself either inline (blocking connect) or
 * deferred to smc_connect_work() for O_NONBLOCK sockets
 */
static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
		goto out_err;

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	/* request SMC capability in the TCP SYN options */
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	if (smc->connect_nonblock) {
		/* a previous non-blocking connect is still in flight */
		rc = -EALREADY;
		goto out;
	}
	rc = kernel_connect(smc->clcsock, addr, alen, flags);
	if (rc && rc != -EINPROGRESS)
		goto out;

	sock_hold(&smc->sk); /* sock put in passive closing */
	if (smc->use_fallback)
		goto out;
	if (flags & O_NONBLOCK) {
		/* finish the SMC handshake asynchronously */
		if (schedule_work(&smc->connect_work))
			smc->connect_nonblock = 1;
		rc = -EINPROGRESS;
	} else {
		rc = __smc_connect(smc);
		if (rc < 0)
			goto out;
		else
			rc = 0; /* success cases including fallback */
	}

out:
	release_sock(sk);
out_err:
	return rc;
}
906
/* accept a connection on the internal clcsock and allocate the smc sock
 * that will represent it; *new_smc is NULL on failure.
 * Called with the listen sock locked; the lock is dropped while blocking
 * in kernel_accept() and re-taken before returning.
 */
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc = -EINVAL;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	/* protect against concurrent release of the clcsock */
	mutex_lock(&lsmc->clcsock_release_lock);
	if (lsmc->clcsock)
		rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	mutex_unlock(&lsmc->clcsock_release_lock);
	lock_sock(lsk);
	if  (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		/* accept failed or listen sock was closed meanwhile -
		 * dispose of the just allocated sock again
		 */
		new_sk->sk_prot->unhash(new_sk);
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}
947
948 /* add a just created sock to the accept queue of the listen sock as
949  * candidate for a following socket accept call from user space
950  */
951 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
952 {
953         struct smc_sock *par = smc_sk(parent);
954
955         sock_hold(sk); /* sock_put in smc_accept_unlink () */
956         spin_lock(&par->accept_q_lock);
957         list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
958         spin_unlock(&par->accept_q_lock);
959         sk_acceptq_added(parent);
960 }
961
962 /* remove a socket from the accept queue of its parental listening socket */
963 static void smc_accept_unlink(struct sock *sk)
964 {
965         struct smc_sock *par = smc_sk(sk)->listen_smc;
966
967         spin_lock(&par->accept_q_lock);
968         list_del_init(&smc_sk(sk)->accept_q);
969         spin_unlock(&par->accept_q_lock);
970         sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
971         sock_put(sk); /* sock_hold in smc_accept_enqueue */
972 }
973
/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space; returns NULL if the queue is
 * empty or contains only closed socks
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			/* sock closed while queued - drop it, try the next */
			new_sk->sk_prot->unhash(new_sk);
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock) {
			/* attach the sock to the user-visible socket */
			sock_graft(new_sk, new_sock);
			if (isk->use_fallback) {
				/* fallback: user space talks to the clcsock */
				smc_sk(new_sk)->clcsock->file = new_sock->file;
				isk->clcsock->file->private_data = isk->clcsock;
			}
		}
		return new_sk;
	}
	return NULL;
}
1007
/* clean up for a created but never accepted sock; drops the reference a
 * queued sock holds in addition to the final one
 */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	sock_hold(sk); /* sock_put below */
	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	__smc_release(smc);
	release_sock(sk);
	sock_put(sk); /* sock_hold above */
	sock_put(sk); /* final sock_put */
}
1023
/* server side: register the rmb and run the CONFIRM LINK handshake for the
 * first link of a new link group; returns 0 or a non-negative CLC decline
 * reason code (negative only if smc_clc_wait_msg() fails)
 */
static int smcr_serv_conf_first_link(struct smc_sock *smc)
{
	struct smc_link *link = smc->conn.lnk;
	struct smc_llc_qentry *qentry;
	int rc;

	/* a first contact starts with a single-link group */
	link->lgr->type = SMC_LGR_SINGLE;

	if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
		return SMC_CLC_DECL_ERR_REGRMB;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TIMEOUT_CL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME,
			      SMC_LLC_CONFIRM_LINK);
	if (!qentry) {
		/* no response - check whether the client declined instead */
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
	}
	rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP);
	smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
	if (rc)
		return SMC_CLC_DECL_RMBE_EC;

	/* confirm_rkey is implicit on 1st contact */
	smc->conn.rmb_desc->is_conf_rkey = true;

	smc_llc_link_active(link);

	/* initial contact - try to establish second link */
	/* tbd: call smc_llc_srv_add_link(link); */
	return 0;
}
1064
/* listen worker: finish - enqueue the new sock at the listen sock or,
 * if listening stopped meanwhile, dispose of it again
 */
static void smc_listen_out(struct smc_sock *new_smc)
{
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct sock *newsmcsk = &new_smc->sk;

	if (lsmc->sk.sk_state == SMC_LISTEN) {
		/* nested lock: worker runs independently of the listen sock */
		lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
		release_sock(&lsmc->sk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
}
1083
1084 /* listen worker: finish in state connected */
1085 static void smc_listen_out_connected(struct smc_sock *new_smc)
1086 {
1087         struct sock *newsmcsk = &new_smc->sk;
1088
1089         sk_refcnt_debug_inc(newsmcsk);
1090         if (newsmcsk->sk_state == SMC_INIT)
1091                 newsmcsk->sk_state = SMC_ACTIVE;
1092
1093         smc_listen_out(new_smc);
1094 }
1095
1096 /* listen worker: finish in error state */
1097 static void smc_listen_out_err(struct smc_sock *new_smc)
1098 {
1099         struct sock *newsmcsk = &new_smc->sk;
1100
1101         if (newsmcsk->sk_state == SMC_INIT)
1102                 sock_put(&new_smc->sk); /* passive closing */
1103         newsmcsk->sk_state = SMC_CLOSED;
1104
1105         smc_listen_out(new_smc);
1106 }
1107
/* listen worker: decline and fall back if possible; a negative reason_code
 * means a fatal error, a positive one a CLC decline reason to send
 */
static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
			       int local_contact)
{
	/* RDMA setup failed, switch back to TCP */
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_cleanup_early(&new_smc->conn)/* first contact created the lgr - tear it down *//* NOTE: comment only */;
	else
		smc_conn_free(&new_smc->conn);
	if (reason_code < 0) { /* error, no fallback possible */
		smc_listen_out_err(new_smc);
		return;
	}
	smc_switch_to_fallback(new_smc);
	new_smc->fallback_rsn = reason_code;
	if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
		/* tell the peer why SMC is being declined */
		if (smc_clc_send_decline(new_smc, reason_code) < 0) {
			smc_listen_out_err(new_smc);
			return;
		}
	}
	smc_listen_out_connected(new_smc);
}
1131
1132 /* listen worker: check prefixes */
1133 static int smc_listen_prfx_check(struct smc_sock *new_smc,
1134                                  struct smc_clc_msg_proposal *pclc)
1135 {
1136         struct smc_clc_msg_proposal_prefix *pclc_prfx;
1137         struct socket *newclcsock = new_smc->clcsock;
1138
1139         pclc_prfx = smc_clc_proposal_get_prefix(pclc);
1140         if (smc_clc_prfx_match(newclcsock, pclc_prfx))
1141                 return SMC_CLC_DECL_DIFFPREFIX;
1142
1143         return 0;
1144 }
1145
1146 /* listen worker: initialize connection and buffers */
1147 static int smc_listen_rdma_init(struct smc_sock *new_smc,
1148                                 struct smc_init_info *ini)
1149 {
1150         int rc;
1151
1152         /* allocate connection / link group */
1153         rc = smc_conn_create(new_smc, ini);
1154         if (rc)
1155                 return rc;
1156
1157         /* create send buffer and rmb */
1158         if (smc_buf_create(new_smc, false))
1159                 return SMC_CLC_DECL_MEM;
1160
1161         return 0;
1162 }
1163
1164 /* listen worker: initialize connection and buffers for SMC-D */
1165 static int smc_listen_ism_init(struct smc_sock *new_smc,
1166                                struct smc_clc_msg_proposal *pclc,
1167                                struct smc_init_info *ini)
1168 {
1169         struct smc_clc_msg_smcd *pclc_smcd;
1170         int rc;
1171
1172         pclc_smcd = smc_get_clc_msg_smcd(pclc);
1173         ini->ism_gid = pclc_smcd->gid;
1174         rc = smc_conn_create(new_smc, ini);
1175         if (rc)
1176                 return rc;
1177
1178         /* Check if peer can be reached via ISM device */
1179         if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
1180                             new_smc->conn.lgr->vlan_id,
1181                             new_smc->conn.lgr->smcd)) {
1182                 if (ini->cln_first_contact == SMC_FIRST_CONTACT)
1183                         smc_lgr_cleanup_early(&new_smc->conn);
1184                 else
1185                         smc_conn_free(&new_smc->conn);
1186                 return SMC_CLC_DECL_SMCDNOTALK;
1187         }
1188
1189         /* Create send and receive buffers */
1190         if (smc_buf_create(new_smc, true)) {
1191                 if (ini->cln_first_contact == SMC_FIRST_CONTACT)
1192                         smc_lgr_cleanup_early(&new_smc->conn);
1193                 else
1194                         smc_conn_free(&new_smc->conn);
1195                 return SMC_CLC_DECL_MEM;
1196         }
1197
1198         return 0;
1199 }
1200
1201 /* listen worker: register buffers */
1202 static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
1203 {
1204         struct smc_connection *conn = &new_smc->conn;
1205
1206         if (local_contact != SMC_FIRST_CONTACT) {
1207                 if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc))
1208                         return SMC_CLC_DECL_ERR_REGRMB;
1209         }
1210         smc_rmb_sync_sg_for_device(&new_smc->conn);
1211
1212         return 0;
1213 }
1214
/* listen worker: finish RDMA setup after the client's CONFIRM message;
 * returns 0 or a decline reason code (and has then already declined)
 */
static int smc_listen_rdma_finish(struct smc_sock *new_smc,
				  struct smc_clc_msg_accept_confirm *cclc,
				  int local_contact)
{
	struct smc_link *link = new_smc->conn.lnk;
	int reason_code = 0;

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, cclc);

	/* save the peer's rmb rtoken for RDMA writes */
	if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) {
		reason_code = SMC_CLC_DECL_ERR_RTOK;
		goto decline;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		/* bring the QP to ready-to-send state */
		if (smc_ib_ready_link(link)) {
			reason_code = SMC_CLC_DECL_ERR_RDYLNK;
			goto decline;
		}
		/* QP confirmation over RoCE fabric */
		smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
		reason_code = smcr_serv_conf_first_link(new_smc);
		smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
		if (reason_code)
			goto decline;
	}
	return 0;

decline:
	smc_listen_decline(new_smc, reason_code, local_contact);
	return reason_code;
}
1249
/* setup for RDMA connection of server; one worker runs per incoming
 * connection and performs the complete server-side CLC handshake
 */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_clc_msg_accept_confirm cclc;
	struct smc_clc_msg_proposal *pclc;
	struct smc_init_info ini = {0};
	bool ism_supported = false;
	u8 buf[SMC_CLC_MAX_LEN];
	int rc = 0;

	if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
		return smc_listen_out_err(new_smc);

	if (new_smc->use_fallback) {
		smc_listen_out_connected(new_smc);
		return;
	}

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		smc_switch_to_fallback(new_smc);
		new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
		smc_listen_out_connected(new_smc);
		return;
	}

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	pclc = (struct smc_clc_msg_proposal *)&buf;
	rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
			      SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
	if (rc)
		goto out_decl;

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		rc = SMC_CLC_DECL_IPSEC;
		goto out_decl;
	}

	/* check for matching IP prefix and subnet length */
	rc = smc_listen_prfx_check(new_smc, pclc);
	if (rc)
		goto out_decl;

	/* get vlan id from IP device */
	if (smc_vlan_by_tcpsk(new_smc->clcsock, &ini)) {
		rc = SMC_CLC_DECL_GETVLANERR;
		goto out_decl;
	}

	/* serialize link group creation on the server side */
	mutex_lock(&smc_server_lgr_pending);
	smc_close_init(new_smc);
	smc_rx_init(new_smc);
	smc_tx_init(new_smc);

	/* check if ISM is available */
	if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) {
		ini.is_smcd = true; /* prepare ISM check */
		rc = smc_find_ism_device(new_smc, &ini);
		if (!rc)
			rc = smc_listen_ism_init(new_smc, pclc, &ini);
		if (!rc)
			ism_supported = true;
		else if (pclc->hdr.path == SMC_TYPE_D)
			goto out_unlock; /* skip RDMA and decline */
	}

	/* check if RDMA is available */
	if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */
		/* prepare RDMA check */
		ini.is_smcd = false;
		ini.ism_dev = NULL;
		ini.ib_lcl = &pclc->lcl;
		rc = smc_find_rdma_device(new_smc, &ini);
		if (rc) {
			/* no RDMA device found */
			if (pclc->hdr.path == SMC_TYPE_B)
				/* neither ISM nor RDMA device found */
				rc = SMC_CLC_DECL_NOSMCDEV;
			goto out_unlock;
		}
		rc = smc_listen_rdma_init(new_smc, &ini);
		if (rc)
			goto out_unlock;
		rc = smc_listen_rdma_reg(new_smc, ini.cln_first_contact);
		if (rc)
			goto out_unlock;
	}

	/* send SMC Accept CLC message */
	rc = smc_clc_send_accept(new_smc, ini.cln_first_contact);
	if (rc)
		goto out_unlock;

	/* SMC-D does not need this lock any more */
	if (ism_supported)
		mutex_unlock(&smc_server_lgr_pending);

	/* receive SMC Confirm CLC message */
	rc = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
			      SMC_CLC_CONFIRM, CLC_WAIT_TIME);
	if (rc) {
		/* for SMC-R the lgr_pending mutex is still held here */
		if (!ism_supported)
			goto out_unlock;
		goto out_decl;
	}

	/* finish worker */
	if (!ism_supported) {
		rc = smc_listen_rdma_finish(new_smc, &cclc,
					    ini.cln_first_contact);
		mutex_unlock(&smc_server_lgr_pending);
		if (rc)
			return;
	}
	smc_conn_save_peer_info(new_smc, &cclc);
	smc_listen_out_connected(new_smc);
	return;

out_unlock:
	mutex_unlock(&smc_server_lgr_pending);
out_decl:
	smc_listen_decline(new_smc, rc, ini.cln_first_contact);
}
1379
/* worker accepting TCP connections on the internal clcsock as long as the
 * SMC sock keeps listening; schedules one smc_listen_work() per child
 */
static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct sock *lsk = &lsmc->sk;
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(lsk);
	while (lsk->sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		/* inherit listen sock settings before handing off */
		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = lsmc->use_fallback;
		new_smc->fallback_rsn = lsmc->fallback_rsn;
		sock_hold(lsk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
		new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
		sock_hold(&new_smc->sk); /* sock_put in passive closing */
		if (!schedule_work(&new_smc->smc_listen_work))
			sock_put(&new_smc->sk);
	}

out:
	release_sock(lsk);
	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}
1413
/* put the SMC socket into listen state; listens on the internal clcsock
 * and starts the tcp_listen_work worker to accept connections
 */
static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;
	if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) ||
	    smc->connect_nonblock)
		goto out;

	rc = 0;
	if (sk->sk_state == SMC_LISTEN) {
		/* already listening - just adjust the backlog */
		sk->sk_max_ack_backlog = backlog;
		goto out;
	}
	/* some socket options are handled in core, so we could not apply
	 * them to the clc socket -- copy smc socket options to clc socket
	 */
	smc_copy_sock_settings_to_clc(smc);
	if (!smc->use_fallback)
		tcp_sk(smc->clcsock->sk)->syn_smc = 1;

	rc = kernel_listen(smc->clcsock, backlog);
	if (rc)
		goto out;
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	sk->sk_state = SMC_LISTEN;
	sock_hold(sk); /* sock_hold in tcp_listen_worker */
	if (!schedule_work(&smc->tcp_listen_work))
		sock_put(sk);

out:
	release_sock(sk);
	return rc;
}
1454
/* accept an SMC connection; waits on the accept queue that is filled by
 * the listen workers, honoring O_NONBLOCK and the receive timeout
 */
static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	sock_hold(sk); /* sock_put below */
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		release_sock(sk);
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			rc = -EAGAIN;
			break;
		}
		/* drop the sock lock while sleeping */
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	if (!rc)
		rc = sock_error(nsk);
	release_sock(sk);
	if (rc)
		goto out;

	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
		/* wait till data arrives on the socket */
		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
								MSEC_PER_SEC);
		if (smc_sk(nsk)->use_fallback) {
			struct sock *clcsk = smc_sk(nsk)->clcsock->sk;

			lock_sock(clcsk);
			if (skb_queue_empty(&clcsk->sk_receive_queue))
				sk_wait_data(clcsk, &timeo, NULL);
			release_sock(clcsk);
		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
			lock_sock(nsk);
			smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
			release_sock(nsk);
		}
	}

out:
	sock_put(sk); /* sock_hold above */
	return rc;
}
1524
1525 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1526                        int peer)
1527 {
1528         struct smc_sock *smc;
1529
1530         if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1531             (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1532                 return -ENOTCONN;
1533
1534         smc = smc_sk(sock->sk);
1535
1536         return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
1537 }
1538
/* send data on an SMC socket; delegates to the clcsock after fallback */
static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_INIT))
		goto out;

	if (msg->msg_flags & MSG_FASTOPEN) {
		/* MSG_FASTOPEN implies connect with data; before the
		 * handshake this forces a fallback to TCP, afterwards
		 * it is rejected
		 */
		if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
			smc_switch_to_fallback(smc);
			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
		} else {
			rc = -EINVAL;
			goto out;
		}
	}

	if (smc->use_fallback)
		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
	else
		rc = smc_tx_sendmsg(smc, msg, len);
out:
	release_sock(sk);
	return rc;
}
1570
/* receive data from an SMC socket; delegates to the clcsock after fallback */
static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		       int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
		/* socket was connected before, no more data to read */
		rc = 0;
		goto out;
	}
	if ((sk->sk_state == SMC_INIT) ||
	    (sk->sk_state == SMC_LISTEN) ||
	    (sk->sk_state == SMC_CLOSED))
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		/* peer has closed - report EOF */
		rc = 0;
		goto out;
	}

	if (smc->use_fallback) {
		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
	} else {
		msg->msg_namelen = 0;
		rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
	}

out:
	release_sock(sk);
	return rc;
}
1606
1607 static __poll_t smc_accept_poll(struct sock *parent)
1608 {
1609         struct smc_sock *isk = smc_sk(parent);
1610         __poll_t mask = 0;
1611
1612         spin_lock(&isk->accept_q_lock);
1613         if (!list_empty(&isk->accept_q))
1614                 mask = EPOLLIN | EPOLLRDNORM;
1615         spin_unlock(&isk->accept_q_lock);
1616
1617         return mask;
1618 }
1619
/* poll an SMC socket; after fallback all events come from the clcsock */
static __poll_t smc_poll(struct file *file, struct socket *sock,
			     poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	__poll_t mask = 0;

	if (!sk)
		return EPOLLNVAL;

	smc = smc_sk(sock->sk);
	if (smc->use_fallback) {
		/* delegate to CLC child sock */
		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
		sk->sk_err = smc->clcsock->sk->sk_err;
	} else {
		if (sk->sk_state != SMC_CLOSED)
			sock_poll_wait(file, sock, wait);
		if (sk->sk_err)
			mask |= EPOLLERR;
		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
		    (sk->sk_state == SMC_CLOSED))
			mask |= EPOLLHUP;
		if (sk->sk_state == SMC_LISTEN) {
			/* woken up by sk_data_ready in smc_listen_work() */
			mask |= smc_accept_poll(sk);
		} else if (smc->use_fallback) { /* as result of connect_work()*/
			/* use_fallback may have been set meanwhile -
			 * re-check and delegate in that case
			 */
			mask |= smc->clcsock->ops->poll(file, smc->clcsock,
							   wait);
			sk->sk_err = smc->clcsock->sk->sk_err;
		} else {
			if ((sk->sk_state != SMC_INIT &&
			     atomic_read(&smc->conn.sndbuf_space)) ||
			    sk->sk_shutdown & SEND_SHUTDOWN) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {
				/* no send space - arm write-space wakeup */
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			}
			if (atomic_read(&smc->conn.bytes_to_rcv))
				mask |= EPOLLIN | EPOLLRDNORM;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
			if (sk->sk_state == SMC_APPCLOSEWAIT1)
				mask |= EPOLLIN;
			if (smc->conn.urg_state == SMC_URG_VALID)
				mask |= EPOLLPRI;
		}
	}

	return mask;
}
1672
/* smc_shutdown() - shut down one or both directions of an SMC socket
 * @sock: socket to shut down
 * @how:  SHUT_RD, SHUT_WR or SHUT_RDWR
 *
 * For fallback sockets the shutdown is delegated entirely to the internal
 * TCP (CLC) socket.  For native SMC sockets the SMC close protocol is run
 * first and the CLC socket is shut down afterwards.
 *
 * Returns 0 on success or a negative error code.
 */
static int smc_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;
	int rc1 = 0;	/* result of shutting down the CLC socket */

	smc = smc_sk(sk);

	if ((how < SHUT_RD) || (how > SHUT_RDWR))
		return rc;

	lock_sock(sk);

	rc = -ENOTCONN;
	/* shutdown only makes sense in states where a connection (or a
	 * partially closed one) still exists
	 */
	if ((sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
		goto out;
	if (smc->use_fallback) {
		/* delegate to the internal TCP socket and mirror its
		 * resulting shutdown state on the SMC socket
		 */
		rc = kernel_sock_shutdown(smc->clcsock, how);
		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
		if (sk->sk_shutdown == SHUTDOWN_MASK)
			sk->sk_state = SMC_CLOSED;
		goto out;
	}
	/* native SMC: drive the SMC-level close protocol first */
	switch (how) {
	case SHUT_RDWR:		/* shutdown in both directions */
		rc = smc_close_active(smc);
		break;
	case SHUT_WR:
		rc = smc_close_shutdown_write(smc);
		break;
	case SHUT_RD:
		rc = 0;
		/* nothing more to do because peer is not involved */
		break;
	}
	if (smc->clcsock)
		rc1 = kernel_sock_shutdown(smc->clcsock, how);
	/* map sock_shutdown_cmd constants to sk_shutdown value range */
	sk->sk_shutdown |= how + 1;

out:
	release_sock(sk);
	return rc ? rc : rc1;	/* SMC-level error takes precedence */
}
1723
/* smc_setsockopt() - set a socket option on an SMC socket
 * @sock:    SMC socket
 * @level:   option level (e.g. SOL_TCP)
 * @optname: option name
 * @optval:  user-space pointer to the option value
 * @optlen:  length of the option value
 *
 * Generic options are always forwarded to the internal CLC (TCP) socket.
 * A few TCP options additionally influence SMC behavior: fastopen-family
 * options force a fallback to TCP (SMC cannot support them), and
 * TCP_NODELAY/TCP_CORK trigger or defer transmission of pending data.
 *
 * NOTE(review): optlen/optval are validated only after the option has
 * already been forwarded to the CLC socket — presumably intentional so
 * the CLC socket sees every option; verify against callers.
 */
static int smc_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int val, rc;

	smc = smc_sk(sk);

	/* generic setsockopts reaching us here always apply to the
	 * CLC socket
	 */
	rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
					   optval, optlen);
	if (smc->clcsock->sk->sk_err) {
		/* propagate a CLC socket error to the SMC socket */
		sk->sk_err = smc->clcsock->sk->sk_err;
		sk->sk_error_report(sk);
	}

	if (optlen < sizeof(int))
		return -EINVAL;
	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	lock_sock(sk);
	/* SMC-specific handling is skipped once fallback is active */
	if (rc || smc->use_fallback)
		goto out;
	switch (optname) {
	case TCP_ULP:
	case TCP_FASTOPEN:
	case TCP_FASTOPEN_CONNECT:
	case TCP_FASTOPEN_KEY:
	case TCP_FASTOPEN_NO_COOKIE:
		/* option not supported by SMC */
		if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
			/* fall back to TCP while that is still possible */
			smc_switch_to_fallback(smc);
			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
		} else {
			rc = -EINVAL;
		}
		break;
	case TCP_NODELAY:
		if (sk->sk_state != SMC_INIT &&
		    sk->sk_state != SMC_LISTEN &&
		    sk->sk_state != SMC_CLOSED) {
			/* enabling NODELAY: push out pending data now */
			if (val)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_CORK:
		if (sk->sk_state != SMC_INIT &&
		    sk->sk_state != SMC_LISTEN &&
		    sk->sk_state != SMC_CLOSED) {
			/* uncorking: push out the data held back so far */
			if (!val)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_DEFER_ACCEPT:
		/* remembered and applied in smc_accept() processing */
		smc->sockopt_defer_accept = val;
		break;
	default:
		break;
	}
out:
	release_sock(sk);

	return rc;
}
1794
1795 static int smc_getsockopt(struct socket *sock, int level, int optname,
1796                           char __user *optval, int __user *optlen)
1797 {
1798         struct smc_sock *smc;
1799
1800         smc = smc_sk(sock->sk);
1801         /* socket options apply to the CLC socket */
1802         return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1803                                              optval, optlen);
1804 }
1805
/* smc_ioctl() - ioctl handler for SMC sockets
 * @sock: SMC socket
 * @cmd:  ioctl command (SIOCINQ, SIOCOUTQ, SIOCOUTQNSD, SIOCATMARK)
 * @arg:  user-space pointer receiving the integer answer
 *
 * For fallback sockets the ioctl is delegated to the CLC (TCP) socket.
 * Otherwise the answer is computed from the SMC connection state and
 * copied to user space.  Returns 0 or a negative error code.
 */
static int smc_ioctl(struct socket *sock, unsigned int cmd,
		     unsigned long arg)
{
	union smc_host_cursor cons, urg;
	struct smc_connection *conn;
	struct smc_sock *smc;
	int answ;

	smc = smc_sk(sock->sk);
	conn = &smc->conn;
	lock_sock(&smc->sk);
	if (smc->use_fallback) {
		if (!smc->clcsock) {
			release_sock(&smc->sk);
			return -EBADF;
		}
		answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
		release_sock(&smc->sk);
		return answ;
	}
	switch (cmd) {
	case SIOCINQ: /* same as FIONREAD */
		/* number of bytes available for reading */
		if (smc->sk.sk_state == SMC_LISTEN) {
			release_sock(&smc->sk);
			return -EINVAL;
		}
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED)
			answ = 0;
		else
			answ = atomic_read(&smc->conn.bytes_to_rcv);
		break;
	case SIOCOUTQ:
		/* output queue size (not send + not acked) */
		if (smc->sk.sk_state == SMC_LISTEN) {
			release_sock(&smc->sk);
			return -EINVAL;
		}
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED)
			answ = 0;
		else
			/* used sndbuf space = total minus free space */
			answ = smc->conn.sndbuf_desc->len -
					atomic_read(&smc->conn.sndbuf_space);
		break;
	case SIOCOUTQNSD:
		/* output queue size (not send only) */
		if (smc->sk.sk_state == SMC_LISTEN) {
			release_sock(&smc->sk);
			return -EINVAL;
		}
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED)
			answ = 0;
		else
			answ = smc_tx_prepared_sends(&smc->conn);
		break;
	case SIOCATMARK:
		/* is the read pointer at the urgent-data mark? */
		if (smc->sk.sk_state == SMC_LISTEN) {
			release_sock(&smc->sk);
			return -EINVAL;
		}
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED) {
			answ = 0;
		} else {
			/* at the mark when the urgent cursor is exactly
			 * one byte ahead of the consumer cursor
			 */
			smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
			smc_curs_copy(&urg, &conn->urg_curs, conn);
			answ = smc_curs_diff(conn->rmb_desc->len,
					     &cons, &urg) == 1;
		}
		break;
	default:
		release_sock(&smc->sk);
		return -ENOIOCTLCMD;
	}
	release_sock(&smc->sk);

	return put_user(answ, (int __user *)arg);
}
1886
1887 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1888                             int offset, size_t size, int flags)
1889 {
1890         struct sock *sk = sock->sk;
1891         struct smc_sock *smc;
1892         int rc = -EPIPE;
1893
1894         smc = smc_sk(sk);
1895         lock_sock(sk);
1896         if (sk->sk_state != SMC_ACTIVE) {
1897                 release_sock(sk);
1898                 goto out;
1899         }
1900         release_sock(sk);
1901         if (smc->use_fallback)
1902                 rc = kernel_sendpage(smc->clcsock, page, offset,
1903                                      size, flags);
1904         else
1905                 rc = sock_no_sendpage(sock, page, offset, size, flags);
1906
1907 out:
1908         return rc;
1909 }
1910
1911 /* Map the affected portions of the rmbe into an spd, note the number of bytes
1912  * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
1913  * updates till whenever a respective page has been fully processed.
1914  * Note that subsequent recv() calls have to wait till all splice() processing
1915  * completed.
1916  */
1917 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1918                                struct pipe_inode_info *pipe, size_t len,
1919                                unsigned int flags)
1920 {
1921         struct sock *sk = sock->sk;
1922         struct smc_sock *smc;
1923         int rc = -ENOTCONN;
1924
1925         smc = smc_sk(sk);
1926         lock_sock(sk);
1927         if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1928                 /* socket was connected before, no more data to read */
1929                 rc = 0;
1930                 goto out;
1931         }
1932         if (sk->sk_state == SMC_INIT ||
1933             sk->sk_state == SMC_LISTEN ||
1934             sk->sk_state == SMC_CLOSED)
1935                 goto out;
1936
1937         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1938                 rc = 0;
1939                 goto out;
1940         }
1941
1942         if (smc->use_fallback) {
1943                 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1944                                                     pipe, len, flags);
1945         } else {
1946                 if (*ppos) {
1947                         rc = -ESPIPE;
1948                         goto out;
1949                 }
1950                 if (flags & SPLICE_F_NONBLOCK)
1951                         flags = MSG_DONTWAIT;
1952                 else
1953                         flags = 0;
1954                 rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
1955         }
1956 out:
1957         release_sock(sk);
1958
1959         return rc;
1960 }
1961
/* proto_ops of the SMC socket family; must look like tcp so that SMC
 * sockets can serve as a drop-in replacement for TCP sockets
 */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,	/* not supported */
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,		/* not supported */
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};
1984
1985 static int smc_create(struct net *net, struct socket *sock, int protocol,
1986                       int kern)
1987 {
1988         int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
1989         struct smc_sock *smc;
1990         struct sock *sk;
1991         int rc;
1992
1993         rc = -ESOCKTNOSUPPORT;
1994         if (sock->type != SOCK_STREAM)
1995                 goto out;
1996
1997         rc = -EPROTONOSUPPORT;
1998         if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
1999                 goto out;
2000
2001         rc = -ENOBUFS;
2002         sock->ops = &smc_sock_ops;
2003         sk = smc_sock_alloc(net, sock, protocol);
2004         if (!sk)
2005                 goto out;
2006
2007         /* create internal TCP socket for CLC handshake and fallback */
2008         smc = smc_sk(sk);
2009         smc->use_fallback = false; /* assume rdma capability first */
2010         smc->fallback_rsn = 0;
2011         rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
2012                               &smc->clcsock);
2013         if (rc) {
2014                 sk_common_release(sk);
2015                 goto out;
2016         }
2017         smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
2018         smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
2019
2020 out:
2021         return rc;
2022 }
2023
/* registration record for the PF_SMC protocol family */
static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};
2029
2030 unsigned int smc_net_id;
2031
/* per-namespace init: set up the SMC pnet table for this namespace */
static __net_init int smc_net_init(struct net *net)
{
	return smc_pnet_net_init(net);
}
2036
/* per-namespace exit: tear down the SMC pnet table of this namespace */
static void __net_exit smc_net_exit(struct net *net)
{
	smc_pnet_net_exit(net);
}
2041
/* pernet operations; .size makes the core allocate a struct smc_net
 * per namespace, reachable via smc_net_id
 */
static struct pernet_operations smc_net_ops = {
	.init = smc_net_init,
	.exit = smc_net_exit,
	.id   = &smc_net_id,
	.size = sizeof(struct smc_net),
};
2048
/* smc_init() - module init: register all SMC subsystems
 *
 * Registers, in order: pernet subsys, pnet tables, core link groups,
 * LLC and CDC layers, the v4/v6 protos, the PF_SMC socket family and
 * the IB client.  On any failure the already-registered pieces are
 * unwound in reverse order via the goto ladder below.
 */
static int __init smc_init(void)
{
	int rc;

	rc = register_pernet_subsys(&smc_net_ops);
	if (rc)
		return rc;

	rc = smc_pnet_init();
	if (rc)
		goto out_pernet_subsys;

	rc = smc_core_init();
	if (rc) {
		pr_err("%s: smc_core_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_core;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_core;
	}

	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
		goto out_core;
	}

	rc = proto_register(&smc_proto6, 1);
	if (rc) {
		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
		goto out_proto;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto6;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	/* let TCP know SMC is available so it can negotiate the option */
	static_branch_enable(&tcp_have_smc);
	return 0;

out_sock:
	sock_unregister(PF_SMC);
out_proto6:
	proto_unregister(&smc_proto6);
out_proto:
	proto_unregister(&smc_proto);
out_core:
	smc_core_exit();
out_pnet:
	smc_pnet_exit();
out_pernet_subsys:
	unregister_pernet_subsys(&smc_net_ops);

	return rc;
}
2123
/* smc_exit() - module exit: unregister SMC subsystems
 *
 * First stop TCP from negotiating SMC and remove the socket family so no
 * new sockets appear, then tear down the remaining layers.
 */
static void __exit smc_exit(void)
{
	static_branch_disable(&tcp_have_smc);
	sock_unregister(PF_SMC);
	smc_core_exit();
	smc_ib_unregister_client();
	proto_unregister(&smc_proto6);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
	unregister_pernet_subsys(&smc_net_ops);
	/* wait for outstanding RCU callbacks before the module text goes */
	rcu_barrier();
}
2136
/* module entry/exit points and metadata */
module_init(smc_init);
module_exit(smc_exit);

MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);