1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
4  *
 5  *  AF_SMC protocol family socket handler, keeping the AF_INET sock address type;
 6  *  applies to SOCK_STREAM sockets only and
 7  *  offers an alternative communication option for TCP-protocol sockets,
 8  *  applicable with RoCE cards only
9  *
10  *  Initial restrictions:
11  *    - support for alternate links postponed
12  *
13  *  Copyright IBM Corp. 2016, 2018
14  *
15  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
16  *              based on prototype from Frank Blaschka
17  */
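/*
 * Minimal userspace sketch (assumes the standard BSD socket API; the
 * SMCPROTO_* constants are the ones used further down in this file):
 *
 *   IPv4:  fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);
 *   IPv6:  fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC6);
 *
 * bind()/connect()/listen()/accept() then behave as for TCP; if the SMC
 * handshake cannot be completed, the handlers below fall back to the
 * internal TCP socket ("clcsock").
 */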
18
19 #define KMSG_COMPONENT "smc"
20 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
21
22 #include <linux/module.h>
23 #include <linux/socket.h>
24 #include <linux/workqueue.h>
25 #include <linux/in.h>
26 #include <linux/sched/signal.h>
27 #include <linux/if_vlan.h>
28
29 #include <net/sock.h>
30 #include <net/tcp.h>
31 #include <net/smc.h>
32 #include <asm/ioctls.h>
33
34 #include <net/net_namespace.h>
35 #include <net/netns/generic.h>
36 #include "smc_netns.h"
37
38 #include "smc.h"
39 #include "smc_clc.h"
40 #include "smc_llc.h"
41 #include "smc_cdc.h"
42 #include "smc_core.h"
43 #include "smc_ib.h"
44 #include "smc_ism.h"
45 #include "smc_pnet.h"
46 #include "smc_tx.h"
47 #include "smc_rx.h"
48 #include "smc_close.h"
49
50 static DEFINE_MUTEX(smc_server_lgr_pending);    /* serialize link group
51                                                  * creation on server
52                                                  */
53 static DEFINE_MUTEX(smc_client_lgr_pending);    /* serialize link group
54                                                  * creation on client
55                                                  */
56
57 static void smc_tcp_listen_work(struct work_struct *);
58 static void smc_connect_work(struct work_struct *);
59
60 static void smc_set_keepalive(struct sock *sk, int val)
61 {
62         struct smc_sock *smc = smc_sk(sk);
63
64         smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
65 }
66
67 static struct smc_hashinfo smc_v4_hashinfo = {
68         .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
69 };
70
71 static struct smc_hashinfo smc_v6_hashinfo = {
72         .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
73 };
74
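/* hash an smc sock into the per-protocol hash table; writers are
 * serialized by the table's rwlock
 */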
75 int smc_hash_sk(struct sock *sk)
76 {
77         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
78         struct hlist_head *head;
79
80         head = &h->ht;
81
82         write_lock_bh(&h->lock);
83         sk_add_node(sk, head);
84         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
85         write_unlock_bh(&h->lock);
86
87         return 0;
88 }
89 EXPORT_SYMBOL_GPL(smc_hash_sk);
90
91 void smc_unhash_sk(struct sock *sk)
92 {
93         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
94
95         write_lock_bh(&h->lock);
96         if (sk_del_node_init(sk))
97                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
98         write_unlock_bh(&h->lock);
99 }
100 EXPORT_SYMBOL_GPL(smc_unhash_sk);
101
102 struct proto smc_proto = {
103         .name           = "SMC",
104         .owner          = THIS_MODULE,
105         .keepalive      = smc_set_keepalive,
106         .hash           = smc_hash_sk,
107         .unhash         = smc_unhash_sk,
108         .obj_size       = sizeof(struct smc_sock),
109         .h.smc_hash     = &smc_v4_hashinfo,
110         .slab_flags     = SLAB_TYPESAFE_BY_RCU,
111 };
112 EXPORT_SYMBOL_GPL(smc_proto);
113
114 struct proto smc_proto6 = {
115         .name           = "SMC6",
116         .owner          = THIS_MODULE,
117         .keepalive      = smc_set_keepalive,
118         .hash           = smc_hash_sk,
119         .unhash         = smc_unhash_sk,
120         .obj_size       = sizeof(struct smc_sock),
121         .h.smc_hash     = &smc_v6_hashinfo,
122         .slab_flags     = SLAB_TYPESAFE_BY_RCU,
123 };
124 EXPORT_SYMBOL_GPL(smc_proto6);
125
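/* release an smc socket: abort a dangling non-blocking connect, actively
 * close the SMC connection (or shut down the fallback clcsock), unhash and
 * orphan the sock and drop the final reference
 */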
126 static int smc_release(struct socket *sock)
127 {
128         struct sock *sk = sock->sk;
129         struct smc_sock *smc;
130         int rc = 0;
131
132         if (!sk)
133                 goto out;
134
135         smc = smc_sk(sk);
136
137         /* cleanup for a dangling non-blocking connect */
138         if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
139                 tcp_abort(smc->clcsock->sk, ECONNABORTED);
140         flush_work(&smc->connect_work);
141
142         if (sk->sk_state == SMC_LISTEN)
143                 /* smc_close_non_accepted() is called and acquires
144                  * sock lock for child sockets again
145                  */
146                 lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
147         else
148                 lock_sock(sk);
149
150         if (!smc->use_fallback) {
151                 rc = smc_close_active(smc);
152                 sock_set_flag(sk, SOCK_DEAD);
153                 sk->sk_shutdown |= SHUTDOWN_MASK;
154         } else {
155                 if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
156                         sock_put(sk); /* passive closing */
157                 if (sk->sk_state == SMC_LISTEN) {
158                         /* wake up clcsock accept */
159                         rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
160                 }
161                 sk->sk_state = SMC_CLOSED;
162                 sk->sk_state_change(sk);
163         }
164
165         sk->sk_prot->unhash(sk);
166
167         if (sk->sk_state == SMC_CLOSED) {
168                 if (smc->clcsock) {
169                         release_sock(sk);
170                         smc_clcsock_release(smc);
171                         lock_sock(sk);
172                 }
173                 if (!smc->use_fallback)
174                         smc_conn_free(&smc->conn);
175         }
176
177         /* detach socket */
178         sock_orphan(sk);
179         sock->sk = NULL;
180         release_sock(sk);
181
182         sock_put(sk); /* final sock_put */
183 out:
184         return rc;
185 }
186
187 static void smc_destruct(struct sock *sk)
188 {
189         if (sk->sk_state != SMC_CLOSED)
190                 return;
191         if (!sock_flag(sk, SOCK_DEAD))
192                 return;
193
194         sk_refcnt_debug_dec(sk);
195 }
196
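/* allocate and initialize a new smc sock in state SMC_INIT, including its
 * work items, accept queue and locks, and hash it into the protocol table
 */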
197 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
198                                    int protocol)
199 {
200         struct smc_sock *smc;
201         struct proto *prot;
202         struct sock *sk;
203
204         prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
205         sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
206         if (!sk)
207                 return NULL;
208
209         sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
210         sk->sk_state = SMC_INIT;
211         sk->sk_destruct = smc_destruct;
212         sk->sk_protocol = protocol;
213         smc = smc_sk(sk);
214         INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
215         INIT_WORK(&smc->connect_work, smc_connect_work);
216         INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
217         INIT_LIST_HEAD(&smc->accept_q);
218         spin_lock_init(&smc->accept_q_lock);
219         spin_lock_init(&smc->conn.send_lock);
220         sk->sk_prot->hash(sk);
221         sk_refcnt_debug_inc(sk);
222         mutex_init(&smc->clcsock_release_lock);
223
224         return sk;
225 }
226
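/* bind: replicate the inet_bind() address checks, then bind the internal
 * clc (TCP) socket; only allowed while still in state SMC_INIT
 */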
227 static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
228                     int addr_len)
229 {
230         struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
231         struct sock *sk = sock->sk;
232         struct smc_sock *smc;
233         int rc;
234
235         smc = smc_sk(sk);
236
237         /* replicate tests from inet_bind(), to be safe wrt. future changes */
238         rc = -EINVAL;
239         if (addr_len < sizeof(struct sockaddr_in))
240                 goto out;
241
242         rc = -EAFNOSUPPORT;
243         if (addr->sin_family != AF_INET &&
244             addr->sin_family != AF_INET6 &&
245             addr->sin_family != AF_UNSPEC)
246                 goto out;
247         /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
248         if (addr->sin_family == AF_UNSPEC &&
249             addr->sin_addr.s_addr != htonl(INADDR_ANY))
250                 goto out;
251
252         lock_sock(sk);
253
254         /* Check if socket is already active */
255         rc = -EINVAL;
256         if (sk->sk_state != SMC_INIT)
257                 goto out_rel;
258
259         smc->clcsock->sk->sk_reuse = sk->sk_reuse;
260         rc = kernel_bind(smc->clcsock, uaddr, addr_len);
261
262 out_rel:
263         release_sock(sk);
264 out:
265         return rc;
266 }
267
268 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
269                                    unsigned long mask)
270 {
271         /* options we do not get control over via setsockopt */
272         nsk->sk_type = osk->sk_type;
273         nsk->sk_sndbuf = osk->sk_sndbuf;
274         nsk->sk_rcvbuf = osk->sk_rcvbuf;
275         nsk->sk_sndtimeo = osk->sk_sndtimeo;
276         nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
277         nsk->sk_mark = osk->sk_mark;
278         nsk->sk_priority = osk->sk_priority;
279         nsk->sk_rcvlowat = osk->sk_rcvlowat;
280         nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
281         nsk->sk_err = osk->sk_err;
282
283         nsk->sk_flags &= ~mask;
284         nsk->sk_flags |= osk->sk_flags & mask;
285 }
286
287 #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
288                              (1UL << SOCK_KEEPOPEN) | \
289                              (1UL << SOCK_LINGER) | \
290                              (1UL << SOCK_BROADCAST) | \
291                              (1UL << SOCK_TIMESTAMP) | \
292                              (1UL << SOCK_DBG) | \
293                              (1UL << SOCK_RCVTSTAMP) | \
294                              (1UL << SOCK_RCVTSTAMPNS) | \
295                              (1UL << SOCK_LOCALROUTE) | \
296                              (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
297                              (1UL << SOCK_RXQ_OVFL) | \
298                              (1UL << SOCK_WIFI_STATUS) | \
299                              (1UL << SOCK_NOFCS) | \
300                              (1UL << SOCK_FILTER_LOCKED) | \
301                              (1UL << SOCK_TSTAMP_NEW))
302 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
303  * clc socket (since smc is not called for these options from net/core)
304  */
305 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
306 {
307         smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
308 }
309
310 #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
311                              (1UL << SOCK_KEEPOPEN) | \
312                              (1UL << SOCK_LINGER) | \
313                              (1UL << SOCK_DBG))
314 /* copy only settings and flags relevant for smc from clc to smc socket */
315 static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
316 {
317         smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
318 }
319
320 /* register a new rmb, send confirm_rkey msg to register with peer */
321 static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
322                        bool conf_rkey)
323 {
324         if (!rmb_desc->wr_reg) {
325                 /* register memory region for new rmb */
326                 if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
327                         rmb_desc->regerr = 1;
328                         return -EFAULT;
329                 }
330                 rmb_desc->wr_reg = 1;
331         }
332         if (!conf_rkey)
333                 return 0;
334         /* exchange confirm_rkey msg with peer */
335         if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
336                 rmb_desc->regerr = 1;
337                 return -EFAULT;
338         }
339         return 0;
340 }
341
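/* client side of the CONFIRM LINK / ADD LINK exchange that brings up the
 * first (and currently only) link of a new link group over the RoCE fabric
 */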
342 static int smc_clnt_conf_first_link(struct smc_sock *smc)
343 {
344         struct net *net = sock_net(smc->clcsock->sk);
345         struct smc_link_group *lgr = smc->conn.lgr;
346         struct smc_link *link;
347         int rest;
348         int rc;
349
350         link = &lgr->lnk[SMC_SINGLE_LINK];
351         /* receive CONFIRM LINK request from server over RoCE fabric */
352         rest = wait_for_completion_interruptible_timeout(
353                 &link->llc_confirm,
354                 SMC_LLC_WAIT_FIRST_TIME);
355         if (rest <= 0) {
356                 struct smc_clc_msg_decline dclc;
357
358                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
359                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
360                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
361         }
362
363         if (link->llc_confirm_rc)
364                 return SMC_CLC_DECL_RMBE_EC;
365
366         rc = smc_ib_modify_qp_rts(link);
367         if (rc)
368                 return SMC_CLC_DECL_ERR_RDYLNK;
369
370         smc_wr_remember_qp_attr(link);
371
372         if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
373                 return SMC_CLC_DECL_ERR_REGRMB;
374
375         /* send CONFIRM LINK response over RoCE fabric */
376         rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
377         if (rc < 0)
378                 return SMC_CLC_DECL_TIMEOUT_CL;
379
380         /* receive ADD LINK request from server over RoCE fabric */
381         rest = wait_for_completion_interruptible_timeout(&link->llc_add,
382                                                          SMC_LLC_WAIT_TIME);
383         if (rest <= 0) {
384                 struct smc_clc_msg_decline dclc;
385
386                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
387                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
388                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
389         }
390
391         /* send add link reject message, only one link supported for now */
392         rc = smc_llc_send_add_link(link,
393                                    link->smcibdev->mac[link->ibport - 1],
394                                    link->gid, SMC_LLC_RESP);
395         if (rc < 0)
396                 return SMC_CLC_DECL_TIMEOUT_AL;
397
398         smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
399
400         return 0;
401 }
402
403 static void smcr_conn_save_peer_info(struct smc_sock *smc,
404                                      struct smc_clc_msg_accept_confirm *clc)
405 {
406         int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
407
408         smc->conn.peer_rmbe_idx = clc->rmbe_idx;
409         smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
410         smc->conn.peer_rmbe_size = bufsize;
411         atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
412         smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
413 }
414
415 static void smcd_conn_save_peer_info(struct smc_sock *smc,
416                                      struct smc_clc_msg_accept_confirm *clc)
417 {
418         int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
419
420         smc->conn.peer_rmbe_idx = clc->dmbe_idx;
421         smc->conn.peer_token = clc->token;
422         /* msg header takes up space in the buffer */
423         smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
424         atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
425         smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
426 }
427
428 static void smc_conn_save_peer_info(struct smc_sock *smc,
429                                     struct smc_clc_msg_accept_confirm *clc)
430 {
431         if (smc->conn.lgr->is_smcd)
432                 smcd_conn_save_peer_info(smc, clc);
433         else
434                 smcr_conn_save_peer_info(smc, clc);
435 }
436
437 static void smc_link_save_peer_info(struct smc_link *link,
438                                     struct smc_clc_msg_accept_confirm *clc)
439 {
440         link->peer_qpn = ntoh24(clc->qpn);
441         memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
442         memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
443         link->peer_psn = ntoh24(clc->psn);
444         link->peer_mtu = clc->qp_mtu;
445 }
446
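/* switch to using the internal TCP clcsock directly ("fallback"); redirect
 * the socket file's private_data so that subsequent syscalls hit the clcsock
 */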
447 static void smc_switch_to_fallback(struct smc_sock *smc)
448 {
449         smc->use_fallback = true;
450         if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
451                 smc->clcsock->file = smc->sk.sk_socket->file;
452                 smc->clcsock->file->private_data = smc->clcsock;
453         }
454 }
455
456 /* fall back during connect */
457 static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
458 {
459         smc_switch_to_fallback(smc);
460         smc->fallback_rsn = reason_code;
461         smc_copy_sock_settings_to_clc(smc);
462         smc->connect_nonblock = 0;
463         if (smc->sk.sk_state == SMC_INIT)
464                 smc->sk.sk_state = SMC_ACTIVE;
465         return 0;
466 }
467
468 /* decline and fall back during connect */
469 static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
470 {
471         int rc;
472
473         if (reason_code < 0) { /* error, fallback is not possible */
474                 if (smc->sk.sk_state == SMC_INIT)
475                         sock_put(&smc->sk); /* passive closing */
476                 return reason_code;
477         }
478         if (reason_code != SMC_CLC_DECL_PEERDECL) {
479                 rc = smc_clc_send_decline(smc, reason_code);
480                 if (rc < 0) {
481                         if (smc->sk.sk_state == SMC_INIT)
482                                 sock_put(&smc->sk); /* passive closing */
483                         return rc;
484                 }
485         }
486         return smc_connect_fallback(smc, reason_code);
487 }
488
489 /* abort connecting */
490 static int smc_connect_abort(struct smc_sock *smc, int reason_code,
491                              int local_contact)
492 {
493         if (local_contact == SMC_FIRST_CONTACT)
494                 smc_lgr_forget(smc->conn.lgr);
495         if (smc->conn.lgr->is_smcd)
496                 /* there is only one lgr role for SMC-D; use server lock */
497                 mutex_unlock(&smc_server_lgr_pending);
498         else
499                 mutex_unlock(&smc_client_lgr_pending);
500
501         smc_conn_free(&smc->conn);
502         smc->connect_nonblock = 0;
503         return reason_code;
504 }
505
506 /* check if there is a rdma device available for this connection. */
507 /* called for connect and listen */
508 static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
509 {
510         /* PNET table look up: search active ib_device and port
511          * within same PNETID that also contains the ethernet device
512          * used for the internal TCP socket
513          */
514         smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
515         if (!ini->ib_dev)
516                 return SMC_CLC_DECL_NOSMCRDEV;
517         return 0;
518 }
519
520 /* check if there is an ISM device available for this connection. */
521 /* called for connect and listen */
522 static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
523 {
524         /* Find ISM device with same PNETID as connecting interface  */
525         smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
526         if (!ini->ism_dev)
527                 return SMC_CLC_DECL_NOSMCDDEV;
528         return 0;
529 }
530
531 /* Check for VLAN ID and register it on ISM device just for CLC handshake */
532 static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
533                                       struct smc_init_info *ini)
534 {
535         if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev, ini->vlan_id))
536                 return SMC_CLC_DECL_ISMVLANERR;
537         return 0;
538 }
539
540 /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
541  * used, the VLAN ID will be registered again during the connection setup.
542  */
543 static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
544                                         struct smc_init_info *ini)
545 {
546         if (!is_smcd)
547                 return 0;
548         if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev, ini->vlan_id))
549                 return SMC_CLC_DECL_CNFERR;
550         return 0;
551 }
552
553 /* CLC handshake during connect */
554 static int smc_connect_clc(struct smc_sock *smc, int smc_type,
555                            struct smc_clc_msg_accept_confirm *aclc,
556                            struct smc_init_info *ini)
557 {
558         int rc = 0;
559
560         /* do inband token exchange */
561         rc = smc_clc_send_proposal(smc, smc_type, ini);
562         if (rc)
563                 return rc;
564         /* receive SMC Accept CLC message */
565         return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT,
566                                 CLC_WAIT_TIME);
567 }
568
569 /* setup for RDMA connection of client */
570 static int smc_connect_rdma(struct smc_sock *smc,
571                             struct smc_clc_msg_accept_confirm *aclc,
572                             struct smc_init_info *ini)
573 {
574         struct smc_link *link;
575         int reason_code = 0;
576
577         ini->is_smcd = false;
578         ini->ib_lcl = &aclc->lcl;
579         ini->ib_clcqpn = ntoh24(aclc->qpn);
580         ini->srv_first_contact = aclc->hdr.flag;
581
582         mutex_lock(&smc_client_lgr_pending);
583         reason_code = smc_conn_create(smc, ini);
584         if (reason_code) {
585                 mutex_unlock(&smc_client_lgr_pending);
586                 return reason_code;
587         }
588         link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
589
590         smc_conn_save_peer_info(smc, aclc);
591
592         /* create send buffer and rmb */
593         if (smc_buf_create(smc, false))
594                 return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
595                                          ini->cln_first_contact);
596
597         if (ini->cln_first_contact == SMC_FIRST_CONTACT)
598                 smc_link_save_peer_info(link, aclc);
599
600         if (smc_rmb_rtoken_handling(&smc->conn, aclc))
601                 return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
602                                          ini->cln_first_contact);
603
604         smc_close_init(smc);
605         smc_rx_init(smc);
606
607         if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
608                 if (smc_ib_ready_link(link))
609                         return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
610                                                  ini->cln_first_contact);
611         } else {
612                 if (smc_reg_rmb(link, smc->conn.rmb_desc, true))
613                         return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
614                                                  ini->cln_first_contact);
615         }
616         smc_rmb_sync_sg_for_device(&smc->conn);
617
618         reason_code = smc_clc_send_confirm(smc);
619         if (reason_code)
620                 return smc_connect_abort(smc, reason_code,
621                                          ini->cln_first_contact);
622
623         smc_tx_init(smc);
624
625         if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
626                 /* QP confirmation over RoCE fabric */
627                 reason_code = smc_clnt_conf_first_link(smc);
628                 if (reason_code)
629                         return smc_connect_abort(smc, reason_code,
630                                                  ini->cln_first_contact);
631         }
632         mutex_unlock(&smc_client_lgr_pending);
633
634         smc_copy_sock_settings_to_clc(smc);
635         smc->connect_nonblock = 0;
636         if (smc->sk.sk_state == SMC_INIT)
637                 smc->sk.sk_state = SMC_ACTIVE;
638
639         return 0;
640 }
641
642 /* setup for ISM connection of client */
643 static int smc_connect_ism(struct smc_sock *smc,
644                            struct smc_clc_msg_accept_confirm *aclc,
645                            struct smc_init_info *ini)
646 {
647         int rc = 0;
648
649         ini->is_smcd = true;
650         ini->ism_gid = aclc->gid;
651         ini->srv_first_contact = aclc->hdr.flag;
652
653         /* there is only one lgr role for SMC-D; use server lock */
654         mutex_lock(&smc_server_lgr_pending);
655         rc = smc_conn_create(smc, ini);
656         if (rc) {
657                 mutex_unlock(&smc_server_lgr_pending);
658                 return rc;
659         }
660
661         /* Create send and receive buffers */
662         if (smc_buf_create(smc, true))
663                 return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
664                                          ini->cln_first_contact);
665
666         smc_conn_save_peer_info(smc, aclc);
667         smc_close_init(smc);
668         smc_rx_init(smc);
669         smc_tx_init(smc);
670
671         rc = smc_clc_send_confirm(smc);
672         if (rc)
673                 return smc_connect_abort(smc, rc, ini->cln_first_contact);
674         mutex_unlock(&smc_server_lgr_pending);
675
676         smc_copy_sock_settings_to_clc(smc);
677         smc->connect_nonblock = 0;
678         if (smc->sk.sk_state == SMC_INIT)
679                 smc->sk.sk_state = SMC_ACTIVE;
680
681         return 0;
682 }
683
684 /* perform steps before actually connecting */
685 static int __smc_connect(struct smc_sock *smc)
686 {
687         bool ism_supported = false, rdma_supported = false;
688         struct smc_clc_msg_accept_confirm aclc;
689         struct smc_init_info ini = {0};
690         int smc_type;
691         int rc = 0;
692
693         sock_hold(&smc->sk); /* sock put in passive closing */
694
695         if (smc->use_fallback)
696                 return smc_connect_fallback(smc, smc->fallback_rsn);
697
698         /* if peer has not signalled SMC-capability, fall back */
699         if (!tcp_sk(smc->clcsock->sk)->syn_smc)
700                 return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
701
702         /* IPSec connections opt out of SMC-R optimizations */
703         if (using_ipsec(smc))
704                 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
705
706         /* get vlan id from IP device */
707         if (smc_vlan_by_tcpsk(smc->clcsock, &ini))
708                 return smc_connect_decline_fallback(smc,
709                                                     SMC_CLC_DECL_GETVLANERR);
710
711         /* check if there is an ism device available */
712         if (!smc_find_ism_device(smc, &ini) &&
713             !smc_connect_ism_vlan_setup(smc, &ini)) {
714                 /* ISM is supported for this connection */
715                 ism_supported = true;
716                 smc_type = SMC_TYPE_D;
717         }
718
719         /* check if there is a rdma device available */
720         if (!smc_find_rdma_device(smc, &ini)) {
721                 /* RDMA is supported for this connection */
722                 rdma_supported = true;
723                 if (ism_supported)
724                         smc_type = SMC_TYPE_B; /* both */
725                 else
726                         smc_type = SMC_TYPE_R; /* only RDMA */
727         }
728
729         /* if neither ISM nor RDMA are supported, fallback */
730         if (!rdma_supported && !ism_supported)
731                 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
732
733         /* perform CLC handshake */
734         rc = smc_connect_clc(smc, smc_type, &aclc, &ini);
735         if (rc) {
736                 smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
737                 return smc_connect_decline_fallback(smc, rc);
738         }
739
740         /* depending on previous steps, connect using rdma or ism */
741         if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
742                 rc = smc_connect_rdma(smc, &aclc, &ini);
743         else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
744                 rc = smc_connect_ism(smc, &aclc, &ini);
745         else
746                 rc = SMC_CLC_DECL_MODEUNSUPP;
747         if (rc) {
748                 smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
749                 return smc_connect_decline_fallback(smc, rc);
750         }
751
752         smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
753         return 0;
754 }
755
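/* worker for a non-blocking connect: wait for the TCP handshake on the
 * clcsock to finish, then run the SMC handshake via __smc_connect()
 */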
756 static void smc_connect_work(struct work_struct *work)
757 {
758         struct smc_sock *smc = container_of(work, struct smc_sock,
759                                             connect_work);
760         long timeo = smc->sk.sk_sndtimeo;
761         int rc = 0;
762
763         if (!timeo)
764                 timeo = MAX_SCHEDULE_TIMEOUT;
765         lock_sock(smc->clcsock->sk);
766         if (smc->clcsock->sk->sk_err) {
767                 smc->sk.sk_err = smc->clcsock->sk->sk_err;
768         } else if ((1 << smc->clcsock->sk->sk_state) &
 769                                         (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
770                 rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
771                 if ((rc == -EPIPE) &&
772                     ((1 << smc->clcsock->sk->sk_state) &
773                                         (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
774                         rc = 0;
775         }
776         release_sock(smc->clcsock->sk);
777         lock_sock(&smc->sk);
778         if (rc != 0 || smc->sk.sk_err) {
779                 smc->sk.sk_state = SMC_CLOSED;
780                 if (rc == -EPIPE || rc == -EAGAIN)
781                         smc->sk.sk_err = EPIPE;
782                 else if (signal_pending(current))
783                         smc->sk.sk_err = -sock_intr_errno(timeo);
784                 goto out;
785         }
786
787         rc = __smc_connect(smc);
788         if (rc < 0)
789                 smc->sk.sk_err = -rc;
790
791 out:
792         if (!sock_flag(&smc->sk, SOCK_DEAD)) {
793                 if (smc->sk.sk_err) {
794                         smc->sk.sk_state_change(&smc->sk);
795                 } else { /* allow polling before and after fallback decision */
796                         smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
797                         smc->sk.sk_write_space(&smc->sk);
798                 }
799         }
800         release_sock(&smc->sk);
801 }
802
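/* connect() entry point: validate the address, mark the clcsock as
 * SMC-capable (syn_smc), connect it, and perform the SMC handshake either
 * inline or, for O_NONBLOCK, via the connect worker
 */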
803 static int smc_connect(struct socket *sock, struct sockaddr *addr,
804                        int alen, int flags)
805 {
806         struct sock *sk = sock->sk;
807         struct smc_sock *smc;
808         int rc = -EINVAL;
809
810         smc = smc_sk(sk);
811
812         /* separate smc parameter checking to be safe */
813         if (alen < sizeof(addr->sa_family))
814                 goto out_err;
815         if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
816                 goto out_err;
817
818         lock_sock(sk);
819         switch (sk->sk_state) {
820         default:
821                 goto out;
822         case SMC_ACTIVE:
823                 rc = -EISCONN;
824                 goto out;
825         case SMC_INIT:
826                 rc = 0;
827                 break;
828         }
829
830         smc_copy_sock_settings_to_clc(smc);
831         tcp_sk(smc->clcsock->sk)->syn_smc = 1;
832         if (smc->connect_nonblock) {
833                 rc = -EALREADY;
834                 goto out;
835         }
836         rc = kernel_connect(smc->clcsock, addr, alen, flags);
837         if (rc && rc != -EINPROGRESS)
838                 goto out;
839         if (flags & O_NONBLOCK) {
840                 if (schedule_work(&smc->connect_work))
841                         smc->connect_nonblock = 1;
842                 rc = -EINPROGRESS;
843         } else {
844                 rc = __smc_connect(smc);
845                 if (rc < 0)
846                         goto out;
847                 else
848                         rc = 0; /* success cases including fallback */
849         }
850
851 out:
852         release_sock(sk);
853 out_err:
854         return rc;
855 }
856
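/* accept a new connection on the listen sock's internal clcsock and
 * allocate a matching smc sock for it
 */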
857 static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
858 {
859         struct socket *new_clcsock = NULL;
860         struct sock *lsk = &lsmc->sk;
861         struct sock *new_sk;
862         int rc = -EINVAL;
863
864         release_sock(lsk);
865         new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
866         if (!new_sk) {
867                 rc = -ENOMEM;
868                 lsk->sk_err = ENOMEM;
869                 *new_smc = NULL;
870                 lock_sock(lsk);
871                 goto out;
872         }
873         *new_smc = smc_sk(new_sk);
874
875         mutex_lock(&lsmc->clcsock_release_lock);
876         if (lsmc->clcsock)
877                 rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
878         mutex_unlock(&lsmc->clcsock_release_lock);
879         lock_sock(lsk);
880         if  (rc < 0)
881                 lsk->sk_err = -rc;
882         if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
883                 new_sk->sk_prot->unhash(new_sk);
884                 if (new_clcsock)
885                         sock_release(new_clcsock);
886                 new_sk->sk_state = SMC_CLOSED;
887                 sock_set_flag(new_sk, SOCK_DEAD);
888                 sock_put(new_sk); /* final */
889                 *new_smc = NULL;
890                 goto out;
891         }
892
893         (*new_smc)->clcsock = new_clcsock;
894 out:
895         return rc;
896 }
897
898 /* add a just created sock to the accept queue of the listen sock as
899  * candidate for a following socket accept call from user space
900  */
901 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
902 {
903         struct smc_sock *par = smc_sk(parent);
904
905         sock_hold(sk); /* sock_put in smc_accept_unlink() */
906         spin_lock(&par->accept_q_lock);
907         list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
908         spin_unlock(&par->accept_q_lock);
909         sk_acceptq_added(parent);
910 }
911
912 /* remove a socket from the accept queue of its parental listening socket */
913 static void smc_accept_unlink(struct sock *sk)
914 {
915         struct smc_sock *par = smc_sk(sk)->listen_smc;
916
917         spin_lock(&par->accept_q_lock);
918         list_del_init(&smc_sk(sk)->accept_q);
919         spin_unlock(&par->accept_q_lock);
920         sk_acceptq_removed(&par->sk);
921         sock_put(sk); /* sock_hold in smc_accept_enqueue */
922 }
923
924 /* remove a sock from the accept queue to bind it to a new socket created
925  * for a socket accept call from user space
926  */
927 struct sock *smc_accept_dequeue(struct sock *parent,
928                                 struct socket *new_sock)
929 {
930         struct smc_sock *isk, *n;
931         struct sock *new_sk;
932
933         list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
934                 new_sk = (struct sock *)isk;
935
936                 smc_accept_unlink(new_sk);
937                 if (new_sk->sk_state == SMC_CLOSED) {
938                         new_sk->sk_prot->unhash(new_sk);
939                         if (isk->clcsock) {
940                                 sock_release(isk->clcsock);
941                                 isk->clcsock = NULL;
942                         }
943                         sock_put(new_sk); /* final */
944                         continue;
945                 }
946                 if (new_sock) {
947                         sock_graft(new_sk, new_sock);
948                         if (isk->use_fallback) {
949                                 smc_sk(new_sk)->clcsock->file = new_sock->file;
950                                 isk->clcsock->file->private_data = isk->clcsock;
951                         }
952                 }
953                 return new_sk;
954         }
955         return NULL;
956 }
957
958 /* clean up for a created but never accepted sock */
959 void smc_close_non_accepted(struct sock *sk)
960 {
961         struct smc_sock *smc = smc_sk(sk);
962
963         lock_sock(sk);
964         if (!sk->sk_lingertime)
965                 /* wait for peer closing */
966                 sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
967         if (!smc->use_fallback) {
968                 smc_close_active(smc);
969                 sock_set_flag(sk, SOCK_DEAD);
970                 sk->sk_shutdown |= SHUTDOWN_MASK;
971         }
972         sk->sk_prot->unhash(sk);
973         if (smc->clcsock) {
974                 struct socket *tcp;
975
976                 tcp = smc->clcsock;
977                 smc->clcsock = NULL;
978                 sock_release(tcp);
979         }
980         if (smc->use_fallback) {
981                 sock_put(sk); /* passive closing */
982                 sk->sk_state = SMC_CLOSED;
983         } else {
984                 if (sk->sk_state == SMC_CLOSED)
985                         smc_conn_free(&smc->conn);
986         }
987         release_sock(sk);
988         sock_put(sk); /* final sock_put */
989 }
990
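/* server side of the CONFIRM LINK / ADD LINK exchange for the first link
 * of a new link group
 */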
991 static int smc_serv_conf_first_link(struct smc_sock *smc)
992 {
993         struct net *net = sock_net(smc->clcsock->sk);
994         struct smc_link_group *lgr = smc->conn.lgr;
995         struct smc_link *link;
996         int rest;
997         int rc;
998
999         link = &lgr->lnk[SMC_SINGLE_LINK];
1000
1001         if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
1002                 return SMC_CLC_DECL_ERR_REGRMB;
1003
1004         /* send CONFIRM LINK request to client over the RoCE fabric */
1005         rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
1006         if (rc < 0)
1007                 return SMC_CLC_DECL_TIMEOUT_CL;
1008
1009         /* receive CONFIRM LINK response from client over the RoCE fabric */
1010         rest = wait_for_completion_interruptible_timeout(
1011                 &link->llc_confirm_resp,
1012                 SMC_LLC_WAIT_FIRST_TIME);
1013         if (rest <= 0) {
1014                 struct smc_clc_msg_decline dclc;
1015
1016                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
1017                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1018                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
1019         }
1020
1021         if (link->llc_confirm_resp_rc)
1022                 return SMC_CLC_DECL_RMBE_EC;
1023
1024         /* send ADD LINK request to client over the RoCE fabric */
1025         rc = smc_llc_send_add_link(link,
1026                                    link->smcibdev->mac[link->ibport - 1],
1027                                    link->gid, SMC_LLC_REQ);
1028         if (rc < 0)
1029                 return SMC_CLC_DECL_TIMEOUT_AL;
1030
1031         /* receive ADD LINK response from client over the RoCE fabric */
1032         rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
1033                                                          SMC_LLC_WAIT_TIME);
1034         if (rest <= 0) {
1035                 struct smc_clc_msg_decline dclc;
1036
1037                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
1038                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1039                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
1040         }
1041
1042         smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
1043
1044         return 0;
1045 }
1046
1047 /* listen worker: finish */
1048 static void smc_listen_out(struct smc_sock *new_smc)
1049 {
1050         struct smc_sock *lsmc = new_smc->listen_smc;
1051         struct sock *newsmcsk = &new_smc->sk;
1052
1053         if (lsmc->sk.sk_state == SMC_LISTEN) {
1054                 lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
1055                 smc_accept_enqueue(&lsmc->sk, newsmcsk);
1056                 release_sock(&lsmc->sk);
1057         } else { /* no longer listening */
1058                 smc_close_non_accepted(newsmcsk);
1059         }
1060
1061         /* Wake up accept */
1062         lsmc->sk.sk_data_ready(&lsmc->sk);
1063         sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
1064 }
1065
1066 /* listen worker: finish in state connected */
1067 static void smc_listen_out_connected(struct smc_sock *new_smc)
1068 {
1069         struct sock *newsmcsk = &new_smc->sk;
1070
1071         sk_refcnt_debug_inc(newsmcsk);
1072         if (newsmcsk->sk_state == SMC_INIT)
1073                 newsmcsk->sk_state = SMC_ACTIVE;
1074
1075         smc_listen_out(new_smc);
1076 }
1077
1078 /* listen worker: finish in error state */
1079 static void smc_listen_out_err(struct smc_sock *new_smc)
1080 {
1081         struct sock *newsmcsk = &new_smc->sk;
1082
1083         if (newsmcsk->sk_state == SMC_INIT)
1084                 sock_put(&new_smc->sk); /* passive closing */
1085         newsmcsk->sk_state = SMC_CLOSED;
1086         smc_conn_free(&new_smc->conn);
1087
1088         smc_listen_out(new_smc);
1089 }
1090
1091 /* listen worker: decline and fall back if possible */
1092 static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
1093                                int local_contact)
1094 {
1095         /* RDMA setup failed, switch back to TCP */
1096         if (local_contact == SMC_FIRST_CONTACT)
1097                 smc_lgr_forget(new_smc->conn.lgr);
1098         if (reason_code < 0) { /* error, no fallback possible */
1099                 smc_listen_out_err(new_smc);
1100                 return;
1101         }
1102         smc_conn_free(&new_smc->conn);
1103         smc_switch_to_fallback(new_smc);
1104         new_smc->fallback_rsn = reason_code;
1105         if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
1106                 if (smc_clc_send_decline(new_smc, reason_code) < 0) {
1107                         smc_listen_out_err(new_smc);
1108                         return;
1109                 }
1110         }
1111         smc_listen_out_connected(new_smc);
1112 }
1113
1114 /* listen worker: check prefixes */
1115 static int smc_listen_prfx_check(struct smc_sock *new_smc,
1116                                  struct smc_clc_msg_proposal *pclc)
1117 {
1118         struct smc_clc_msg_proposal_prefix *pclc_prfx;
1119         struct socket *newclcsock = new_smc->clcsock;
1120
1121         pclc_prfx = smc_clc_proposal_get_prefix(pclc);
1122         if (smc_clc_prfx_match(newclcsock, pclc_prfx))
1123                 return SMC_CLC_DECL_DIFFPREFIX;
1124
1125         return 0;
1126 }
1127
1128 /* listen worker: initialize connection and buffers */
1129 static int smc_listen_rdma_init(struct smc_sock *new_smc,
1130                                 struct smc_init_info *ini)
1131 {
1132         int rc;
1133
1134         /* allocate connection / link group */
1135         rc = smc_conn_create(new_smc, ini);
1136         if (rc)
1137                 return rc;
1138
1139         /* create send buffer and rmb */
1140         if (smc_buf_create(new_smc, false))
1141                 return SMC_CLC_DECL_MEM;
1142
1143         return 0;
1144 }
1145
1146 /* listen worker: initialize connection and buffers for SMC-D */
1147 static int smc_listen_ism_init(struct smc_sock *new_smc,
1148                                struct smc_clc_msg_proposal *pclc,
1149                                struct smc_init_info *ini)
1150 {
1151         struct smc_clc_msg_smcd *pclc_smcd;
1152         int rc;
1153
1154         pclc_smcd = smc_get_clc_msg_smcd(pclc);
1155         ini->ism_gid = pclc_smcd->gid;
1156         rc = smc_conn_create(new_smc, ini);
1157         if (rc)
1158                 return rc;
1159
1160         /* Check if peer can be reached via ISM device */
1161         if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
1162                             new_smc->conn.lgr->vlan_id,
1163                             new_smc->conn.lgr->smcd)) {
1164                 if (ini->cln_first_contact == SMC_FIRST_CONTACT)
1165                         smc_lgr_forget(new_smc->conn.lgr);
1166                 smc_conn_free(&new_smc->conn);
1167                 return SMC_CLC_DECL_SMCDNOTALK;
1168         }
1169
1170         /* Create send and receive buffers */
1171         if (smc_buf_create(new_smc, true)) {
1172                 if (ini->cln_first_contact == SMC_FIRST_CONTACT)
1173                         smc_lgr_forget(new_smc->conn.lgr);
1174                 smc_conn_free(&new_smc->conn);
1175                 return SMC_CLC_DECL_MEM;
1176         }
1177
1178         return 0;
1179 }
1180
1181 /* listen worker: register buffers */
1182 static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
1183 {
1184         struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1185
1186         if (local_contact != SMC_FIRST_CONTACT) {
1187                 if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
1188                         return SMC_CLC_DECL_ERR_REGRMB;
1189         }
1190         smc_rmb_sync_sg_for_device(&new_smc->conn);
1191
1192         return 0;
1193 }
1194
1195 /* listen worker: finish RDMA setup */
1196 static int smc_listen_rdma_finish(struct smc_sock *new_smc,
1197                                   struct smc_clc_msg_accept_confirm *cclc,
1198                                   int local_contact)
1199 {
1200         struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1201         int reason_code = 0;
1202
1203         if (local_contact == SMC_FIRST_CONTACT)
1204                 smc_link_save_peer_info(link, cclc);
1205
1206         if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
1207                 reason_code = SMC_CLC_DECL_ERR_RTOK;
1208                 goto decline;
1209         }
1210
1211         if (local_contact == SMC_FIRST_CONTACT) {
1212                 if (smc_ib_ready_link(link)) {
1213                         reason_code = SMC_CLC_DECL_ERR_RDYLNK;
1214                         goto decline;
1215                 }
1216                 /* QP confirmation over RoCE fabric */
1217                 reason_code = smc_serv_conf_first_link(new_smc);
1218                 if (reason_code)
1219                         goto decline;
1220         }
1221         return 0;
1222
1223 decline:
1224         smc_listen_decline(new_smc, reason_code, local_contact);
1225         return reason_code;
1226 }
1227
1228 /* setup for RDMA connection of server */
1229 static void smc_listen_work(struct work_struct *work)
1230 {
1231         struct smc_sock *new_smc = container_of(work, struct smc_sock,
1232                                                 smc_listen_work);
1233         struct socket *newclcsock = new_smc->clcsock;
1234         struct smc_clc_msg_accept_confirm cclc;
1235         struct smc_clc_msg_proposal *pclc;
1236         struct smc_init_info ini = {0};
1237         bool ism_supported = false;
1238         u8 buf[SMC_CLC_MAX_LEN];
1239         int rc = 0;
1240
1241         if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
1242                 return smc_listen_out_err(new_smc);
1243
1244         if (new_smc->use_fallback) {
1245                 smc_listen_out_connected(new_smc);
1246                 return;
1247         }
1248
1249         /* check if peer is smc capable */
1250         if (!tcp_sk(newclcsock->sk)->syn_smc) {
1251                 smc_switch_to_fallback(new_smc);
1252                 new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
1253                 smc_listen_out_connected(new_smc);
1254                 return;
1255         }
1256
1257         /* do inband token exchange -
1258          * wait for and receive SMC Proposal CLC message
1259          */
1260         pclc = (struct smc_clc_msg_proposal *)&buf;
1261         rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
1262                               SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
1263         if (rc)
1264                 goto out_decl;
1265
1266         /* IPSec connections opt out of SMC-R optimizations */
1267         if (using_ipsec(new_smc)) {
1268                 rc = SMC_CLC_DECL_IPSEC;
1269                 goto out_decl;
1270         }
1271
1272         /* check for matching IP prefix and subnet length */
1273         rc = smc_listen_prfx_check(new_smc, pclc);
1274         if (rc)
1275                 goto out_decl;
1276
1277         /* get vlan id from IP device */
1278         if (smc_vlan_by_tcpsk(new_smc->clcsock, &ini)) {
1279                 rc = SMC_CLC_DECL_GETVLANERR;
1280                 goto out_decl;
1281         }
1282
1283         mutex_lock(&smc_server_lgr_pending);
1284         smc_close_init(new_smc);
1285         smc_rx_init(new_smc);
1286         smc_tx_init(new_smc);
1287
1288         /* check if ISM is available */
1289         if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) {
1290                 ini.is_smcd = true; /* prepare ISM check */
1291                 rc = smc_find_ism_device(new_smc, &ini);
1292                 if (!rc)
1293                         rc = smc_listen_ism_init(new_smc, pclc, &ini);
1294                 if (!rc)
1295                         ism_supported = true;
1296                 else if (pclc->hdr.path == SMC_TYPE_D)
1297                         goto out_unlock; /* skip RDMA and decline */
1298         }
1299
1300         /* check if RDMA is available */
1301         if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */
1302                 /* prepare RDMA check */
1303                 memset(&ini, 0, sizeof(ini));
1304                 ini.is_smcd = false;
1305                 ini.ib_lcl = &pclc->lcl;
1306                 rc = smc_find_rdma_device(new_smc, &ini);
1307                 if (rc) {
1308                         /* no RDMA device found */
1309                         if (pclc->hdr.path == SMC_TYPE_B)
1310                                 /* neither ISM nor RDMA device found */
1311                                 rc = SMC_CLC_DECL_NOSMCDEV;
1312                         goto out_unlock;
1313                 }
1314                 rc = smc_listen_rdma_init(new_smc, &ini);
1315                 if (rc)
1316                         goto out_unlock;
1317                 rc = smc_listen_rdma_reg(new_smc, ini.cln_first_contact);
1318                 if (rc)
1319                         goto out_unlock;
1320         }
1321
1322         /* send SMC Accept CLC message */
1323         rc = smc_clc_send_accept(new_smc, ini.cln_first_contact);
1324         if (rc)
1325                 goto out_unlock;
1326
1327         /* SMC-D does not need this lock any more */
1328         if (ism_supported)
1329                 mutex_unlock(&smc_server_lgr_pending);
1330
1331         /* receive SMC Confirm CLC message */
1332         rc = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
1333                               SMC_CLC_CONFIRM, CLC_WAIT_TIME);
1334         if (rc) {
1335                 if (!ism_supported)
1336                         goto out_unlock;
1337                 goto out_decl;
1338         }
1339
1340         /* finish worker */
1341         if (!ism_supported) {
1342                 rc = smc_listen_rdma_finish(new_smc, &cclc,
1343                                             ini.cln_first_contact);
1344                 mutex_unlock(&smc_server_lgr_pending);
1345                 if (rc)
1346                         return;
1347         }
1348         smc_conn_save_peer_info(new_smc, &cclc);
1349         smc_listen_out_connected(new_smc);
1350         return;
1351
1352 out_unlock:
1353         mutex_unlock(&smc_server_lgr_pending);
1354 out_decl:
1355         smc_listen_decline(new_smc, rc, ini.cln_first_contact);
1356 }
1357
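/* worker looping on a listening sock: accept incoming clcsock connections
 * and schedule smc_listen_work() for each new child sock
 */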
1358 static void smc_tcp_listen_work(struct work_struct *work)
1359 {
1360         struct smc_sock *lsmc = container_of(work, struct smc_sock,
1361                                              tcp_listen_work);
1362         struct sock *lsk = &lsmc->sk;
1363         struct smc_sock *new_smc;
1364         int rc = 0;
1365
1366         lock_sock(lsk);
1367         while (lsk->sk_state == SMC_LISTEN) {
1368                 rc = smc_clcsock_accept(lsmc, &new_smc);
1369                 if (rc)
1370                         goto out;
1371                 if (!new_smc)
1372                         continue;
1373
1374                 new_smc->listen_smc = lsmc;
1375                 new_smc->use_fallback = lsmc->use_fallback;
1376                 new_smc->fallback_rsn = lsmc->fallback_rsn;
1377                 sock_hold(lsk); /* sock_put in smc_listen_work */
1378                 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
1379                 smc_copy_sock_settings_to_smc(new_smc);
1380                 new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
1381                 new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
1382                 sock_hold(&new_smc->sk); /* sock_put in passive closing */
1383                 if (!schedule_work(&new_smc->smc_listen_work))
1384                         sock_put(&new_smc->sk);
1385         }
1386
1387 out:
1388         release_sock(lsk);
1389         sock_put(&lsmc->sk); /* sock_hold in smc_listen */
1390 }
1391
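/* listen() entry point: copy socket options to the clcsock, put it into
 * listen state and kick off the TCP listen worker
 */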
1392 static int smc_listen(struct socket *sock, int backlog)
1393 {
1394         struct sock *sk = sock->sk;
1395         struct smc_sock *smc;
1396         int rc;
1397
1398         smc = smc_sk(sk);
1399         lock_sock(sk);
1400
1401         rc = -EINVAL;
1402         if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
1403                 goto out;
1404
1405         rc = 0;
1406         if (sk->sk_state == SMC_LISTEN) {
1407                 sk->sk_max_ack_backlog = backlog;
1408                 goto out;
1409         }
1410         /* some socket options are handled in core, so we cannot apply
1411          * them to the clc socket -- copy the smc socket options to the clc socket
1412          */
1413         smc_copy_sock_settings_to_clc(smc);
1414         if (!smc->use_fallback)
1415                 tcp_sk(smc->clcsock->sk)->syn_smc = 1;
1416
1417         rc = kernel_listen(smc->clcsock, backlog);
1418         if (rc)
1419                 goto out;
1420         sk->sk_max_ack_backlog = backlog;
1421         sk->sk_ack_backlog = 0;
1422         sk->sk_state = SMC_LISTEN;
1423         sock_hold(sk); /* sock_hold in tcp_listen_worker */
1424         if (!schedule_work(&smc->tcp_listen_work))
1425                 sock_put(sk);
1426
1427 out:
1428         release_sock(sk);
1429         return rc;
1430 }
1431
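/* accept() entry point: wait for a child sock on the accept queue; with
 * deferred accept configured, additionally wait for data to arrive
 */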
1432 static int smc_accept(struct socket *sock, struct socket *new_sock,
1433                       int flags, bool kern)
1434 {
1435         struct sock *sk = sock->sk, *nsk;
1436         DECLARE_WAITQUEUE(wait, current);
1437         struct smc_sock *lsmc;
1438         long timeo;
1439         int rc = 0;
1440
1441         lsmc = smc_sk(sk);
1442         sock_hold(sk); /* sock_put below */
1443         lock_sock(sk);
1444
1445         if (lsmc->sk.sk_state != SMC_LISTEN) {
1446                 rc = -EINVAL;
1447                 release_sock(sk);
1448                 goto out;
1449         }
1450
1451         /* Wait for an incoming connection */
1452         timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1453         add_wait_queue_exclusive(sk_sleep(sk), &wait);
1454         while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
1455                 set_current_state(TASK_INTERRUPTIBLE);
1456                 if (!timeo) {
1457                         rc = -EAGAIN;
1458                         break;
1459                 }
1460                 release_sock(sk);
1461                 timeo = schedule_timeout(timeo);
1462                 /* wakeup by sk_data_ready in smc_listen_work() */
1463                 sched_annotate_sleep();
1464                 lock_sock(sk);
1465                 if (signal_pending(current)) {
1466                         rc = sock_intr_errno(timeo);
1467                         break;
1468                 }
1469         }
1470         set_current_state(TASK_RUNNING);
1471         remove_wait_queue(sk_sleep(sk), &wait);
1472
1473         if (!rc)
1474                 rc = sock_error(nsk);
1475         release_sock(sk);
1476         if (rc)
1477                 goto out;
1478
1479         if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
1480                 /* wait till data arrives on the socket */
1481                 timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
1482                                                                 MSEC_PER_SEC);
1483                 if (smc_sk(nsk)->use_fallback) {
1484                         struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
1485
1486                         lock_sock(clcsk);
1487                         if (skb_queue_empty(&clcsk->sk_receive_queue))
1488                                 sk_wait_data(clcsk, &timeo, NULL);
1489                         release_sock(clcsk);
1490                 } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
1491                         lock_sock(nsk);
1492                         smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
1493                         release_sock(nsk);
1494                 }
1495         }
1496
1497 out:
1498         sock_put(sk); /* sock_hold above */
1499         return rc;
1500 }
1501
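/* getname() entry point: local and peer addresses are always taken from
 * the internal clcsock, since SMC reuses the TCP addressing.
 */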
1502 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1503                        int peer)
1504 {
1505         struct smc_sock *smc;
1506
1507         if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1508             (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1509                 return -ENOTCONN;
1510
1511         smc = smc_sk(sock->sk);
1512
1513         return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
1514 }
1515
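/* sendmsg() entry point: MSG_FASTOPEN forces a fallback to TCP while the
 * socket is still in SMC_INIT state; data is then sent either via the
 * clcsock (fallback) or via the SMC transmit path (smc_tx_sendmsg()).
 */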
1516 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1517 {
1518         struct sock *sk = sock->sk;
1519         struct smc_sock *smc;
1520         int rc = -EPIPE;
1521
1522         smc = smc_sk(sk);
1523         lock_sock(sk);
1524         if ((sk->sk_state != SMC_ACTIVE) &&
1525             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1526             (sk->sk_state != SMC_INIT))
1527                 goto out;
1528
1529         if (msg->msg_flags & MSG_FASTOPEN) {
1530                 if (sk->sk_state == SMC_INIT) {
1531                         smc_switch_to_fallback(smc);
1532                         smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1533                 } else {
1534                         rc = -EINVAL;
1535                         goto out;
1536                 }
1537         }
1538
1539         if (smc->use_fallback)
1540                 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1541         else
1542                 rc = smc_tx_sendmsg(smc, msg, len);
1543 out:
1544         release_sock(sk);
1545         return rc;
1546 }
1547
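/* recvmsg() entry point: a connection that was closed after having been
 * connected yields 0; otherwise data is received either via the clcsock
 * (fallback) or via the SMC receive path (smc_rx_recvmsg()).
 */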
1548 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1549                        int flags)
1550 {
1551         struct sock *sk = sock->sk;
1552         struct smc_sock *smc;
1553         int rc = -ENOTCONN;
1554
1555         smc = smc_sk(sk);
1556         lock_sock(sk);
1557         if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1558                 /* socket was connected before, no more data to read */
1559                 rc = 0;
1560                 goto out;
1561         }
1562         if ((sk->sk_state == SMC_INIT) ||
1563             (sk->sk_state == SMC_LISTEN) ||
1564             (sk->sk_state == SMC_CLOSED))
1565                 goto out;
1566
1567         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1568                 rc = 0;
1569                 goto out;
1570         }
1571
1572         if (smc->use_fallback) {
1573                 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1574         } else {
1575                 msg->msg_namelen = 0;
1576                 rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
1577         }
1578
1579 out:
1580         release_sock(sk);
1581         return rc;
1582 }
1583
1584 static __poll_t smc_accept_poll(struct sock *parent)
1585 {
1586         struct smc_sock *isk = smc_sk(parent);
1587         __poll_t mask = 0;
1588
1589         spin_lock(&isk->accept_q_lock);
1590         if (!list_empty(&isk->accept_q))
1591                 mask = EPOLLIN | EPOLLRDNORM;
1592         spin_unlock(&isk->accept_q_lock);
1593
1594         return mask;
1595 }
1596
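/* poll() entry point: fallback sockets delegate to the clcsock's poll;
 * otherwise the mask is derived from the SMC state, the accept queue
 * (for listening sockets), the available send buffer space, the number
 * of bytes to receive and the urgent data state.
 */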
1597 static __poll_t smc_poll(struct file *file, struct socket *sock,
1598                              poll_table *wait)
1599 {
1600         struct sock *sk = sock->sk;
1601         struct smc_sock *smc;
1602         __poll_t mask = 0;
1603
1604         if (!sk)
1605                 return EPOLLNVAL;
1606
1607         smc = smc_sk(sock->sk);
1608         if (smc->use_fallback) {
1609                 /* delegate to CLC child sock */
1610                 mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1611                 sk->sk_err = smc->clcsock->sk->sk_err;
1612         } else {
1613                 if (sk->sk_state != SMC_CLOSED)
1614                         sock_poll_wait(file, sock, wait);
1615                 if (sk->sk_err)
1616                         mask |= EPOLLERR;
1617                 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1618                     (sk->sk_state == SMC_CLOSED))
1619                         mask |= EPOLLHUP;
1620                 if (sk->sk_state == SMC_LISTEN) {
1621                         /* woken up by sk_data_ready in smc_listen_work() */
1622                         mask |= smc_accept_poll(sk);
1623                 } else if (smc->use_fallback) { /* as a result of connect_work() */
1624                         mask |= smc->clcsock->ops->poll(file, smc->clcsock,
1625                                                            wait);
1626                         sk->sk_err = smc->clcsock->sk->sk_err;
1627                 } else {
1628                         if ((sk->sk_state != SMC_INIT &&
1629                              atomic_read(&smc->conn.sndbuf_space)) ||
1630                             sk->sk_shutdown & SEND_SHUTDOWN) {
1631                                 mask |= EPOLLOUT | EPOLLWRNORM;
1632                         } else {
1633                                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1634                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1635                         }
1636                         if (atomic_read(&smc->conn.bytes_to_rcv))
1637                                 mask |= EPOLLIN | EPOLLRDNORM;
1638                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1639                                 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
1640                         if (sk->sk_state == SMC_APPCLOSEWAIT1)
1641                                 mask |= EPOLLIN;
1642                         if (smc->conn.urg_state == SMC_URG_VALID)
1643                                 mask |= EPOLLPRI;
1644                 }
1645         }
1646
1647         return mask;
1648 }
1649
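/* shutdown() entry point: fallback sockets forward the request to the
 * clcsock; otherwise the SMC close protocol is driven via
 * smc_close_active()/smc_close_shutdown_write() and the clcsock is shut
 * down as well.
 */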
1650 static int smc_shutdown(struct socket *sock, int how)
1651 {
1652         struct sock *sk = sock->sk;
1653         struct smc_sock *smc;
1654         int rc = -EINVAL;
1655         int rc1 = 0;
1656
1657         smc = smc_sk(sk);
1658
1659         if ((how < SHUT_RD) || (how > SHUT_RDWR))
1660                 return rc;
1661
1662         lock_sock(sk);
1663
1664         rc = -ENOTCONN;
1665         if ((sk->sk_state != SMC_ACTIVE) &&
1666             (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1667             (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1668             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1669             (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1670             (sk->sk_state != SMC_APPFINCLOSEWAIT))
1671                 goto out;
1672         if (smc->use_fallback) {
1673                 rc = kernel_sock_shutdown(smc->clcsock, how);
1674                 sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1675                 if (sk->sk_shutdown == SHUTDOWN_MASK)
1676                         sk->sk_state = SMC_CLOSED;
1677                 goto out;
1678         }
1679         switch (how) {
1680         case SHUT_RDWR:         /* shutdown in both directions */
1681                 rc = smc_close_active(smc);
1682                 break;
1683         case SHUT_WR:
1684                 rc = smc_close_shutdown_write(smc);
1685                 break;
1686         case SHUT_RD:
1687                 rc = 0;
1688                 /* nothing more to do because peer is not involved */
1689                 break;
1690         }
1691         if (smc->clcsock)
1692                 rc1 = kernel_sock_shutdown(smc->clcsock, how);
1693         /* map sock_shutdown_cmd constants to sk_shutdown value range */
1694         sk->sk_shutdown |= how + 1;
1695
1696 out:
1697         release_sock(sk);
1698         return rc ? rc : rc1;
1699 }
1700
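/* setsockopt() entry point: options are always forwarded to the clcsock;
 * in addition, TCP fastopen options force a fallback while in SMC_INIT
 * state, TCP_NODELAY and TCP_CORK may kick the transmit worker, and
 * TCP_DEFER_ACCEPT is recorded for use in smc_accept().
 */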
1701 static int smc_setsockopt(struct socket *sock, int level, int optname,
1702                           char __user *optval, unsigned int optlen)
1703 {
1704         struct sock *sk = sock->sk;
1705         struct smc_sock *smc;
1706         int val, rc;
1707
1708         smc = smc_sk(sk);
1709
1710         /* generic setsockopts reaching us here always apply to the
1711          * CLC socket
1712          */
1713         rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1714                                            optval, optlen);
1715         if (smc->clcsock->sk->sk_err) {
1716                 sk->sk_err = smc->clcsock->sk->sk_err;
1717                 sk->sk_error_report(sk);
1718         }
1719         if (rc)
1720                 return rc;
1721
1722         if (optlen < sizeof(int))
1723                 return -EINVAL;
1724         if (get_user(val, (int __user *)optval))
1725                 return -EFAULT;
1726
1727         lock_sock(sk);
1728         switch (optname) {
1729         case TCP_ULP:
1730         case TCP_FASTOPEN:
1731         case TCP_FASTOPEN_CONNECT:
1732         case TCP_FASTOPEN_KEY:
1733         case TCP_FASTOPEN_NO_COOKIE:
1734                 /* option not supported by SMC */
1735                 if (sk->sk_state == SMC_INIT) {
1736                         smc_switch_to_fallback(smc);
1737                         smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1738                 } else {
1739                         if (!smc->use_fallback)
1740                                 rc = -EINVAL;
1741                 }
1742                 break;
1743         case TCP_NODELAY:
1744                 if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
1745                         if (val && !smc->use_fallback)
1746                                 mod_delayed_work(system_wq, &smc->conn.tx_work,
1747                                                  0);
1748                 }
1749                 break;
1750         case TCP_CORK:
1751                 if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
1752                         if (!val && !smc->use_fallback)
1753                                 mod_delayed_work(system_wq, &smc->conn.tx_work,
1754                                                  0);
1755                 }
1756                 break;
1757         case TCP_DEFER_ACCEPT:
1758                 smc->sockopt_defer_accept = val;
1759                 break;
1760         default:
1761                 break;
1762         }
1763         release_sock(sk);
1764
1765         return rc;
1766 }
1767
1768 static int smc_getsockopt(struct socket *sock, int level, int optname,
1769                           char __user *optval, int __user *optlen)
1770 {
1771         struct smc_sock *smc;
1772
1773         smc = smc_sk(sock->sk);
1774         /* socket options apply to the CLC socket */
1775         return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1776                                              optval, optlen);
1777 }
1778
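/* ioctl() entry point: implements SIOCINQ, SIOCOUTQ, SIOCOUTQNSD and
 * SIOCATMARK based on the SMC connection buffers and cursors; fallback
 * sockets delegate to the clcsock.
 */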
1779 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1780                      unsigned long arg)
1781 {
1782         union smc_host_cursor cons, urg;
1783         struct smc_connection *conn;
1784         struct smc_sock *smc;
1785         int answ;
1786
1787         smc = smc_sk(sock->sk);
1788         conn = &smc->conn;
1789         lock_sock(&smc->sk);
1790         if (smc->use_fallback) {
1791                 if (!smc->clcsock) {
1792                         release_sock(&smc->sk);
1793                         return -EBADF;
1794                 }
1795                 answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1796                 release_sock(&smc->sk);
1797                 return answ;
1798         }
1799         switch (cmd) {
1800         case SIOCINQ: /* same as FIONREAD */
1801                 if (smc->sk.sk_state == SMC_LISTEN) {
1802                         release_sock(&smc->sk);
1803                         return -EINVAL;
1804                 }
1805                 if (smc->sk.sk_state == SMC_INIT ||
1806                     smc->sk.sk_state == SMC_CLOSED)
1807                         answ = 0;
1808                 else
1809                         answ = atomic_read(&smc->conn.bytes_to_rcv);
1810                 break;
1811         case SIOCOUTQ:
1812                 /* output queue size (not sent + not acked) */
1813                 if (smc->sk.sk_state == SMC_LISTEN) {
1814                         release_sock(&smc->sk);
1815                         return -EINVAL;
1816                 }
1817                 if (smc->sk.sk_state == SMC_INIT ||
1818                     smc->sk.sk_state == SMC_CLOSED)
1819                         answ = 0;
1820                 else
1821                         answ = smc->conn.sndbuf_desc->len -
1822                                         atomic_read(&smc->conn.sndbuf_space);
1823                 break;
1824         case SIOCOUTQNSD:
1825                 /* output queue size (not yet sent only) */
1826                 if (smc->sk.sk_state == SMC_LISTEN) {
1827                         release_sock(&smc->sk);
1828                         return -EINVAL;
1829                 }
1830                 if (smc->sk.sk_state == SMC_INIT ||
1831                     smc->sk.sk_state == SMC_CLOSED)
1832                         answ = 0;
1833                 else
1834                         answ = smc_tx_prepared_sends(&smc->conn);
1835                 break;
1836         case SIOCATMARK:
1837                 if (smc->sk.sk_state == SMC_LISTEN) {
1838                         release_sock(&smc->sk);
1839                         return -EINVAL;
1840                 }
1841                 if (smc->sk.sk_state == SMC_INIT ||
1842                     smc->sk.sk_state == SMC_CLOSED) {
1843                         answ = 0;
1844                 } else {
1845                         smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
1846                         smc_curs_copy(&urg, &conn->urg_curs, conn);
1847                         answ = smc_curs_diff(conn->rmb_desc->len,
1848                                              &cons, &urg) == 1;
1849                 }
1850                 break;
1851         default:
1852                 release_sock(&smc->sk);
1853                 return -ENOIOCTLCMD;
1854         }
1855         release_sock(&smc->sk);
1856
1857         return put_user(answ, (int __user *)arg);
1858 }
1859
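/* sendpage() entry point: only valid in SMC_ACTIVE state; fallback
 * sockets use kernel_sendpage() on the clcsock, native SMC sockets copy
 * the page data through the regular sendmsg path via sock_no_sendpage().
 */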
1860 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1861                             int offset, size_t size, int flags)
1862 {
1863         struct sock *sk = sock->sk;
1864         struct smc_sock *smc;
1865         int rc = -EPIPE;
1866
1867         smc = smc_sk(sk);
1868         lock_sock(sk);
1869         if (sk->sk_state != SMC_ACTIVE) {
1870                 release_sock(sk);
1871                 goto out;
1872         }
1873         release_sock(sk);
1874         if (smc->use_fallback)
1875                 rc = kernel_sendpage(smc->clcsock, page, offset,
1876                                      size, flags);
1877         else
1878                 rc = sock_no_sendpage(sock, page, offset, size, flags);
1879
1880 out:
1881         return rc;
1882 }
1883
1884 /* Map the affected portions of the rmbe into an spd, note the number of bytes
1885  * to splice in conn->splice_pending, and press 'go'. Consumer cursor updates
1886  * are delayed until the respective page has been fully processed.
1887  * Note that subsequent recv() calls have to wait until all splice() processing
1888  * has completed.
1889  */
1890 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1891                                struct pipe_inode_info *pipe, size_t len,
1892                                unsigned int flags)
1893 {
1894         struct sock *sk = sock->sk;
1895         struct smc_sock *smc;
1896         int rc = -ENOTCONN;
1897
1898         smc = smc_sk(sk);
1899         lock_sock(sk);
1900         if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1901                 /* socket was connected before, no more data to read */
1902                 rc = 0;
1903                 goto out;
1904         }
1905         if (sk->sk_state == SMC_INIT ||
1906             sk->sk_state == SMC_LISTEN ||
1907             sk->sk_state == SMC_CLOSED)
1908                 goto out;
1909
1910         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1911                 rc = 0;
1912                 goto out;
1913         }
1914
1915         if (smc->use_fallback) {
1916                 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1917                                                     pipe, len, flags);
1918         } else {
1919                 if (*ppos) {
1920                         rc = -ESPIPE;
1921                         goto out;
1922                 }
1923                 if (flags & SPLICE_F_NONBLOCK)
1924                         flags = MSG_DONTWAIT;
1925                 else
1926                         flags = 0;
1927                 rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
1928         }
1929 out:
1930         release_sock(sk);
1931
1932         return rc;
1933 }
1934
1935 /* must look like tcp */
1936 static const struct proto_ops smc_sock_ops = {
1937         .family         = PF_SMC,
1938         .owner          = THIS_MODULE,
1939         .release        = smc_release,
1940         .bind           = smc_bind,
1941         .connect        = smc_connect,
1942         .socketpair     = sock_no_socketpair,
1943         .accept         = smc_accept,
1944         .getname        = smc_getname,
1945         .poll           = smc_poll,
1946         .ioctl          = smc_ioctl,
1947         .listen         = smc_listen,
1948         .shutdown       = smc_shutdown,
1949         .setsockopt     = smc_setsockopt,
1950         .getsockopt     = smc_getsockopt,
1951         .sendmsg        = smc_sendmsg,
1952         .recvmsg        = smc_recvmsg,
1953         .mmap           = sock_no_mmap,
1954         .sendpage       = smc_sendpage,
1955         .splice_read    = smc_splice_read,
1956 };
1957
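/* socket() entry point for PF_SMC: allocate the smc_sock and create the
 * internal TCP clcsock used for the CLC handshake and for TCP fallback;
 * SMCPROTO_SMC selects an AF_INET clcsock, SMCPROTO_SMC6 an AF_INET6 one.
 */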
1958 static int smc_create(struct net *net, struct socket *sock, int protocol,
1959                       int kern)
1960 {
1961         int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
1962         struct smc_sock *smc;
1963         struct sock *sk;
1964         int rc;
1965
1966         rc = -ESOCKTNOSUPPORT;
1967         if (sock->type != SOCK_STREAM)
1968                 goto out;
1969
1970         rc = -EPROTONOSUPPORT;
1971         if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
1972                 goto out;
1973
1974         rc = -ENOBUFS;
1975         sock->ops = &smc_sock_ops;
1976         sk = smc_sock_alloc(net, sock, protocol);
1977         if (!sk)
1978                 goto out;
1979
1980         /* create internal TCP socket for CLC handshake and fallback */
1981         smc = smc_sk(sk);
1982         smc->use_fallback = false; /* assume rdma capability first */
1983         smc->fallback_rsn = 0;
1984         rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
1985                               &smc->clcsock);
1986         if (rc) {
1987                 sk_common_release(sk);
1988                 goto out;
1989         }
1990         smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1991         smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1992
1993 out:
1994         return rc;
1995 }
1996
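/* Illustrative user-space sketch (not part of this file): an application
 * can request SMC explicitly instead of relying on a transparent preload.
 * It assumes AF_SMC and the SMCPROTO_SMC/SMCPROTO_SMC6 constants are
 * available to user space (defined locally if the libc headers lack them):
 *
 *	int fd  = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);	/* IPv4 peer */
 *	int fd6 = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC6);	/* IPv6 peer */
 *
 * bind()/connect()/listen()/accept() then behave like AF_INET(6) TCP
 * sockets; if the SMC handshake cannot be completed, the connection
 * transparently falls back to plain TCP via the internal clcsock.
 */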
1997 static const struct net_proto_family smc_sock_family_ops = {
1998         .family = PF_SMC,
1999         .owner  = THIS_MODULE,
2000         .create = smc_create,
2001 };
2002
2003 unsigned int smc_net_id;
2004
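/* per-netns init/exit: each network namespace gets its own pnet table,
 * set up and torn down via smc_pnet_net_init()/smc_pnet_net_exit().
 */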
2005 static __net_init int smc_net_init(struct net *net)
2006 {
2007         return smc_pnet_net_init(net);
2008 }
2009
2010 static void __net_exit smc_net_exit(struct net *net)
2011 {
2012         smc_pnet_net_exit(net);
2013 }
2014
2015 static struct pernet_operations smc_net_ops = {
2016         .init = smc_net_init,
2017         .exit = smc_net_exit,
2018         .id   = &smc_net_id,
2019         .size = sizeof(struct smc_net),
2020 };
2021
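/* module init: register the pernet subsystem and the pnet tables, set up
 * LLC and CDC message handling, register the SMC v4/v6 protos and the
 * PF_SMC socket family, register with the IB core and finally enable the
 * tcp_have_smc static branch so TCP advertises SMC capability.
 */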
2022 static int __init smc_init(void)
2023 {
2024         int rc;
2025
2026         rc = register_pernet_subsys(&smc_net_ops);
2027         if (rc)
2028                 return rc;
2029
2030         rc = smc_pnet_init();
2031         if (rc)
2032                 return rc;
2033
2034         rc = smc_llc_init();
2035         if (rc) {
2036                 pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
2037                 goto out_pnet;
2038         }
2039
2040         rc = smc_cdc_init();
2041         if (rc) {
2042                 pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
2043                 goto out_pnet;
2044         }
2045
2046         rc = proto_register(&smc_proto, 1);
2047         if (rc) {
2048                 pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
2049                 goto out_pnet;
2050         }
2051
2052         rc = proto_register(&smc_proto6, 1);
2053         if (rc) {
2054                 pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
2055                 goto out_proto;
2056         }
2057
2058         rc = sock_register(&smc_sock_family_ops);
2059         if (rc) {
2060                 pr_err("%s: sock_register fails with %d\n", __func__, rc);
2061                 goto out_proto6;
2062         }
2063         INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
2064         INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
2065
2066         rc = smc_ib_register_client();
2067         if (rc) {
2068                 pr_err("%s: ib_register fails with %d\n", __func__, rc);
2069                 goto out_sock;
2070         }
2071
2072         static_branch_enable(&tcp_have_smc);
2073         return 0;
2074
2075 out_sock:
2076         sock_unregister(PF_SMC);
2077 out_proto6:
2078         proto_unregister(&smc_proto6);
2079 out_proto:
2080         proto_unregister(&smc_proto);
2081 out_pnet:
2082         smc_pnet_exit();
2083         return rc;
2084 }
2085
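/* module exit: release remaining link groups via smc_core_exit() and
 * unregister everything that smc_init() set up.
 */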
2086 static void __exit smc_exit(void)
2087 {
2088         smc_core_exit();
2089         static_branch_disable(&tcp_have_smc);
2090         smc_ib_unregister_client();
2091         sock_unregister(PF_SMC);
2092         proto_unregister(&smc_proto6);
2093         proto_unregister(&smc_proto);
2094         smc_pnet_exit();
2095         unregister_pernet_subsys(&smc_net_ops);
2096 }
2097
2098 module_init(smc_init);
2099 module_exit(smc_exit);
2100
2101 MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
2102 MODULE_DESCRIPTION("smc socket address family");
2103 MODULE_LICENSE("GPL");
2104 MODULE_ALIAS_NETPROTO(PF_SMC);