net/smc/af_smc.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
4  *
5  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
6  *  applies to SOCK_STREAM sockets only
7  *  offers an alternative communication option for TCP-protocol sockets
8  *  applicable with RoCE-cards only
9  *
10  *  Initial restrictions:
11  *    - support for alternate links postponed
12  *
13  *  Copyright IBM Corp. 2016, 2018
14  *
15  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
16  *              based on prototype from Frank Blaschka
17  */
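/*
 * Minimal user-space usage sketch (illustration only, not part of this file):
 * an SMC socket is created like a TCP socket, but with the AF_SMC family and
 * an SMCPROTO_SMC / SMCPROTO_SMC6 protocol value, while the address format
 * stays the usual sockaddr_in / sockaddr_in6:
 *
 *	int fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 *
 * If no SMC-capable peer or device is found, the connection transparently
 * falls back to plain TCP on the internal clc socket.
 */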
18
19 #define KMSG_COMPONENT "smc"
20 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
21
22 #include <linux/module.h>
23 #include <linux/socket.h>
24 #include <linux/workqueue.h>
25 #include <linux/in.h>
26 #include <linux/sched/signal.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rcupdate_wait.h>
29
30 #include <net/sock.h>
31 #include <net/tcp.h>
32 #include <net/smc.h>
33 #include <asm/ioctls.h>
34
35 #include <net/net_namespace.h>
36 #include <net/netns/generic.h>
37 #include "smc_netns.h"
38
39 #include "smc.h"
40 #include "smc_clc.h"
41 #include "smc_llc.h"
42 #include "smc_cdc.h"
43 #include "smc_core.h"
44 #include "smc_ib.h"
45 #include "smc_ism.h"
46 #include "smc_pnet.h"
47 #include "smc_tx.h"
48 #include "smc_rx.h"
49 #include "smc_close.h"
50
51 static DEFINE_MUTEX(smc_server_lgr_pending);    /* serialize link group
52                                                  * creation on server
53                                                  */
54 static DEFINE_MUTEX(smc_client_lgr_pending);    /* serialize link group
55                                                  * creation on client
56                                                  */
57
58 static void smc_tcp_listen_work(struct work_struct *);
59 static void smc_connect_work(struct work_struct *);
60
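/* forward a keepalive setting to the internal clc (TCP) socket */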
61 static void smc_set_keepalive(struct sock *sk, int val)
62 {
63         struct smc_sock *smc = smc_sk(sk);
64
65         smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
66 }
67
68 static struct smc_hashinfo smc_v4_hashinfo = {
69         .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
70 };
71
72 static struct smc_hashinfo smc_v6_hashinfo = {
73         .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
74 };
75
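/* insert an smc sock into its protocol hash table and update the inuse counter */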
76 int smc_hash_sk(struct sock *sk)
77 {
78         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
79         struct hlist_head *head;
80
81         head = &h->ht;
82
83         write_lock_bh(&h->lock);
84         sk_add_node(sk, head);
85         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
86         write_unlock_bh(&h->lock);
87
88         return 0;
89 }
90 EXPORT_SYMBOL_GPL(smc_hash_sk);
91
92 void smc_unhash_sk(struct sock *sk)
93 {
94         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
95
96         write_lock_bh(&h->lock);
97         if (sk_del_node_init(sk))
98                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
99         write_unlock_bh(&h->lock);
100 }
101 EXPORT_SYMBOL_GPL(smc_unhash_sk);
102
103 struct proto smc_proto = {
104         .name           = "SMC",
105         .owner          = THIS_MODULE,
106         .keepalive      = smc_set_keepalive,
107         .hash           = smc_hash_sk,
108         .unhash         = smc_unhash_sk,
109         .obj_size       = sizeof(struct smc_sock),
110         .h.smc_hash     = &smc_v4_hashinfo,
111         .slab_flags     = SLAB_TYPESAFE_BY_RCU,
112 };
113 EXPORT_SYMBOL_GPL(smc_proto);
114
115 struct proto smc_proto6 = {
116         .name           = "SMC6",
117         .owner          = THIS_MODULE,
118         .keepalive      = smc_set_keepalive,
119         .hash           = smc_hash_sk,
120         .unhash         = smc_unhash_sk,
121         .obj_size       = sizeof(struct smc_sock),
122         .h.smc_hash     = &smc_v6_hashinfo,
123         .slab_flags     = SLAB_TYPESAFE_BY_RCU,
124 };
125 EXPORT_SYMBOL_GPL(smc_proto6);
126
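/* for a fallback socket, let the socket file point back at the smc socket
 * before the clc socket is released
 */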
127 static void smc_restore_fallback_changes(struct smc_sock *smc)
128 {
129         if (smc->clcsock->file) { /* non-accepted sockets have no file yet */
130                 smc->clcsock->file->private_data = smc->sk.sk_socket;
131                 smc->clcsock->file = NULL;
132         }
133 }
134
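/* release an smc sock with the sock lock held: close the SMC connection,
 * or, for fallback sockets, the internal clc socket
 */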
135 static int __smc_release(struct smc_sock *smc)
136 {
137         struct sock *sk = &smc->sk;
138         int rc = 0;
139
140         if (!smc->use_fallback) {
141                 rc = smc_close_active(smc);
142                 sock_set_flag(sk, SOCK_DEAD);
143                 sk->sk_shutdown |= SHUTDOWN_MASK;
144         } else {
145                 if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
146                         sock_put(sk); /* passive closing */
147                 if (sk->sk_state == SMC_LISTEN) {
148                         /* wake up clcsock accept */
149                         rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
150                 }
151                 sk->sk_state = SMC_CLOSED;
152                 sk->sk_state_change(sk);
153                 smc_restore_fallback_changes(smc);
154         }
155
156         sk->sk_prot->unhash(sk);
157
158         if (sk->sk_state == SMC_CLOSED) {
159                 if (smc->clcsock) {
160                         release_sock(sk);
161                         smc_clcsock_release(smc);
162                         lock_sock(sk);
163                 }
164                 if (!smc->use_fallback)
165                         smc_conn_free(&smc->conn);
166         }
167
168         return rc;
169 }
170
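/* socket close entry point: abort a dangling non-blocking connect,
 * then release the sock under the sock lock
 */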
171 static int smc_release(struct socket *sock)
172 {
173         struct sock *sk = sock->sk;
174         struct smc_sock *smc;
175         int rc = 0;
176
177         if (!sk)
178                 goto out;
179
180         sock_hold(sk); /* sock_put below */
181         smc = smc_sk(sk);
182
183         /* cleanup for a dangling non-blocking connect */
184         if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
185                 tcp_abort(smc->clcsock->sk, ECONNABORTED);
186         flush_work(&smc->connect_work);
187
188         if (sk->sk_state == SMC_LISTEN)
189                 /* smc_close_non_accepted() is called and acquires
190                  * sock lock for child sockets again
191                  */
192                 lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
193         else
194                 lock_sock(sk);
195
196         rc = __smc_release(smc);
197
198         /* detach socket */
199         sock_orphan(sk);
200         sock->sk = NULL;
201         release_sock(sk);
202
203         sock_put(sk); /* sock_hold above */
204         sock_put(sk); /* final sock_put */
205 out:
206         return rc;
207 }
208
209 static void smc_destruct(struct sock *sk)
210 {
211         if (sk->sk_state != SMC_CLOSED)
212                 return;
213         if (!sock_flag(sk, SOCK_DEAD))
214                 return;
215
216         sk_refcnt_debug_dec(sk);
217 }
218
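/* allocate a new smc sock and initialize its state, work items and locks */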
219 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
220                                    int protocol)
221 {
222         struct smc_sock *smc;
223         struct proto *prot;
224         struct sock *sk;
225
226         prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
227         sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
228         if (!sk)
229                 return NULL;
230
231         sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
232         sk->sk_state = SMC_INIT;
233         sk->sk_destruct = smc_destruct;
234         sk->sk_protocol = protocol;
235         smc = smc_sk(sk);
236         INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
237         INIT_WORK(&smc->connect_work, smc_connect_work);
238         INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
239         INIT_LIST_HEAD(&smc->accept_q);
240         spin_lock_init(&smc->accept_q_lock);
241         spin_lock_init(&smc->conn.send_lock);
242         sk->sk_prot->hash(sk);
243         sk_refcnt_debug_inc(sk);
244         mutex_init(&smc->clcsock_release_lock);
245
246         return sk;
247 }
248
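/* bind: validate the address like inet_bind() would and forward the bind
 * to the internal clc socket
 */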
249 static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
250                     int addr_len)
251 {
252         struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
253         struct sock *sk = sock->sk;
254         struct smc_sock *smc;
255         int rc;
256
257         smc = smc_sk(sk);
258
259         /* replicate tests from inet_bind(), to be safe wrt. future changes */
260         rc = -EINVAL;
261         if (addr_len < sizeof(struct sockaddr_in))
262                 goto out;
263
264         rc = -EAFNOSUPPORT;
265         if (addr->sin_family != AF_INET &&
266             addr->sin_family != AF_INET6 &&
267             addr->sin_family != AF_UNSPEC)
268                 goto out;
269         /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
270         if (addr->sin_family == AF_UNSPEC &&
271             addr->sin_addr.s_addr != htonl(INADDR_ANY))
272                 goto out;
273
274         lock_sock(sk);
275
276         /* Check if socket is already active */
277         rc = -EINVAL;
278         if (sk->sk_state != SMC_INIT || smc->connect_nonblock)
279                 goto out_rel;
280
281         smc->clcsock->sk->sk_reuse = sk->sk_reuse;
282         rc = kernel_bind(smc->clcsock, uaddr, addr_len);
283
284 out_rel:
285         release_sock(sk);
286 out:
287         return rc;
288 }
289
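/* copy selected sock fields and the flag bits given in mask from osk to nsk */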
290 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
291                                    unsigned long mask)
292 {
293         /* options we don't get control of via setsockopt */
294         nsk->sk_type = osk->sk_type;
295         nsk->sk_sndbuf = osk->sk_sndbuf;
296         nsk->sk_rcvbuf = osk->sk_rcvbuf;
297         nsk->sk_sndtimeo = osk->sk_sndtimeo;
298         nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
299         nsk->sk_mark = osk->sk_mark;
300         nsk->sk_priority = osk->sk_priority;
301         nsk->sk_rcvlowat = osk->sk_rcvlowat;
302         nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
303         nsk->sk_err = osk->sk_err;
304
305         nsk->sk_flags &= ~mask;
306         nsk->sk_flags |= osk->sk_flags & mask;
307 }
308
309 #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
310                              (1UL << SOCK_KEEPOPEN) | \
311                              (1UL << SOCK_LINGER) | \
312                              (1UL << SOCK_BROADCAST) | \
313                              (1UL << SOCK_TIMESTAMP) | \
314                              (1UL << SOCK_DBG) | \
315                              (1UL << SOCK_RCVTSTAMP) | \
316                              (1UL << SOCK_RCVTSTAMPNS) | \
317                              (1UL << SOCK_LOCALROUTE) | \
318                              (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
319                              (1UL << SOCK_RXQ_OVFL) | \
320                              (1UL << SOCK_WIFI_STATUS) | \
321                              (1UL << SOCK_NOFCS) | \
322                              (1UL << SOCK_FILTER_LOCKED) | \
323                              (1UL << SOCK_TSTAMP_NEW))
324 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
325  * clc socket (since smc is not called for these options from net/core)
326  */
327 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
328 {
329         smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
330 }
331
332 #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
333                              (1UL << SOCK_KEEPOPEN) | \
334                              (1UL << SOCK_LINGER) | \
335                              (1UL << SOCK_DBG))
336 /* copy only settings and flags relevant for smc from clc to smc socket */
337 static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
338 {
339         smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
340 }
341
342 /* register the new rmb on all links */
343 static int smcr_lgr_reg_rmbs(struct smc_link *link,
344                              struct smc_buf_desc *rmb_desc)
345 {
346         struct smc_link_group *lgr = link->lgr;
347         int i, rc = 0;
348
349         rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
350         if (rc)
351                 return rc;
352         /* protect against parallel smc_llc_cli_rkey_exchange() and
353          * parallel smcr_link_reg_rmb()
354          */
355         mutex_lock(&lgr->llc_conf_mutex);
356         for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
357                 if (!smc_link_active(&lgr->lnk[i]))
358                         continue;
359                 rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc);
360                 if (rc)
361                         goto out;
362         }
363
364         /* exchange confirm_rkey msg with peer */
365         rc = smc_llc_do_confirm_rkey(link, rmb_desc);
366         if (rc) {
367                 rc = -EFAULT;
368                 goto out;
369         }
370         rmb_desc->is_conf_rkey = true;
371 out:
372         mutex_unlock(&lgr->llc_conf_mutex);
373         smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
374         return rc;
375 }
376
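/* client part of the CONFIRM LINK handshake for the first link of a new
 * link group
 */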
377 static int smcr_clnt_conf_first_link(struct smc_sock *smc)
378 {
379         struct smc_link *link = smc->conn.lnk;
380         struct smc_llc_qentry *qentry;
381         int rc;
382
383         /* receive CONFIRM LINK request from server over RoCE fabric */
384         qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
385                               SMC_LLC_CONFIRM_LINK);
386         if (!qentry) {
387                 struct smc_clc_msg_decline dclc;
388
389                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
390                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
391                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
392         }
393         smc_llc_save_peer_uid(qentry);
394         rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ);
395         smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
396         if (rc)
397                 return SMC_CLC_DECL_RMBE_EC;
398
399         rc = smc_ib_modify_qp_rts(link);
400         if (rc)
401                 return SMC_CLC_DECL_ERR_RDYLNK;
402
403         smc_wr_remember_qp_attr(link);
404
405         if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
406                 return SMC_CLC_DECL_ERR_REGRMB;
407
408         /* confirm_rkey is implicit on 1st contact */
409         smc->conn.rmb_desc->is_conf_rkey = true;
410
411         /* send CONFIRM LINK response over RoCE fabric */
412         rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
413         if (rc < 0)
414                 return SMC_CLC_DECL_TIMEOUT_CL;
415
416         smc_llc_link_active(link);
417         smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
418
419         /* optional 2nd link, receive ADD LINK request from server */
420         qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
421                               SMC_LLC_ADD_LINK);
422         if (!qentry) {
423                 struct smc_clc_msg_decline dclc;
424
425                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
426                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
427                 if (rc == -EAGAIN)
428                         rc = 0; /* no DECLINE received, go with one link */
429                 return rc;
430         }
431         smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl);
432         smc_llc_cli_add_link(link, qentry);
433         return 0;
434 }
435
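/* save the peer's RMB parameters received in the CLC accept/confirm
 * message (SMC-R)
 */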
436 static void smcr_conn_save_peer_info(struct smc_sock *smc,
437                                      struct smc_clc_msg_accept_confirm *clc)
438 {
439         int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
440
441         smc->conn.peer_rmbe_idx = clc->rmbe_idx;
442         smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
443         smc->conn.peer_rmbe_size = bufsize;
444         atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
445         smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
446 }
447
448 static void smcd_conn_save_peer_info(struct smc_sock *smc,
449                                      struct smc_clc_msg_accept_confirm *clc)
450 {
451         int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
452
453         smc->conn.peer_rmbe_idx = clc->dmbe_idx;
454         smc->conn.peer_token = clc->token;
455         /* msg header takes up space in the buffer */
456         smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
457         atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
458         smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
459 }
460
461 static void smc_conn_save_peer_info(struct smc_sock *smc,
462                                     struct smc_clc_msg_accept_confirm *clc)
463 {
464         if (smc->conn.lgr->is_smcd)
465                 smcd_conn_save_peer_info(smc, clc);
466         else
467                 smcr_conn_save_peer_info(smc, clc);
468 }
469
470 static void smc_link_save_peer_info(struct smc_link *link,
471                                     struct smc_clc_msg_accept_confirm *clc)
472 {
473         link->peer_qpn = ntoh24(clc->qpn);
474         memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
475         memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
476         link->peer_psn = ntoh24(clc->psn);
477         link->peer_mtu = clc->qp_mtu;
478 }
479
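/* switch the connection to TCP fallback mode: the application's file is
 * redirected to the internal clc socket
 */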
480 static void smc_switch_to_fallback(struct smc_sock *smc)
481 {
482         smc->use_fallback = true;
483         if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
484                 smc->clcsock->file = smc->sk.sk_socket->file;
485                 smc->clcsock->file->private_data = smc->clcsock;
486                 smc->clcsock->wq.fasync_list =
487                         smc->sk.sk_socket->wq.fasync_list;
488         }
489 }
490
491 /* fall back during connect */
492 static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
493 {
494         smc_switch_to_fallback(smc);
495         smc->fallback_rsn = reason_code;
496         smc_copy_sock_settings_to_clc(smc);
497         smc->connect_nonblock = 0;
498         if (smc->sk.sk_state == SMC_INIT)
499                 smc->sk.sk_state = SMC_ACTIVE;
500         return 0;
501 }
502
503 /* decline and fall back during connect */
504 static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
505 {
506         int rc;
507
508         if (reason_code < 0) { /* error, fallback is not possible */
509                 if (smc->sk.sk_state == SMC_INIT)
510                         sock_put(&smc->sk); /* passive closing */
511                 return reason_code;
512         }
513         if (reason_code != SMC_CLC_DECL_PEERDECL) {
514                 rc = smc_clc_send_decline(smc, reason_code);
515                 if (rc < 0) {
516                         if (smc->sk.sk_state == SMC_INIT)
517                                 sock_put(&smc->sk); /* passive closing */
518                         return rc;
519                 }
520         }
521         return smc_connect_fallback(smc, reason_code);
522 }
523
524 /* abort connecting */
525 static int smc_connect_abort(struct smc_sock *smc, int reason_code,
526                              int local_contact)
527 {
528         bool is_smcd = smc->conn.lgr->is_smcd;
529
530         if (local_contact == SMC_FIRST_CONTACT)
531                 smc_lgr_cleanup_early(&smc->conn);
532         else
533                 smc_conn_free(&smc->conn);
534         if (is_smcd)
535                 /* there is only one lgr role for SMC-D; use server lock */
536                 mutex_unlock(&smc_server_lgr_pending);
537         else
538                 mutex_unlock(&smc_client_lgr_pending);
539
540         smc->connect_nonblock = 0;
541         return reason_code;
542 }
543
544 /* check if there is an RDMA device available for this connection. */
545 /* called for connect and listen */
546 static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
547 {
548         /* PNET table look up: search active ib_device and port
549          * within same PNETID that also contains the ethernet device
550          * used for the internal TCP socket
551          */
552         smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
553         if (!ini->ib_dev)
554                 return SMC_CLC_DECL_NOSMCRDEV;
555         return 0;
556 }
557
558 /* check if there is an ISM device available for this connection. */
559 /* called for connect and listen */
560 static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
561 {
562         /* Find ISM device with same PNETID as connecting interface  */
563         smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
564         if (!ini->ism_dev)
565                 return SMC_CLC_DECL_NOSMCDDEV;
566         return 0;
567 }
568
569 /* Check for VLAN ID and register it on ISM device just for CLC handshake */
570 static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
571                                       struct smc_init_info *ini)
572 {
573         if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev, ini->vlan_id))
574                 return SMC_CLC_DECL_ISMVLANERR;
575         return 0;
576 }
577
578 /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
579  * used, the VLAN ID will be registered again during the connection setup.
580  */
581 static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
582                                         struct smc_init_info *ini)
583 {
584         if (!is_smcd)
585                 return 0;
586         if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev, ini->vlan_id))
587                 return SMC_CLC_DECL_CNFERR;
588         return 0;
589 }
590
591 /* CLC handshake during connect */
592 static int smc_connect_clc(struct smc_sock *smc, int smc_type,
593                            struct smc_clc_msg_accept_confirm *aclc,
594                            struct smc_init_info *ini)
595 {
596         int rc = 0;
597
598         /* do inband token exchange */
599         rc = smc_clc_send_proposal(smc, smc_type, ini);
600         if (rc)
601                 return rc;
602         /* receive SMC Accept CLC message */
603         return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT,
604                                 CLC_WAIT_TIME);
605 }
606
607 /* setup for RDMA connection of client */
608 static int smc_connect_rdma(struct smc_sock *smc,
609                             struct smc_clc_msg_accept_confirm *aclc,
610                             struct smc_init_info *ini)
611 {
612         int i, reason_code = 0;
613         struct smc_link *link;
614
615         ini->is_smcd = false;
616         ini->ib_lcl = &aclc->lcl;
617         ini->ib_clcqpn = ntoh24(aclc->qpn);
618         ini->srv_first_contact = aclc->hdr.flag;
619
620         mutex_lock(&smc_client_lgr_pending);
621         reason_code = smc_conn_create(smc, ini);
622         if (reason_code) {
623                 mutex_unlock(&smc_client_lgr_pending);
624                 return reason_code;
625         }
626
627         smc_conn_save_peer_info(smc, aclc);
628
629         if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
630                 link = smc->conn.lnk;
631         } else {
632                 /* set link that was assigned by server */
633                 link = NULL;
634                 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
635                         struct smc_link *l = &smc->conn.lgr->lnk[i];
636
637                         if (l->peer_qpn == ntoh24(aclc->qpn) &&
638                             !memcmp(l->peer_gid, &aclc->lcl.gid, SMC_GID_SIZE) &&
639                             !memcmp(l->peer_mac, &aclc->lcl.mac, sizeof(l->peer_mac))) {
640                                 link = l;
641                                 break;
642                         }
643                 }
644                 if (!link)
645                         return smc_connect_abort(smc, SMC_CLC_DECL_NOSRVLINK,
646                                                  ini->cln_first_contact);
647                 smc->conn.lnk = link;
648         }
649
650         /* create send buffer and rmb */
651         if (smc_buf_create(smc, false))
652                 return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
653                                          ini->cln_first_contact);
654
655         if (ini->cln_first_contact == SMC_FIRST_CONTACT)
656                 smc_link_save_peer_info(link, aclc);
657
658         if (smc_rmb_rtoken_handling(&smc->conn, link, aclc))
659                 return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
660                                          ini->cln_first_contact);
661
662         smc_close_init(smc);
663         smc_rx_init(smc);
664
665         if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
666                 if (smc_ib_ready_link(link))
667                         return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
668                                                  ini->cln_first_contact);
669         } else {
670                 if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc))
671                         return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
672                                                  ini->cln_first_contact);
673         }
674         smc_rmb_sync_sg_for_device(&smc->conn);
675
676         reason_code = smc_clc_send_confirm(smc);
677         if (reason_code)
678                 return smc_connect_abort(smc, reason_code,
679                                          ini->cln_first_contact);
680
681         smc_tx_init(smc);
682
683         if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
684                 /* QP confirmation over RoCE fabric */
685                 smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
686                 reason_code = smcr_clnt_conf_first_link(smc);
687                 smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
688                 if (reason_code)
689                         return smc_connect_abort(smc, reason_code,
690                                                  ini->cln_first_contact);
691         }
692         mutex_unlock(&smc_client_lgr_pending);
693
694         smc_copy_sock_settings_to_clc(smc);
695         smc->connect_nonblock = 0;
696         if (smc->sk.sk_state == SMC_INIT)
697                 smc->sk.sk_state = SMC_ACTIVE;
698
699         return 0;
700 }
701
702 /* setup for ISM connection of client */
703 static int smc_connect_ism(struct smc_sock *smc,
704                            struct smc_clc_msg_accept_confirm *aclc,
705                            struct smc_init_info *ini)
706 {
707         int rc = 0;
708
709         ini->is_smcd = true;
710         ini->ism_gid = aclc->gid;
711         ini->srv_first_contact = aclc->hdr.flag;
712
713         /* there is only one lgr role for SMC-D; use server lock */
714         mutex_lock(&smc_server_lgr_pending);
715         rc = smc_conn_create(smc, ini);
716         if (rc) {
717                 mutex_unlock(&smc_server_lgr_pending);
718                 return rc;
719         }
720
721         /* Create send and receive buffers */
722         rc = smc_buf_create(smc, true);
723         if (rc)
724                 return smc_connect_abort(smc, (rc == -ENOSPC) ?
725                                               SMC_CLC_DECL_MAX_DMB :
726                                               SMC_CLC_DECL_MEM,
727                                          ini->cln_first_contact);
728
729         smc_conn_save_peer_info(smc, aclc);
730         smc_close_init(smc);
731         smc_rx_init(smc);
732         smc_tx_init(smc);
733
734         rc = smc_clc_send_confirm(smc);
735         if (rc)
736                 return smc_connect_abort(smc, rc, ini->cln_first_contact);
737         mutex_unlock(&smc_server_lgr_pending);
738
739         smc_copy_sock_settings_to_clc(smc);
740         smc->connect_nonblock = 0;
741         if (smc->sk.sk_state == SMC_INIT)
742                 smc->sk.sk_state = SMC_ACTIVE;
743
744         return 0;
745 }
746
747 /* perform steps before actually connecting */
748 static int __smc_connect(struct smc_sock *smc)
749 {
750         bool ism_supported = false, rdma_supported = false;
751         struct smc_clc_msg_accept_confirm aclc;
752         struct smc_init_info ini = {0};
753         int smc_type;
754         int rc = 0;
755
756         if (smc->use_fallback)
757                 return smc_connect_fallback(smc, smc->fallback_rsn);
758
759         /* if peer has not signalled SMC-capability, fall back */
760         if (!tcp_sk(smc->clcsock->sk)->syn_smc)
761                 return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
762
763         /* IPSec connections opt out of SMC-R optimizations */
764         if (using_ipsec(smc))
765                 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
766
767         /* get vlan id from IP device */
768         if (smc_vlan_by_tcpsk(smc->clcsock, &ini))
769                 return smc_connect_decline_fallback(smc,
770                                                     SMC_CLC_DECL_GETVLANERR);
771
772         /* check if there is an ism device available */
773         if (!smc_find_ism_device(smc, &ini) &&
774             !smc_connect_ism_vlan_setup(smc, &ini)) {
775                 /* ISM is supported for this connection */
776                 ism_supported = true;
777                 smc_type = SMC_TYPE_D;
778         }
779
780         /* check if there is an RDMA device available */
781         if (!smc_find_rdma_device(smc, &ini)) {
782                 /* RDMA is supported for this connection */
783                 rdma_supported = true;
784                 if (ism_supported)
785                         smc_type = SMC_TYPE_B; /* both */
786                 else
787                         smc_type = SMC_TYPE_R; /* only RDMA */
788         }
789
790         /* if neither ISM nor RDMA are supported, fallback */
791         if (!rdma_supported && !ism_supported)
792                 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
793
794         /* perform CLC handshake */
795         rc = smc_connect_clc(smc, smc_type, &aclc, &ini);
796         if (rc) {
797                 smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
798                 return smc_connect_decline_fallback(smc, rc);
799         }
800
801         /* depending on previous steps, connect using rdma or ism */
802         if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
803                 rc = smc_connect_rdma(smc, &aclc, &ini);
804         else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
805                 rc = smc_connect_ism(smc, &aclc, &ini);
806         else
807                 rc = SMC_CLC_DECL_MODEUNSUPP;
808         if (rc) {
809                 smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
810                 return smc_connect_decline_fallback(smc, rc);
811         }
812
813         smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
814         return 0;
815 }
816
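/* worker for non-blocking connects: wait for the clc (TCP) connect to
 * complete, then perform the SMC handshake
 */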
817 static void smc_connect_work(struct work_struct *work)
818 {
819         struct smc_sock *smc = container_of(work, struct smc_sock,
820                                             connect_work);
821         long timeo = smc->sk.sk_sndtimeo;
822         int rc = 0;
823
824         if (!timeo)
825                 timeo = MAX_SCHEDULE_TIMEOUT;
826         lock_sock(smc->clcsock->sk);
827         if (smc->clcsock->sk->sk_err) {
828                 smc->sk.sk_err = smc->clcsock->sk->sk_err;
829         } else if ((1 << smc->clcsock->sk->sk_state) &
830                                         (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
831                 rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
832                 if ((rc == -EPIPE) &&
833                     ((1 << smc->clcsock->sk->sk_state) &
834                                         (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
835                         rc = 0;
836         }
837         release_sock(smc->clcsock->sk);
838         lock_sock(&smc->sk);
839         if (rc != 0 || smc->sk.sk_err) {
840                 smc->sk.sk_state = SMC_CLOSED;
841                 if (rc == -EPIPE || rc == -EAGAIN)
842                         smc->sk.sk_err = EPIPE;
843                 else if (signal_pending(current))
844                         smc->sk.sk_err = -sock_intr_errno(timeo);
845                 sock_put(&smc->sk); /* passive closing */
846                 goto out;
847         }
848
849         rc = __smc_connect(smc);
850         if (rc < 0)
851                 smc->sk.sk_err = -rc;
852
853 out:
854         if (!sock_flag(&smc->sk, SOCK_DEAD)) {
855                 if (smc->sk.sk_err) {
856                         smc->sk.sk_state_change(&smc->sk);
857                 } else { /* allow polling before and after fallback decision */
858                         smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
859                         smc->sk.sk_write_space(&smc->sk);
860                 }
861         }
862         release_sock(&smc->sk);
863 }
864
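/* connect the internal clc (TCP) socket and run the SMC handshake, inline
 * for blocking sockets and via the connect worker for non-blocking sockets
 */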
865 static int smc_connect(struct socket *sock, struct sockaddr *addr,
866                        int alen, int flags)
867 {
868         struct sock *sk = sock->sk;
869         struct smc_sock *smc;
870         int rc = -EINVAL;
871
872         smc = smc_sk(sk);
873
874         /* separate smc parameter checking to be safe */
875         if (alen < sizeof(addr->sa_family))
876                 goto out_err;
877         if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
878                 goto out_err;
879
880         lock_sock(sk);
881         switch (sk->sk_state) {
882         default:
883                 goto out;
884         case SMC_ACTIVE:
885                 rc = -EISCONN;
886                 goto out;
887         case SMC_INIT:
888                 rc = 0;
889                 break;
890         }
891
892         smc_copy_sock_settings_to_clc(smc);
893         tcp_sk(smc->clcsock->sk)->syn_smc = 1;
894         if (smc->connect_nonblock) {
895                 rc = -EALREADY;
896                 goto out;
897         }
898         rc = kernel_connect(smc->clcsock, addr, alen, flags);
899         if (rc && rc != -EINPROGRESS)
900                 goto out;
901
902         sock_hold(&smc->sk); /* sock put in passive closing */
903         if (smc->use_fallback)
904                 goto out;
905         if (flags & O_NONBLOCK) {
906                 if (schedule_work(&smc->connect_work))
907                         smc->connect_nonblock = 1;
908                 rc = -EINPROGRESS;
909         } else {
910                 rc = __smc_connect(smc);
911                 if (rc < 0)
912                         goto out;
913                 else
914                         rc = 0; /* success cases including fallback */
915         }
916
917 out:
918         release_sock(sk);
919 out_err:
920         return rc;
921 }
922
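/* accept a connection on the internal clc (TCP) listen socket and allocate
 * a new smc sock for it
 */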
923 static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
924 {
925         struct socket *new_clcsock = NULL;
926         struct sock *lsk = &lsmc->sk;
927         struct sock *new_sk;
928         int rc = -EINVAL;
929
930         release_sock(lsk);
931         new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
932         if (!new_sk) {
933                 rc = -ENOMEM;
934                 lsk->sk_err = ENOMEM;
935                 *new_smc = NULL;
936                 lock_sock(lsk);
937                 goto out;
938         }
939         *new_smc = smc_sk(new_sk);
940
941         mutex_lock(&lsmc->clcsock_release_lock);
942         if (lsmc->clcsock)
943                 rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
944         mutex_unlock(&lsmc->clcsock_release_lock);
945         lock_sock(lsk);
946         if (rc < 0)
947                 lsk->sk_err = -rc;
948         if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
949                 new_sk->sk_prot->unhash(new_sk);
950                 if (new_clcsock)
951                         sock_release(new_clcsock);
952                 new_sk->sk_state = SMC_CLOSED;
953                 sock_set_flag(new_sk, SOCK_DEAD);
954                 sock_put(new_sk); /* final */
955                 *new_smc = NULL;
956                 goto out;
957         }
958
959         (*new_smc)->clcsock = new_clcsock;
960 out:
961         return rc;
962 }
963
964 /* add a just-created sock to the accept queue of the listen sock as a
 965  * candidate for a subsequent socket accept call from user space
 966  */
967 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
968 {
969         struct smc_sock *par = smc_sk(parent);
970
971         sock_hold(sk); /* sock_put in smc_accept_unlink() */
972         spin_lock(&par->accept_q_lock);
973         list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
974         spin_unlock(&par->accept_q_lock);
975         sk_acceptq_added(parent);
976 }
977
978 /* remove a socket from the accept queue of its parental listening socket */
979 static void smc_accept_unlink(struct sock *sk)
980 {
981         struct smc_sock *par = smc_sk(sk)->listen_smc;
982
983         spin_lock(&par->accept_q_lock);
984         list_del_init(&smc_sk(sk)->accept_q);
985         spin_unlock(&par->accept_q_lock);
986         sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
987         sock_put(sk); /* sock_hold in smc_accept_enqueue */
988 }
989
990 /* remove a sock from the accept queue to bind it to a new socket created
991  * for a socket accept call from user space
992  */
993 struct sock *smc_accept_dequeue(struct sock *parent,
994                                 struct socket *new_sock)
995 {
996         struct smc_sock *isk, *n;
997         struct sock *new_sk;
998
999         list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
1000                 new_sk = (struct sock *)isk;
1001
1002                 smc_accept_unlink(new_sk);
1003                 if (new_sk->sk_state == SMC_CLOSED) {
1004                         new_sk->sk_prot->unhash(new_sk);
1005                         if (isk->clcsock) {
1006                                 sock_release(isk->clcsock);
1007                                 isk->clcsock = NULL;
1008                         }
1009                         sock_put(new_sk); /* final */
1010                         continue;
1011                 }
1012                 if (new_sock) {
1013                         sock_graft(new_sk, new_sock);
1014                         if (isk->use_fallback) {
1015                                 smc_sk(new_sk)->clcsock->file = new_sock->file;
1016                                 isk->clcsock->file->private_data = isk->clcsock;
1017                         }
1018                 }
1019                 return new_sk;
1020         }
1021         return NULL;
1022 }
1023
1024 /* clean up for a created but never accepted sock */
1025 void smc_close_non_accepted(struct sock *sk)
1026 {
1027         struct smc_sock *smc = smc_sk(sk);
1028
1029         sock_hold(sk); /* sock_put below */
1030         lock_sock(sk);
1031         if (!sk->sk_lingertime)
1032                 /* wait for peer closing */
1033                 sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
1034         __smc_release(smc);
1035         release_sock(sk);
1036         sock_put(sk); /* sock_hold above */
1037         sock_put(sk); /* final sock_put */
1038 }
1039
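/* server part of the CONFIRM LINK handshake for the first link of a new
 * link group
 */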
1040 static int smcr_serv_conf_first_link(struct smc_sock *smc)
1041 {
1042         struct smc_link *link = smc->conn.lnk;
1043         struct smc_llc_qentry *qentry;
1044         int rc;
1045
1046         if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
1047                 return SMC_CLC_DECL_ERR_REGRMB;
1048
1049         /* send CONFIRM LINK request to client over the RoCE fabric */
1050         rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
1051         if (rc < 0)
1052                 return SMC_CLC_DECL_TIMEOUT_CL;
1053
1054         /* receive CONFIRM LINK response from client over the RoCE fabric */
1055         qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME,
1056                               SMC_LLC_CONFIRM_LINK);
1057         if (!qentry) {
1058                 struct smc_clc_msg_decline dclc;
1059
1060                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
1061                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1062                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
1063         }
1064         smc_llc_save_peer_uid(qentry);
1065         rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP);
1066         smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
1067         if (rc)
1068                 return SMC_CLC_DECL_RMBE_EC;
1069
1070         /* confirm_rkey is implicit on 1st contact */
1071         smc->conn.rmb_desc->is_conf_rkey = true;
1072
1073         smc_llc_link_active(link);
1074         smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
1075
1076         /* initial contact - try to establish second link */
1077         smc_llc_srv_add_link(link);
1078         return 0;
1079 }
1080
1081 /* listen worker: finish */
1082 static void smc_listen_out(struct smc_sock *new_smc)
1083 {
1084         struct smc_sock *lsmc = new_smc->listen_smc;
1085         struct sock *newsmcsk = &new_smc->sk;
1086
1087         if (lsmc->sk.sk_state == SMC_LISTEN) {
1088                 lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
1089                 smc_accept_enqueue(&lsmc->sk, newsmcsk);
1090                 release_sock(&lsmc->sk);
1091         } else { /* no longer listening */
1092                 smc_close_non_accepted(newsmcsk);
1093         }
1094
1095         /* Wake up accept */
1096         lsmc->sk.sk_data_ready(&lsmc->sk);
1097         sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
1098 }
1099
1100 /* listen worker: finish in state connected */
1101 static void smc_listen_out_connected(struct smc_sock *new_smc)
1102 {
1103         struct sock *newsmcsk = &new_smc->sk;
1104
1105         sk_refcnt_debug_inc(newsmcsk);
1106         if (newsmcsk->sk_state == SMC_INIT)
1107                 newsmcsk->sk_state = SMC_ACTIVE;
1108
1109         smc_listen_out(new_smc);
1110 }
1111
1112 /* listen worker: finish in error state */
1113 static void smc_listen_out_err(struct smc_sock *new_smc)
1114 {
1115         struct sock *newsmcsk = &new_smc->sk;
1116
1117         if (newsmcsk->sk_state == SMC_INIT)
1118                 sock_put(&new_smc->sk); /* passive closing */
1119         newsmcsk->sk_state = SMC_CLOSED;
1120
1121         smc_listen_out(new_smc);
1122 }
1123
1124 /* listen worker: decline and fall back if possible */
1125 static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
1126                                int local_contact)
1127 {
1128         /* RDMA setup failed, switch back to TCP */
1129         if (local_contact == SMC_FIRST_CONTACT)
1130                 smc_lgr_cleanup_early(&new_smc->conn);
1131         else
1132                 smc_conn_free(&new_smc->conn);
1133         if (reason_code < 0) { /* error, no fallback possible */
1134                 smc_listen_out_err(new_smc);
1135                 return;
1136         }
1137         smc_switch_to_fallback(new_smc);
1138         new_smc->fallback_rsn = reason_code;
1139         if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
1140                 if (smc_clc_send_decline(new_smc, reason_code) < 0) {
1141                         smc_listen_out_err(new_smc);
1142                         return;
1143                 }
1144         }
1145         smc_listen_out_connected(new_smc);
1146 }
1147
1148 /* listen worker: check prefixes */
1149 static int smc_listen_prfx_check(struct smc_sock *new_smc,
1150                                  struct smc_clc_msg_proposal *pclc)
1151 {
1152         struct smc_clc_msg_proposal_prefix *pclc_prfx;
1153         struct socket *newclcsock = new_smc->clcsock;
1154
1155         pclc_prfx = smc_clc_proposal_get_prefix(pclc);
1156         if (smc_clc_prfx_match(newclcsock, pclc_prfx))
1157                 return SMC_CLC_DECL_DIFFPREFIX;
1158
1159         return 0;
1160 }
1161
1162 /* listen worker: initialize connection and buffers */
1163 static int smc_listen_rdma_init(struct smc_sock *new_smc,
1164                                 struct smc_init_info *ini)
1165 {
1166         int rc;
1167
1168         /* allocate connection / link group */
1169         rc = smc_conn_create(new_smc, ini);
1170         if (rc)
1171                 return rc;
1172
1173         /* create send buffer and rmb */
1174         if (smc_buf_create(new_smc, false))
1175                 return SMC_CLC_DECL_MEM;
1176
1177         return 0;
1178 }
1179
1180 /* listen worker: initialize connection and buffers for SMC-D */
1181 static int smc_listen_ism_init(struct smc_sock *new_smc,
1182                                struct smc_clc_msg_proposal *pclc,
1183                                struct smc_init_info *ini)
1184 {
1185         struct smc_clc_msg_smcd *pclc_smcd;
1186         int rc;
1187
1188         pclc_smcd = smc_get_clc_msg_smcd(pclc);
1189         ini->ism_gid = pclc_smcd->gid;
1190         rc = smc_conn_create(new_smc, ini);
1191         if (rc)
1192                 return rc;
1193
1194         /* Check if peer can be reached via ISM device */
1195         if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
1196                             new_smc->conn.lgr->vlan_id,
1197                             new_smc->conn.lgr->smcd)) {
1198                 if (ini->cln_first_contact == SMC_FIRST_CONTACT)
1199                         smc_lgr_cleanup_early(&new_smc->conn);
1200                 else
1201                         smc_conn_free(&new_smc->conn);
1202                 return SMC_CLC_DECL_SMCDNOTALK;
1203         }
1204
1205         /* Create send and receive buffers */
1206         rc = smc_buf_create(new_smc, true);
1207         if (rc) {
1208                 if (ini->cln_first_contact == SMC_FIRST_CONTACT)
1209                         smc_lgr_cleanup_early(&new_smc->conn);
1210                 else
1211                         smc_conn_free(&new_smc->conn);
1212                 return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB :
1213                                          SMC_CLC_DECL_MEM;
1214         }
1215
1216         return 0;
1217 }
1218
1219 /* listen worker: register buffers */
1220 static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
1221 {
1222         struct smc_connection *conn = &new_smc->conn;
1223
1224         if (local_contact != SMC_FIRST_CONTACT) {
1225                 if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc))
1226                         return SMC_CLC_DECL_ERR_REGRMB;
1227         }
1228         smc_rmb_sync_sg_for_device(&new_smc->conn);
1229
1230         return 0;
1231 }
1232
1233 /* listen worker: finish RDMA setup */
1234 static int smc_listen_rdma_finish(struct smc_sock *new_smc,
1235                                   struct smc_clc_msg_accept_confirm *cclc,
1236                                   int local_contact)
1237 {
1238         struct smc_link *link = new_smc->conn.lnk;
1239         int reason_code = 0;
1240
1241         if (local_contact == SMC_FIRST_CONTACT)
1242                 smc_link_save_peer_info(link, cclc);
1243
1244         if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) {
1245                 reason_code = SMC_CLC_DECL_ERR_RTOK;
1246                 goto decline;
1247         }
1248
1249         if (local_contact == SMC_FIRST_CONTACT) {
1250                 if (smc_ib_ready_link(link)) {
1251                         reason_code = SMC_CLC_DECL_ERR_RDYLNK;
1252                         goto decline;
1253                 }
1254                 /* QP confirmation over RoCE fabric */
1255                 smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
1256                 reason_code = smcr_serv_conf_first_link(new_smc);
1257                 smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
1258                 if (reason_code)
1259                         goto decline;
1260         }
1261         return 0;
1262
1263 decline:
1264         smc_listen_decline(new_smc, reason_code, local_contact);
1265         return reason_code;
1266 }
1267
1268 /* listen worker: set up a new incoming SMC connection (SMC-D or SMC-R) */
1269 static void smc_listen_work(struct work_struct *work)
1270 {
1271         struct smc_sock *new_smc = container_of(work, struct smc_sock,
1272                                                 smc_listen_work);
1273         struct socket *newclcsock = new_smc->clcsock;
1274         struct smc_clc_msg_accept_confirm cclc;
1275         struct smc_clc_msg_proposal *pclc;
1276         struct smc_init_info ini = {0};
1277         bool ism_supported = false;
1278         u8 buf[SMC_CLC_MAX_LEN];
1279         int rc = 0;
1280
1281         if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
1282                 return smc_listen_out_err(new_smc);
1283
1284         if (new_smc->use_fallback) {
1285                 smc_listen_out_connected(new_smc);
1286                 return;
1287         }
1288
1289         /* check if peer is smc capable */
1290         if (!tcp_sk(newclcsock->sk)->syn_smc) {
1291                 smc_switch_to_fallback(new_smc);
1292                 new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
1293                 smc_listen_out_connected(new_smc);
1294                 return;
1295         }
1296
1297         /* do inband token exchange -
1298          * wait for and receive SMC Proposal CLC message
1299          */
1300         pclc = (struct smc_clc_msg_proposal *)&buf;
1301         rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
1302                               SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
1303         if (rc)
1304                 goto out_decl;
1305
1306         /* IPSec connections opt out of SMC-R optimizations */
1307         if (using_ipsec(new_smc)) {
1308                 rc = SMC_CLC_DECL_IPSEC;
1309                 goto out_decl;
1310         }
1311
1312         /* check for matching IP prefix and subnet length */
1313         rc = smc_listen_prfx_check(new_smc, pclc);
1314         if (rc)
1315                 goto out_decl;
1316
1317         /* get vlan id from IP device */
1318         if (smc_vlan_by_tcpsk(new_smc->clcsock, &ini)) {
1319                 rc = SMC_CLC_DECL_GETVLANERR;
1320                 goto out_decl;
1321         }
1322
1323         mutex_lock(&smc_server_lgr_pending);
1324         smc_close_init(new_smc);
1325         smc_rx_init(new_smc);
1326         smc_tx_init(new_smc);
1327
1328         /* check if ISM is available */
1329         if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) {
1330                 ini.is_smcd = true; /* prepare ISM check */
1331                 rc = smc_find_ism_device(new_smc, &ini);
1332                 if (!rc)
1333                         rc = smc_listen_ism_init(new_smc, pclc, &ini);
1334                 if (!rc)
1335                         ism_supported = true;
1336                 else if (pclc->hdr.path == SMC_TYPE_D)
1337                         goto out_unlock; /* skip RDMA and decline */
1338         }
1339
1340         /* check if RDMA is available */
1341         if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */
1342                 /* prepare RDMA check */
1343                 ini.is_smcd = false;
1344                 ini.ism_dev = NULL;
1345                 ini.ib_lcl = &pclc->lcl;
1346                 rc = smc_find_rdma_device(new_smc, &ini);
1347                 if (rc) {
1348                         /* no RDMA device found */
1349                         if (pclc->hdr.path == SMC_TYPE_B)
1350                                 /* neither ISM nor RDMA device found */
1351                                 rc = SMC_CLC_DECL_NOSMCDEV;
1352                         goto out_unlock;
1353                 }
1354                 rc = smc_listen_rdma_init(new_smc, &ini);
1355                 if (rc)
1356                         goto out_unlock;
1357                 rc = smc_listen_rdma_reg(new_smc, ini.cln_first_contact);
1358                 if (rc)
1359                         goto out_unlock;
1360         }
1361
1362         /* send SMC Accept CLC message */
1363         rc = smc_clc_send_accept(new_smc, ini.cln_first_contact);
1364         if (rc)
1365                 goto out_unlock;
1366
1367         /* SMC-D does not need this lock any more */
1368         if (ism_supported)
1369                 mutex_unlock(&smc_server_lgr_pending);
1370
1371         /* receive SMC Confirm CLC message */
1372         rc = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
1373                               SMC_CLC_CONFIRM, CLC_WAIT_TIME);
1374         if (rc) {
1375                 if (!ism_supported)
1376                         goto out_unlock;
1377                 goto out_decl;
1378         }
1379
1380         /* finish worker */
1381         if (!ism_supported) {
1382                 rc = smc_listen_rdma_finish(new_smc, &cclc,
1383                                             ini.cln_first_contact);
1384                 mutex_unlock(&smc_server_lgr_pending);
1385                 if (rc)
1386                         return;
1387         }
1388         smc_conn_save_peer_info(new_smc, &cclc);
1389         smc_listen_out_connected(new_smc);
1390         return;
1391
1392 out_unlock:
1393         mutex_unlock(&smc_server_lgr_pending);
1394 out_decl:
1395         smc_listen_decline(new_smc, rc, ini.cln_first_contact);
1396 }
1397
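/* listen worker: accept incoming clc (TCP) connections while the smc socket
 * is listening and schedule the SMC listen work for each of them
 */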
1398 static void smc_tcp_listen_work(struct work_struct *work)
1399 {
1400         struct smc_sock *lsmc = container_of(work, struct smc_sock,
1401                                              tcp_listen_work);
1402         struct sock *lsk = &lsmc->sk;
1403         struct smc_sock *new_smc;
1404         int rc = 0;
1405
1406         lock_sock(lsk);
1407         while (lsk->sk_state == SMC_LISTEN) {
1408                 rc = smc_clcsock_accept(lsmc, &new_smc);
1409                 if (rc)
1410                         goto out;
1411                 if (!new_smc)
1412                         continue;
1413
1414                 new_smc->listen_smc = lsmc;
1415                 new_smc->use_fallback = lsmc->use_fallback;
1416                 new_smc->fallback_rsn = lsmc->fallback_rsn;
1417                 sock_hold(lsk); /* sock_put in smc_listen_work */
1418                 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
1419                 smc_copy_sock_settings_to_smc(new_smc);
1420                 new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
1421                 new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
1422                 sock_hold(&new_smc->sk); /* sock_put in passive closing */
1423                 if (!schedule_work(&new_smc->smc_listen_work))
1424                         sock_put(&new_smc->sk);
1425         }
1426
1427 out:
1428         release_sock(lsk);
1429         sock_put(&lsmc->sk); /* sock_hold in smc_listen */
1430 }
1431
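/* listen: switch the internal clc socket to listen state and start the
 * TCP listen worker
 */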
1432 static int smc_listen(struct socket *sock, int backlog)
1433 {
1434         struct sock *sk = sock->sk;
1435         struct smc_sock *smc;
1436         int rc;
1437
1438         smc = smc_sk(sk);
1439         lock_sock(sk);
1440
1441         rc = -EINVAL;
1442         if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) ||
1443             smc->connect_nonblock)
1444                 goto out;
1445
1446         rc = 0;
1447         if (sk->sk_state == SMC_LISTEN) {
1448                 sk->sk_max_ack_backlog = backlog;
1449                 goto out;
1450         }
1451         /* some socket options are handled in core, so we cannot apply
1452          * them to the clc socket -- copy smc socket options to clc socket
1453          */
1454         smc_copy_sock_settings_to_clc(smc);
1455         if (!smc->use_fallback)
1456                 tcp_sk(smc->clcsock->sk)->syn_smc = 1;
1457
1458         rc = kernel_listen(smc->clcsock, backlog);
1459         if (rc)
1460                 goto out;
1461         sk->sk_max_ack_backlog = backlog;
1462         sk->sk_ack_backlog = 0;
1463         sk->sk_state = SMC_LISTEN;
1464         sock_hold(sk); /* sock_put in smc_tcp_listen_work() */
1465         if (!schedule_work(&smc->tcp_listen_work))
1466                 sock_put(sk);
1467
1468 out:
1469         release_sock(sk);
1470         return rc;
1471 }
1472
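/* accept: wait for a connection queued by the listen worker and hand it
 * out to user space
 */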
1473 static int smc_accept(struct socket *sock, struct socket *new_sock,
1474                       int flags, bool kern)
1475 {
1476         struct sock *sk = sock->sk, *nsk;
1477         DECLARE_WAITQUEUE(wait, current);
1478         struct smc_sock *lsmc;
1479         long timeo;
1480         int rc = 0;
1481
1482         lsmc = smc_sk(sk);
1483         sock_hold(sk); /* sock_put below */
1484         lock_sock(sk);
1485
1486         if (lsmc->sk.sk_state != SMC_LISTEN) {
1487                 rc = -EINVAL;
1488                 release_sock(sk);
1489                 goto out;
1490         }
1491
1492         /* Wait for an incoming connection */
1493         timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1494         add_wait_queue_exclusive(sk_sleep(sk), &wait);
1495         while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
1496                 set_current_state(TASK_INTERRUPTIBLE);
1497                 if (!timeo) {
1498                         rc = -EAGAIN;
1499                         break;
1500                 }
1501                 release_sock(sk);
1502                 timeo = schedule_timeout(timeo);
1503                 /* woken up by sk_data_ready in smc_listen_work() */
1504                 sched_annotate_sleep();
1505                 lock_sock(sk);
1506                 if (signal_pending(current)) {
1507                         rc = sock_intr_errno(timeo);
1508                         break;
1509                 }
1510         }
1511         set_current_state(TASK_RUNNING);
1512         remove_wait_queue(sk_sleep(sk), &wait);
1513
1514         if (!rc)
1515                 rc = sock_error(nsk);
1516         release_sock(sk);
1517         if (rc)
1518                 goto out;
1519
1520         if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
1521                 /* wait till data arrives on the socket */
1522                 timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
1523                                                                 MSEC_PER_SEC);
1524                 if (smc_sk(nsk)->use_fallback) {
1525                         struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
1526
1527                         lock_sock(clcsk);
1528                         if (skb_queue_empty(&clcsk->sk_receive_queue))
1529                                 sk_wait_data(clcsk, &timeo, NULL);
1530                         release_sock(clcsk);
1531                 } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
1532                         lock_sock(nsk);
1533                         smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
1534                         release_sock(nsk);
1535                 }
1536         }
1537
1538 out:
1539         sock_put(sk); /* sock_hold above */
1540         return rc;
1541 }
1542
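/* getname(2)/getpeername(2) handler: the reported addresses are those of the
 * internal CLC socket
 */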
1543 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1544                        int peer)
1545 {
1546         struct smc_sock *smc;
1547
1548         if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1549             (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1550                 return -ENOTCONN;
1551
1552         smc = smc_sk(sock->sk);
1553
1554         return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
1555 }
1556
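/* sendmsg(2) handler: use the SMC transmit path, or the internal CLC socket
 * when the connection has fallen back to TCP; MSG_FASTOPEN forces such a
 * fallback while the socket is still in the SMC_INIT state
 */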
1557 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1558 {
1559         struct sock *sk = sock->sk;
1560         struct smc_sock *smc;
1561         int rc = -EPIPE;
1562
1563         smc = smc_sk(sk);
1564         lock_sock(sk);
1565         if ((sk->sk_state != SMC_ACTIVE) &&
1566             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1567             (sk->sk_state != SMC_INIT))
1568                 goto out;
1569
1570         if (msg->msg_flags & MSG_FASTOPEN) {
1571                 if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
1572                         smc_switch_to_fallback(smc);
1573                         smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1574                 } else {
1575                         rc = -EINVAL;
1576                         goto out;
1577                 }
1578         }
1579
1580         if (smc->use_fallback)
1581                 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1582         else
1583                 rc = smc_tx_sendmsg(smc, msg, len);
1584 out:
1585         release_sock(sk);
1586         return rc;
1587 }
1588
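/* recvmsg(2) handler: receive from the SMC receive buffer, or from the
 * internal CLC socket when the connection has fallen back to TCP
 */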
1589 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1590                        int flags)
1591 {
1592         struct sock *sk = sock->sk;
1593         struct smc_sock *smc;
1594         int rc = -ENOTCONN;
1595
1596         smc = smc_sk(sk);
1597         lock_sock(sk);
1598         if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1599                 /* socket was connected before, no more data to read */
1600                 rc = 0;
1601                 goto out;
1602         }
1603         if ((sk->sk_state == SMC_INIT) ||
1604             (sk->sk_state == SMC_LISTEN) ||
1605             (sk->sk_state == SMC_CLOSED))
1606                 goto out;
1607
1608         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1609                 rc = 0;
1610                 goto out;
1611         }
1612
1613         if (smc->use_fallback) {
1614                 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1615         } else {
1616                 msg->msg_namelen = 0;
1617                 rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
1618         }
1619
1620 out:
1621         release_sock(sk);
1622         return rc;
1623 }
1624
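/* return EPOLLIN for a listening socket with connections queued for accept */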
1625 static __poll_t smc_accept_poll(struct sock *parent)
1626 {
1627         struct smc_sock *isk = smc_sk(parent);
1628         __poll_t mask = 0;
1629
1630         spin_lock(&isk->accept_q_lock);
1631         if (!list_empty(&isk->accept_q))
1632                 mask = EPOLLIN | EPOLLRDNORM;
1633         spin_unlock(&isk->accept_q_lock);
1634
1635         return mask;
1636 }
1637
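/* poll(2) handler: delegate to the CLC socket in the fallback case, otherwise
 * derive the mask from the state of the SMC connection
 */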
1638 static __poll_t smc_poll(struct file *file, struct socket *sock,
1639                              poll_table *wait)
1640 {
1641         struct sock *sk = sock->sk;
1642         struct smc_sock *smc;
1643         __poll_t mask = 0;
1644
1645         if (!sk)
1646                 return EPOLLNVAL;
1647
1648         smc = smc_sk(sock->sk);
1649         if (smc->use_fallback) {
1650                 /* delegate to CLC child sock */
1651                 mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1652                 sk->sk_err = smc->clcsock->sk->sk_err;
1653         } else {
1654                 if (sk->sk_state != SMC_CLOSED)
1655                         sock_poll_wait(file, sock, wait);
1656                 if (sk->sk_err)
1657                         mask |= EPOLLERR;
1658                 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1659                     (sk->sk_state == SMC_CLOSED))
1660                         mask |= EPOLLHUP;
1661                 if (sk->sk_state == SMC_LISTEN) {
1662                         /* woken up by sk_data_ready in smc_listen_work() */
1663                         mask |= smc_accept_poll(sk);
1664                 } else if (smc->use_fallback) { /* as a result of connect_work() */
1665                         mask |= smc->clcsock->ops->poll(file, smc->clcsock,
1666                                                            wait);
1667                         sk->sk_err = smc->clcsock->sk->sk_err;
1668                 } else {
1669                         if ((sk->sk_state != SMC_INIT &&
1670                              atomic_read(&smc->conn.sndbuf_space)) ||
1671                             sk->sk_shutdown & SEND_SHUTDOWN) {
1672                                 mask |= EPOLLOUT | EPOLLWRNORM;
1673                         } else {
1674                                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1675                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1676                         }
1677                         if (atomic_read(&smc->conn.bytes_to_rcv))
1678                                 mask |= EPOLLIN | EPOLLRDNORM;
1679                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1680                                 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
1681                         if (sk->sk_state == SMC_APPCLOSEWAIT1)
1682                                 mask |= EPOLLIN;
1683                         if (smc->conn.urg_state == SMC_URG_VALID)
1684                                 mask |= EPOLLPRI;
1685                 }
1686         }
1687
1688         return mask;
1689 }
1690
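/* shutdown(2) handler: delegate to the CLC socket in the fallback case,
 * otherwise run the SMC close protocol for the requested direction(s) and
 * shut down the CLC socket as well
 */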
1691 static int smc_shutdown(struct socket *sock, int how)
1692 {
1693         struct sock *sk = sock->sk;
1694         struct smc_sock *smc;
1695         int rc = -EINVAL;
1696         int rc1 = 0;
1697
1698         smc = smc_sk(sk);
1699
1700         if ((how < SHUT_RD) || (how > SHUT_RDWR))
1701                 return rc;
1702
1703         lock_sock(sk);
1704
1705         rc = -ENOTCONN;
1706         if ((sk->sk_state != SMC_ACTIVE) &&
1707             (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1708             (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1709             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1710             (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1711             (sk->sk_state != SMC_APPFINCLOSEWAIT))
1712                 goto out;
1713         if (smc->use_fallback) {
1714                 rc = kernel_sock_shutdown(smc->clcsock, how);
1715                 sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1716                 if (sk->sk_shutdown == SHUTDOWN_MASK)
1717                         sk->sk_state = SMC_CLOSED;
1718                 goto out;
1719         }
1720         switch (how) {
1721         case SHUT_RDWR:         /* shutdown in both directions */
1722                 rc = smc_close_active(smc);
1723                 break;
1724         case SHUT_WR:
1725                 rc = smc_close_shutdown_write(smc);
1726                 break;
1727         case SHUT_RD:
1728                 rc = 0;
1729                 /* nothing more to do because peer is not involved */
1730                 break;
1731         }
1732         if (smc->clcsock)
1733                 rc1 = kernel_sock_shutdown(smc->clcsock, how);
1734         /* map sock_shutdown_cmd constants to sk_shutdown value range */
1735         sk->sk_shutdown |= how + 1;
1736
1737 out:
1738         release_sock(sk);
1739         return rc ? rc : rc1;
1740 }
1741
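/* setsockopt(2) handler: options are passed through to the internal CLC
 * socket; some TCP options additionally affect the SMC socket itself
 */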
1742 static int smc_setsockopt(struct socket *sock, int level, int optname,
1743                           sockptr_t optval, unsigned int optlen)
1744 {
1745         struct sock *sk = sock->sk;
1746         struct smc_sock *smc;
1747         int val, rc;
1748
1749         smc = smc_sk(sk);
1750
1751         /* generic setsockopts reaching us here always apply to the
1752          * CLC socket
1753          */
1754         if (unlikely(!smc->clcsock->ops->setsockopt))
1755                 rc = -EOPNOTSUPP;
1756         else
1757                 rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1758                                                    optval, optlen);
1759         if (smc->clcsock->sk->sk_err) {
1760                 sk->sk_err = smc->clcsock->sk->sk_err;
1761                 sk->sk_error_report(sk);
1762         }
1763
1764         if (optlen < sizeof(int))
1765                 return -EINVAL;
1766         if (copy_from_sockptr(&val, optval, sizeof(int)))
1767                 return -EFAULT;
1768
1769         lock_sock(sk);
1770         if (rc || smc->use_fallback)
1771                 goto out;
1772         switch (optname) {
1773         case TCP_ULP:
1774         case TCP_FASTOPEN:
1775         case TCP_FASTOPEN_CONNECT:
1776         case TCP_FASTOPEN_KEY:
1777         case TCP_FASTOPEN_NO_COOKIE:
1778                 /* option not supported by SMC */
1779                 if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
1780                         smc_switch_to_fallback(smc);
1781                         smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1782                 } else {
1783                         rc = -EINVAL;
1784                 }
1785                 break;
1786         case TCP_NODELAY:
1787                 if (sk->sk_state != SMC_INIT &&
1788                     sk->sk_state != SMC_LISTEN &&
1789                     sk->sk_state != SMC_CLOSED) {
1790                         if (val)
1791                                 mod_delayed_work(system_wq, &smc->conn.tx_work,
1792                                                  0);
1793                 }
1794                 break;
1795         case TCP_CORK:
1796                 if (sk->sk_state != SMC_INIT &&
1797                     sk->sk_state != SMC_LISTEN &&
1798                     sk->sk_state != SMC_CLOSED) {
1799                         if (!val)
1800                                 mod_delayed_work(system_wq, &smc->conn.tx_work,
1801                                                  0);
1802                 }
1803                 break;
1804         case TCP_DEFER_ACCEPT:
1805                 smc->sockopt_defer_accept = val;
1806                 break;
1807         default:
1808                 break;
1809         }
1810 out:
1811         release_sock(sk);
1812
1813         return rc;
1814 }
1815
1816 static int smc_getsockopt(struct socket *sock, int level, int optname,
1817                           char __user *optval, int __user *optlen)
1818 {
1819         struct smc_sock *smc;
1820
1821         smc = smc_sk(sock->sk);
1822         /* socket options apply to the CLC socket */
1823         if (unlikely(!smc->clcsock->ops->getsockopt))
1824                 return -EOPNOTSUPP;
1825         return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1826                                              optval, optlen);
1827 }
1828
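/* ioctl(2) handler: support the TCP-compatible SIOCINQ, SIOCOUTQ, SIOCOUTQNSD
 * and SIOCATMARK requests; in the fallback case forward the request to the
 * internal CLC socket
 */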
1829 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1830                      unsigned long arg)
1831 {
1832         union smc_host_cursor cons, urg;
1833         struct smc_connection *conn;
1834         struct smc_sock *smc;
1835         int answ;
1836
1837         smc = smc_sk(sock->sk);
1838         conn = &smc->conn;
1839         lock_sock(&smc->sk);
1840         if (smc->use_fallback) {
1841                 if (!smc->clcsock) {
1842                         release_sock(&smc->sk);
1843                         return -EBADF;
1844                 }
1845                 answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1846                 release_sock(&smc->sk);
1847                 return answ;
1848         }
1849         switch (cmd) {
1850         case SIOCINQ: /* same as FIONREAD */
1851                 if (smc->sk.sk_state == SMC_LISTEN) {
1852                         release_sock(&smc->sk);
1853                         return -EINVAL;
1854                 }
1855                 if (smc->sk.sk_state == SMC_INIT ||
1856                     smc->sk.sk_state == SMC_CLOSED)
1857                         answ = 0;
1858                 else
1859                         answ = atomic_read(&smc->conn.bytes_to_rcv);
1860                 break;
1861         case SIOCOUTQ:
1862                 /* output queue size (not sent + not acked) */
1863                 if (smc->sk.sk_state == SMC_LISTEN) {
1864                         release_sock(&smc->sk);
1865                         return -EINVAL;
1866                 }
1867                 if (smc->sk.sk_state == SMC_INIT ||
1868                     smc->sk.sk_state == SMC_CLOSED)
1869                         answ = 0;
1870                 else
1871                         answ = smc->conn.sndbuf_desc->len -
1872                                         atomic_read(&smc->conn.sndbuf_space);
1873                 break;
1874         case SIOCOUTQNSD:
1875                 /* output queue size (not sent only) */
1876                 if (smc->sk.sk_state == SMC_LISTEN) {
1877                         release_sock(&smc->sk);
1878                         return -EINVAL;
1879                 }
1880                 if (smc->sk.sk_state == SMC_INIT ||
1881                     smc->sk.sk_state == SMC_CLOSED)
1882                         answ = 0;
1883                 else
1884                         answ = smc_tx_prepared_sends(&smc->conn);
1885                 break;
1886         case SIOCATMARK:
1887                 if (smc->sk.sk_state == SMC_LISTEN) {
1888                         release_sock(&smc->sk);
1889                         return -EINVAL;
1890                 }
1891                 if (smc->sk.sk_state == SMC_INIT ||
1892                     smc->sk.sk_state == SMC_CLOSED) {
1893                         answ = 0;
1894                 } else {
1895                         smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
1896                         smc_curs_copy(&urg, &conn->urg_curs, conn);
1897                         answ = smc_curs_diff(conn->rmb_desc->len,
1898                                              &cons, &urg) == 1;
1899                 }
1900                 break;
1901         default:
1902                 release_sock(&smc->sk);
1903                 return -ENOIOCTLCMD;
1904         }
1905         release_sock(&smc->sk);
1906
1907         return put_user(answ, (int __user *)arg);
1908 }
1909
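/* sendpage callback: only valid on an active connection; without fallback the
 * request is handled via sock_no_sendpage(), i.e. the regular sendmsg path
 */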
1910 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1911                             int offset, size_t size, int flags)
1912 {
1913         struct sock *sk = sock->sk;
1914         struct smc_sock *smc;
1915         int rc = -EPIPE;
1916
1917         smc = smc_sk(sk);
1918         lock_sock(sk);
1919         if (sk->sk_state != SMC_ACTIVE) {
1920                 release_sock(sk);
1921                 goto out;
1922         }
1923         release_sock(sk);
1924         if (smc->use_fallback)
1925                 rc = kernel_sendpage(smc->clcsock, page, offset,
1926                                      size, flags);
1927         else
1928                 rc = sock_no_sendpage(sock, page, offset, size, flags);
1929
1930 out:
1931         return rc;
1932 }
1933
1934 /* Map the affected portions of the rmbe into an spd, note the number of bytes
1935  * to splice in conn->splice_pending, and press 'go'. Consumer cursor updates
1936  * are delayed until the respective page has been fully processed.
1937  * Note that subsequent recv() calls have to wait until all splice() processing
1938  * is completed.
1939  */
1940 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1941                                struct pipe_inode_info *pipe, size_t len,
1942                                unsigned int flags)
1943 {
1944         struct sock *sk = sock->sk;
1945         struct smc_sock *smc;
1946         int rc = -ENOTCONN;
1947
1948         smc = smc_sk(sk);
1949         lock_sock(sk);
1950         if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1951                 /* socket was connected before, no more data to read */
1952                 rc = 0;
1953                 goto out;
1954         }
1955         if (sk->sk_state == SMC_INIT ||
1956             sk->sk_state == SMC_LISTEN ||
1957             sk->sk_state == SMC_CLOSED)
1958                 goto out;
1959
1960         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1961                 rc = 0;
1962                 goto out;
1963         }
1964
1965         if (smc->use_fallback) {
1966                 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1967                                                     pipe, len, flags);
1968         } else {
1969                 if (*ppos) {
1970                         rc = -ESPIPE;
1971                         goto out;
1972                 }
1973                 if (flags & SPLICE_F_NONBLOCK)
1974                         flags = MSG_DONTWAIT;
1975                 else
1976                         flags = 0;
1977                 rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
1978         }
1979 out:
1980         release_sock(sk);
1981
1982         return rc;
1983 }
1984
1985 /* must look like tcp */
1986 static const struct proto_ops smc_sock_ops = {
1987         .family         = PF_SMC,
1988         .owner          = THIS_MODULE,
1989         .release        = smc_release,
1990         .bind           = smc_bind,
1991         .connect        = smc_connect,
1992         .socketpair     = sock_no_socketpair,
1993         .accept         = smc_accept,
1994         .getname        = smc_getname,
1995         .poll           = smc_poll,
1996         .ioctl          = smc_ioctl,
1997         .listen         = smc_listen,
1998         .shutdown       = smc_shutdown,
1999         .setsockopt     = smc_setsockopt,
2000         .getsockopt     = smc_getsockopt,
2001         .sendmsg        = smc_sendmsg,
2002         .recvmsg        = smc_recvmsg,
2003         .mmap           = sock_no_mmap,
2004         .sendpage       = smc_sendpage,
2005         .splice_read    = smc_splice_read,
2006 };
2007
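/* create a new SMC socket together with its internal CLC (TCP) socket, which
 * is used for the CLC handshake and as the fallback transport
 */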
2008 static int smc_create(struct net *net, struct socket *sock, int protocol,
2009                       int kern)
2010 {
2011         int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
2012         struct smc_sock *smc;
2013         struct sock *sk;
2014         int rc;
2015
2016         rc = -ESOCKTNOSUPPORT;
2017         if (sock->type != SOCK_STREAM)
2018                 goto out;
2019
2020         rc = -EPROTONOSUPPORT;
2021         if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
2022                 goto out;
2023
2024         rc = -ENOBUFS;
2025         sock->ops = &smc_sock_ops;
2026         sk = smc_sock_alloc(net, sock, protocol);
2027         if (!sk)
2028                 goto out;
2029
2030         /* create internal TCP socket for CLC handshake and fallback */
2031         smc = smc_sk(sk);
2032         smc->use_fallback = false; /* assume RDMA capability first */
2033         smc->fallback_rsn = 0;
2034         rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
2035                               &smc->clcsock);
2036         if (rc) {
2037                 sk_common_release(sk);
2038                 goto out;
2039         }
2040         smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
2041         smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
2042
2043 out:
2044         return rc;
2045 }
2046
2047 static const struct net_proto_family smc_sock_family_ops = {
2048         .family = PF_SMC,
2049         .owner  = THIS_MODULE,
2050         .create = smc_create,
2051 };
2052
2053 unsigned int smc_net_id;
2054
2055 static __net_init int smc_net_init(struct net *net)
2056 {
2057         return smc_pnet_net_init(net);
2058 }
2059
2060 static void __net_exit smc_net_exit(struct net *net)
2061 {
2062         smc_pnet_net_exit(net);
2063 }
2064
2065 static struct pernet_operations smc_net_ops = {
2066         .init = smc_net_init,
2067         .exit = smc_net_exit,
2068         .id   = &smc_net_id,
2069         .size = sizeof(struct smc_net),
2070 };
2071
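/* module initialization: register the pernet operations, pnet tables, core
 * components, the SMC protocols and the IB client, then enable the
 * tcp_have_smc static key
 */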
2072 static int __init smc_init(void)
2073 {
2074         int rc;
2075
2076         rc = register_pernet_subsys(&smc_net_ops);
2077         if (rc)
2078                 return rc;
2079
2080         rc = smc_pnet_init();
2081         if (rc)
2082                 goto out_pernet_subsys;
2083
2084         rc = smc_core_init();
2085         if (rc) {
2086                 pr_err("%s: smc_core_init fails with %d\n", __func__, rc);
2087                 goto out_pnet;
2088         }
2089
2090         rc = smc_llc_init();
2091         if (rc) {
2092                 pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
2093                 goto out_core;
2094         }
2095
2096         rc = smc_cdc_init();
2097         if (rc) {
2098                 pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
2099                 goto out_core;
2100         }
2101
2102         rc = proto_register(&smc_proto, 1);
2103         if (rc) {
2104                 pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
2105                 goto out_core;
2106         }
2107
2108         rc = proto_register(&smc_proto6, 1);
2109         if (rc) {
2110                 pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
2111                 goto out_proto;
2112         }
2113
2114         rc = sock_register(&smc_sock_family_ops);
2115         if (rc) {
2116                 pr_err("%s: sock_register fails with %d\n", __func__, rc);
2117                 goto out_proto6;
2118         }
2119         INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
2120         INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
2121
2122         rc = smc_ib_register_client();
2123         if (rc) {
2124                 pr_err("%s: ib_register fails with %d\n", __func__, rc);
2125                 goto out_sock;
2126         }
2127
2128         static_branch_enable(&tcp_have_smc);
2129         return 0;
2130
2131 out_sock:
2132         sock_unregister(PF_SMC);
2133 out_proto6:
2134         proto_unregister(&smc_proto6);
2135 out_proto:
2136         proto_unregister(&smc_proto);
2137 out_core:
2138         smc_core_exit();
2139 out_pnet:
2140         smc_pnet_exit();
2141 out_pernet_subsys:
2142         unregister_pernet_subsys(&smc_net_ops);
2143
2144         return rc;
2145 }
2146
2147 static void __exit smc_exit(void)
2148 {
2149         static_branch_disable(&tcp_have_smc);
2150         sock_unregister(PF_SMC);
2151         smc_core_exit();
2152         smc_ib_unregister_client();
2153         proto_unregister(&smc_proto6);
2154         proto_unregister(&smc_proto);
2155         smc_pnet_exit();
2156         unregister_pernet_subsys(&smc_net_ops);
2157         rcu_barrier();
2158 }
2159
2160 module_init(smc_init);
2161 module_exit(smc_exit);
2162
2163 MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
2164 MODULE_DESCRIPTION("smc socket address family");
2165 MODULE_LICENSE("GPL");
2166 MODULE_ALIAS_NETPROTO(PF_SMC);