xen/blkback: rework connect_ring() to avoid inconsistent xenstore 'ring-page-order...
[linux-2.6-microblaze.git] / net / smc / af_smc.c
1 /*
2  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
3  *
4  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
5  *  applies to SOCK_STREAM sockets only
6  *  offers an alternative communication option for TCP-protocol sockets
7  *  applicable with RoCE-cards only
8  *
9  *  Initial restrictions:
10  *    - support for alternate links postponed
11  *
12  *  Copyright IBM Corp. 2016, 2018
13  *
14  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
15  *              based on prototype from Frank Blaschka
16  */
17
18 #define KMSG_COMPONENT "smc"
19 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
20
21 #include <linux/module.h>
22 #include <linux/socket.h>
23 #include <linux/workqueue.h>
24 #include <linux/in.h>
25 #include <linux/sched/signal.h>
26 #include <linux/if_vlan.h>
27
28 #include <net/sock.h>
29 #include <net/tcp.h>
30 #include <net/smc.h>
31 #include <asm/ioctls.h>
32
33 #include "smc.h"
34 #include "smc_clc.h"
35 #include "smc_llc.h"
36 #include "smc_cdc.h"
37 #include "smc_core.h"
38 #include "smc_ib.h"
39 #include "smc_ism.h"
40 #include "smc_pnet.h"
41 #include "smc_tx.h"
42 #include "smc_rx.h"
43 #include "smc_close.h"
44
/* serializes link group creation */
static DEFINE_MUTEX(smc_create_lgr_pending);

/* work handlers; both are wired up with INIT_WORK() in smc_sock_alloc() */
static void smc_tcp_listen_work(struct work_struct *work);
static void smc_connect_work(struct work_struct *work);
51
52 static void smc_set_keepalive(struct sock *sk, int val)
53 {
54         struct smc_sock *smc = smc_sk(sk);
55
56         smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
57 }
58
59 static struct smc_hashinfo smc_v4_hashinfo = {
60         .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
61 };
62
63 static struct smc_hashinfo smc_v6_hashinfo = {
64         .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
65 };
66
67 int smc_hash_sk(struct sock *sk)
68 {
69         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
70         struct hlist_head *head;
71
72         head = &h->ht;
73
74         write_lock_bh(&h->lock);
75         sk_add_node(sk, head);
76         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
77         write_unlock_bh(&h->lock);
78
79         return 0;
80 }
81 EXPORT_SYMBOL_GPL(smc_hash_sk);
82
83 void smc_unhash_sk(struct sock *sk)
84 {
85         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
86
87         write_lock_bh(&h->lock);
88         if (sk_del_node_init(sk))
89                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
90         write_unlock_bh(&h->lock);
91 }
92 EXPORT_SYMBOL_GPL(smc_unhash_sk);
93
94 struct proto smc_proto = {
95         .name           = "SMC",
96         .owner          = THIS_MODULE,
97         .keepalive      = smc_set_keepalive,
98         .hash           = smc_hash_sk,
99         .unhash         = smc_unhash_sk,
100         .obj_size       = sizeof(struct smc_sock),
101         .h.smc_hash     = &smc_v4_hashinfo,
102         .slab_flags     = SLAB_TYPESAFE_BY_RCU,
103 };
104 EXPORT_SYMBOL_GPL(smc_proto);
105
106 struct proto smc_proto6 = {
107         .name           = "SMC6",
108         .owner          = THIS_MODULE,
109         .keepalive      = smc_set_keepalive,
110         .hash           = smc_hash_sk,
111         .unhash         = smc_unhash_sk,
112         .obj_size       = sizeof(struct smc_sock),
113         .h.smc_hash     = &smc_v6_hashinfo,
114         .slab_flags     = SLAB_TYPESAFE_BY_RCU,
115 };
116 EXPORT_SYMBOL_GPL(smc_proto6);
117
118 static int smc_release(struct socket *sock)
119 {
120         struct sock *sk = sock->sk;
121         struct smc_sock *smc;
122         int rc = 0;
123
124         if (!sk)
125                 goto out;
126
127         smc = smc_sk(sk);
128
129         /* cleanup for a dangling non-blocking connect */
130         if (smc->connect_info && sk->sk_state == SMC_INIT)
131                 tcp_abort(smc->clcsock->sk, ECONNABORTED);
132         flush_work(&smc->connect_work);
133         kfree(smc->connect_info);
134         smc->connect_info = NULL;
135
136         if (sk->sk_state == SMC_LISTEN)
137                 /* smc_close_non_accepted() is called and acquires
138                  * sock lock for child sockets again
139                  */
140                 lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
141         else
142                 lock_sock(sk);
143
144         if (!smc->use_fallback) {
145                 rc = smc_close_active(smc);
146                 sock_set_flag(sk, SOCK_DEAD);
147                 sk->sk_shutdown |= SHUTDOWN_MASK;
148         }
149         if (smc->clcsock) {
150                 if (smc->use_fallback && sk->sk_state == SMC_LISTEN) {
151                         /* wake up clcsock accept */
152                         rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
153                 }
154                 mutex_lock(&smc->clcsock_release_lock);
155                 sock_release(smc->clcsock);
156                 smc->clcsock = NULL;
157                 mutex_unlock(&smc->clcsock_release_lock);
158         }
159         if (smc->use_fallback) {
160                 if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
161                         sock_put(sk); /* passive closing */
162                 sk->sk_state = SMC_CLOSED;
163                 sk->sk_state_change(sk);
164         }
165
166         /* detach socket */
167         sock_orphan(sk);
168         sock->sk = NULL;
169         if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
170                 smc_conn_free(&smc->conn);
171         release_sock(sk);
172
173         sk->sk_prot->unhash(sk);
174         sock_put(sk); /* final sock_put */
175 out:
176         return rc;
177 }
178
179 static void smc_destruct(struct sock *sk)
180 {
181         if (sk->sk_state != SMC_CLOSED)
182                 return;
183         if (!sock_flag(sk, SOCK_DEAD))
184                 return;
185
186         sk_refcnt_debug_dec(sk);
187 }
188
189 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
190                                    int protocol)
191 {
192         struct smc_sock *smc;
193         struct proto *prot;
194         struct sock *sk;
195
196         prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
197         sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
198         if (!sk)
199                 return NULL;
200
201         sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
202         sk->sk_state = SMC_INIT;
203         sk->sk_destruct = smc_destruct;
204         sk->sk_protocol = protocol;
205         smc = smc_sk(sk);
206         INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
207         INIT_WORK(&smc->connect_work, smc_connect_work);
208         INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
209         INIT_LIST_HEAD(&smc->accept_q);
210         spin_lock_init(&smc->accept_q_lock);
211         spin_lock_init(&smc->conn.send_lock);
212         sk->sk_prot->hash(sk);
213         sk_refcnt_debug_inc(sk);
214         mutex_init(&smc->clcsock_release_lock);
215
216         return sk;
217 }
218
219 static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
220                     int addr_len)
221 {
222         struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
223         struct sock *sk = sock->sk;
224         struct smc_sock *smc;
225         int rc;
226
227         smc = smc_sk(sk);
228
229         /* replicate tests from inet_bind(), to be safe wrt. future changes */
230         rc = -EINVAL;
231         if (addr_len < sizeof(struct sockaddr_in))
232                 goto out;
233
234         rc = -EAFNOSUPPORT;
235         if (addr->sin_family != AF_INET &&
236             addr->sin_family != AF_INET6 &&
237             addr->sin_family != AF_UNSPEC)
238                 goto out;
239         /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
240         if (addr->sin_family == AF_UNSPEC &&
241             addr->sin_addr.s_addr != htonl(INADDR_ANY))
242                 goto out;
243
244         lock_sock(sk);
245
246         /* Check if socket is already active */
247         rc = -EINVAL;
248         if (sk->sk_state != SMC_INIT)
249                 goto out_rel;
250
251         smc->clcsock->sk->sk_reuse = sk->sk_reuse;
252         rc = kernel_bind(smc->clcsock, uaddr, addr_len);
253
254 out_rel:
255         release_sock(sk);
256 out:
257         return rc;
258 }
259
260 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
261                                    unsigned long mask)
262 {
263         /* options we don't get control via setsockopt for */
264         nsk->sk_type = osk->sk_type;
265         nsk->sk_sndbuf = osk->sk_sndbuf;
266         nsk->sk_rcvbuf = osk->sk_rcvbuf;
267         nsk->sk_sndtimeo = osk->sk_sndtimeo;
268         nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
269         nsk->sk_mark = osk->sk_mark;
270         nsk->sk_priority = osk->sk_priority;
271         nsk->sk_rcvlowat = osk->sk_rcvlowat;
272         nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
273         nsk->sk_err = osk->sk_err;
274
275         nsk->sk_flags &= ~mask;
276         nsk->sk_flags |= osk->sk_flags & mask;
277 }
278
/* sk_flags bits taken over from the smc socket to the clc socket */
#define SK_FLAGS_SMC_TO_CLC \
	((1UL << SOCK_URGINLINE) | (1UL << SOCK_KEEPOPEN) | \
	 (1UL << SOCK_LINGER) | (1UL << SOCK_BROADCAST) | \
	 (1UL << SOCK_TIMESTAMP) | (1UL << SOCK_DBG) | \
	 (1UL << SOCK_RCVTSTAMP) | (1UL << SOCK_RCVTSTAMPNS) | \
	 (1UL << SOCK_LOCALROUTE) | \
	 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
	 (1UL << SOCK_RXQ_OVFL) | (1UL << SOCK_WIFI_STATUS) | \
	 (1UL << SOCK_NOFCS) | (1UL << SOCK_FILTER_LOCKED))
293 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
294  * clc socket (since smc is not called for these options from net/core)
295  */
296 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
297 {
298         smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
299 }
300
/* sk_flags bits taken over from the clc socket to the smc socket */
#define SK_FLAGS_CLC_TO_SMC \
	((1UL << SOCK_URGINLINE) | (1UL << SOCK_KEEPOPEN) | \
	 (1UL << SOCK_LINGER) | (1UL << SOCK_DBG))
305 /* copy only settings and flags relevant for smc from clc to smc socket */
306 static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
307 {
308         smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
309 }
310
311 /* register a new rmb, send confirm_rkey msg to register with peer */
312 static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
313                        bool conf_rkey)
314 {
315         if (!rmb_desc->wr_reg) {
316                 /* register memory region for new rmb */
317                 if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
318                         rmb_desc->regerr = 1;
319                         return -EFAULT;
320                 }
321                 rmb_desc->wr_reg = 1;
322         }
323         if (!conf_rkey)
324                 return 0;
325         /* exchange confirm_rkey msg with peer */
326         if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
327                 rmb_desc->regerr = 1;
328                 return -EFAULT;
329         }
330         return 0;
331 }
332
333 static int smc_clnt_conf_first_link(struct smc_sock *smc)
334 {
335         struct net *net = sock_net(smc->clcsock->sk);
336         struct smc_link_group *lgr = smc->conn.lgr;
337         struct smc_link *link;
338         int rest;
339         int rc;
340
341         link = &lgr->lnk[SMC_SINGLE_LINK];
342         /* receive CONFIRM LINK request from server over RoCE fabric */
343         rest = wait_for_completion_interruptible_timeout(
344                 &link->llc_confirm,
345                 SMC_LLC_WAIT_FIRST_TIME);
346         if (rest <= 0) {
347                 struct smc_clc_msg_decline dclc;
348
349                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
350                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
351                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
352         }
353
354         if (link->llc_confirm_rc)
355                 return SMC_CLC_DECL_RMBE_EC;
356
357         rc = smc_ib_modify_qp_rts(link);
358         if (rc)
359                 return SMC_CLC_DECL_ERR_RDYLNK;
360
361         smc_wr_remember_qp_attr(link);
362
363         if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
364                 return SMC_CLC_DECL_ERR_REGRMB;
365
366         /* send CONFIRM LINK response over RoCE fabric */
367         rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
368         if (rc < 0)
369                 return SMC_CLC_DECL_TIMEOUT_CL;
370
371         /* receive ADD LINK request from server over RoCE fabric */
372         rest = wait_for_completion_interruptible_timeout(&link->llc_add,
373                                                          SMC_LLC_WAIT_TIME);
374         if (rest <= 0) {
375                 struct smc_clc_msg_decline dclc;
376
377                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
378                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
379                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
380         }
381
382         /* send add link reject message, only one link supported for now */
383         rc = smc_llc_send_add_link(link,
384                                    link->smcibdev->mac[link->ibport - 1],
385                                    link->gid, SMC_LLC_RESP);
386         if (rc < 0)
387                 return SMC_CLC_DECL_TIMEOUT_AL;
388
389         smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
390
391         return 0;
392 }
393
394 static void smcr_conn_save_peer_info(struct smc_sock *smc,
395                                      struct smc_clc_msg_accept_confirm *clc)
396 {
397         int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
398
399         smc->conn.peer_rmbe_idx = clc->rmbe_idx;
400         smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
401         smc->conn.peer_rmbe_size = bufsize;
402         atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
403         smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
404 }
405
406 static void smcd_conn_save_peer_info(struct smc_sock *smc,
407                                      struct smc_clc_msg_accept_confirm *clc)
408 {
409         int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
410
411         smc->conn.peer_rmbe_idx = clc->dmbe_idx;
412         smc->conn.peer_token = clc->token;
413         /* msg header takes up space in the buffer */
414         smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
415         atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
416         smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
417 }
418
419 static void smc_conn_save_peer_info(struct smc_sock *smc,
420                                     struct smc_clc_msg_accept_confirm *clc)
421 {
422         if (smc->conn.lgr->is_smcd)
423                 smcd_conn_save_peer_info(smc, clc);
424         else
425                 smcr_conn_save_peer_info(smc, clc);
426 }
427
428 static void smc_link_save_peer_info(struct smc_link *link,
429                                     struct smc_clc_msg_accept_confirm *clc)
430 {
431         link->peer_qpn = ntoh24(clc->qpn);
432         memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
433         memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
434         link->peer_psn = ntoh24(clc->psn);
435         link->peer_mtu = clc->qp_mtu;
436 }
437
438 /* fall back during connect */
439 static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
440 {
441         smc->use_fallback = true;
442         smc->fallback_rsn = reason_code;
443         smc_copy_sock_settings_to_clc(smc);
444         if (smc->sk.sk_state == SMC_INIT)
445                 smc->sk.sk_state = SMC_ACTIVE;
446         return 0;
447 }
448
449 /* decline and fall back during connect */
450 static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
451 {
452         int rc;
453
454         if (reason_code < 0) { /* error, fallback is not possible */
455                 if (smc->sk.sk_state == SMC_INIT)
456                         sock_put(&smc->sk); /* passive closing */
457                 return reason_code;
458         }
459         if (reason_code != SMC_CLC_DECL_PEERDECL) {
460                 rc = smc_clc_send_decline(smc, reason_code);
461                 if (rc < 0) {
462                         if (smc->sk.sk_state == SMC_INIT)
463                                 sock_put(&smc->sk); /* passive closing */
464                         return rc;
465                 }
466         }
467         return smc_connect_fallback(smc, reason_code);
468 }
469
470 /* abort connecting */
471 static int smc_connect_abort(struct smc_sock *smc, int reason_code,
472                              int local_contact)
473 {
474         if (local_contact == SMC_FIRST_CONTACT)
475                 smc_lgr_forget(smc->conn.lgr);
476         mutex_unlock(&smc_create_lgr_pending);
477         smc_conn_free(&smc->conn);
478         return reason_code;
479 }
480
481 /* check if there is a rdma device available for this connection. */
482 /* called for connect and listen */
483 static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
484                           u8 *ibport, unsigned short vlan_id, u8 gid[])
485 {
486         int reason_code = 0;
487
488         /* PNET table look up: search active ib_device and port
489          * within same PNETID that also contains the ethernet device
490          * used for the internal TCP socket
491          */
492         smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id,
493                                     gid);
494         if (!(*ibdev))
495                 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
496
497         return reason_code;
498 }
499
500 /* check if there is an ISM device available for this connection. */
501 /* called for connect and listen */
502 static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev)
503 {
504         /* Find ISM device with same PNETID as connecting interface  */
505         smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev);
506         if (!(*ismdev))
507                 return SMC_CLC_DECL_CNFERR; /* configuration error */
508         return 0;
509 }
510
511 /* Check for VLAN ID and register it on ISM device just for CLC handshake */
512 static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
513                                       struct smcd_dev *ismdev,
514                                       unsigned short vlan_id)
515 {
516         if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id))
517                 return SMC_CLC_DECL_CNFERR;
518         return 0;
519 }
520
521 /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
522  * used, the VLAN ID will be registered again during the connection setup.
523  */
524 static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
525                                         struct smcd_dev *ismdev,
526                                         unsigned short vlan_id)
527 {
528         if (!is_smcd)
529                 return 0;
530         if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id))
531                 return SMC_CLC_DECL_CNFERR;
532         return 0;
533 }
534
535 /* CLC handshake during connect */
536 static int smc_connect_clc(struct smc_sock *smc, int smc_type,
537                            struct smc_clc_msg_accept_confirm *aclc,
538                            struct smc_ib_device *ibdev, u8 ibport,
539                            u8 gid[], struct smcd_dev *ismdev)
540 {
541         int rc = 0;
542
543         /* do inband token exchange */
544         rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev);
545         if (rc)
546                 return rc;
547         /* receive SMC Accept CLC message */
548         return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT,
549                                 CLC_WAIT_TIME);
550 }
551
552 /* setup for RDMA connection of client */
553 static int smc_connect_rdma(struct smc_sock *smc,
554                             struct smc_clc_msg_accept_confirm *aclc,
555                             struct smc_ib_device *ibdev, u8 ibport)
556 {
557         int local_contact = SMC_FIRST_CONTACT;
558         struct smc_link *link;
559         int reason_code = 0;
560
561         mutex_lock(&smc_create_lgr_pending);
562         local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev,
563                                         ibport, ntoh24(aclc->qpn), &aclc->lcl,
564                                         NULL, 0);
565         if (local_contact < 0) {
566                 if (local_contact == -ENOMEM)
567                         reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
568                 else if (local_contact == -ENOLINK)
569                         reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
570                 else
571                         reason_code = SMC_CLC_DECL_INTERR; /* other error */
572                 return smc_connect_abort(smc, reason_code, 0);
573         }
574         link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
575
576         smc_conn_save_peer_info(smc, aclc);
577
578         /* create send buffer and rmb */
579         if (smc_buf_create(smc, false))
580                 return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
581
582         if (local_contact == SMC_FIRST_CONTACT)
583                 smc_link_save_peer_info(link, aclc);
584
585         if (smc_rmb_rtoken_handling(&smc->conn, aclc))
586                 return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
587                                          local_contact);
588
589         smc_close_init(smc);
590         smc_rx_init(smc);
591
592         if (local_contact == SMC_FIRST_CONTACT) {
593                 if (smc_ib_ready_link(link))
594                         return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
595                                                  local_contact);
596         } else {
597                 if (smc_reg_rmb(link, smc->conn.rmb_desc, true))
598                         return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
599                                                  local_contact);
600         }
601         smc_rmb_sync_sg_for_device(&smc->conn);
602
603         reason_code = smc_clc_send_confirm(smc);
604         if (reason_code)
605                 return smc_connect_abort(smc, reason_code, local_contact);
606
607         smc_tx_init(smc);
608
609         if (local_contact == SMC_FIRST_CONTACT) {
610                 /* QP confirmation over RoCE fabric */
611                 reason_code = smc_clnt_conf_first_link(smc);
612                 if (reason_code)
613                         return smc_connect_abort(smc, reason_code,
614                                                  local_contact);
615         }
616         mutex_unlock(&smc_create_lgr_pending);
617
618         smc_copy_sock_settings_to_clc(smc);
619         if (smc->sk.sk_state == SMC_INIT)
620                 smc->sk.sk_state = SMC_ACTIVE;
621
622         return 0;
623 }
624
625 /* setup for ISM connection of client */
626 static int smc_connect_ism(struct smc_sock *smc,
627                            struct smc_clc_msg_accept_confirm *aclc,
628                            struct smcd_dev *ismdev)
629 {
630         int local_contact = SMC_FIRST_CONTACT;
631         int rc = 0;
632
633         mutex_lock(&smc_create_lgr_pending);
634         local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0, 0,
635                                         NULL, ismdev, aclc->gid);
636         if (local_contact < 0)
637                 return smc_connect_abort(smc, SMC_CLC_DECL_MEM, 0);
638
639         /* Create send and receive buffers */
640         if (smc_buf_create(smc, true))
641                 return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
642
643         smc_conn_save_peer_info(smc, aclc);
644         smc_close_init(smc);
645         smc_rx_init(smc);
646         smc_tx_init(smc);
647
648         rc = smc_clc_send_confirm(smc);
649         if (rc)
650                 return smc_connect_abort(smc, rc, local_contact);
651         mutex_unlock(&smc_create_lgr_pending);
652
653         smc_copy_sock_settings_to_clc(smc);
654         if (smc->sk.sk_state == SMC_INIT)
655                 smc->sk.sk_state = SMC_ACTIVE;
656
657         return 0;
658 }
659
660 /* perform steps before actually connecting */
661 static int __smc_connect(struct smc_sock *smc)
662 {
663         bool ism_supported = false, rdma_supported = false;
664         struct smc_clc_msg_accept_confirm aclc;
665         struct smc_ib_device *ibdev;
666         struct smcd_dev *ismdev;
667         u8 gid[SMC_GID_SIZE];
668         unsigned short vlan;
669         int smc_type;
670         int rc = 0;
671         u8 ibport;
672
673         sock_hold(&smc->sk); /* sock put in passive closing */
674
675         if (smc->use_fallback)
676                 return smc_connect_fallback(smc, smc->fallback_rsn);
677
678         /* if peer has not signalled SMC-capability, fall back */
679         if (!tcp_sk(smc->clcsock->sk)->syn_smc)
680                 return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
681
682         /* IPSec connections opt out of SMC-R optimizations */
683         if (using_ipsec(smc))
684                 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
685
686         /* check for VLAN ID */
687         if (smc_vlan_by_tcpsk(smc->clcsock, &vlan))
688                 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);
689
690         /* check if there is an ism device available */
691         if (!smc_check_ism(smc, &ismdev) &&
692             !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) {
693                 /* ISM is supported for this connection */
694                 ism_supported = true;
695                 smc_type = SMC_TYPE_D;
696         }
697
698         /* check if there is a rdma device available */
699         if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) {
700                 /* RDMA is supported for this connection */
701                 rdma_supported = true;
702                 if (ism_supported)
703                         smc_type = SMC_TYPE_B; /* both */
704                 else
705                         smc_type = SMC_TYPE_R; /* only RDMA */
706         }
707
708         /* if neither ISM nor RDMA are supported, fallback */
709         if (!rdma_supported && !ism_supported)
710                 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
711
712         /* perform CLC handshake */
713         rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev);
714         if (rc) {
715                 smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
716                 return smc_connect_decline_fallback(smc, rc);
717         }
718
719         /* depending on previous steps, connect using rdma or ism */
720         if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
721                 rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
722         else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
723                 rc = smc_connect_ism(smc, &aclc, ismdev);
724         else
725                 rc = SMC_CLC_DECL_MODEUNSUPP;
726         if (rc) {
727                 smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
728                 return smc_connect_decline_fallback(smc, rc);
729         }
730
731         smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
732         return 0;
733 }
734
735 static void smc_connect_work(struct work_struct *work)
736 {
737         struct smc_sock *smc = container_of(work, struct smc_sock,
738                                             connect_work);
739         int rc;
740
741         lock_sock(&smc->sk);
742         rc = kernel_connect(smc->clcsock, &smc->connect_info->addr,
743                             smc->connect_info->alen, smc->connect_info->flags);
744         if (smc->clcsock->sk->sk_err) {
745                 smc->sk.sk_err = smc->clcsock->sk->sk_err;
746                 goto out;
747         }
748         if (rc < 0) {
749                 smc->sk.sk_err = -rc;
750                 goto out;
751         }
752
753         rc = __smc_connect(smc);
754         if (rc < 0)
755                 smc->sk.sk_err = -rc;
756
757 out:
758         if (smc->sk.sk_err)
759                 smc->sk.sk_state_change(&smc->sk);
760         else
761                 smc->sk.sk_write_space(&smc->sk);
762         kfree(smc->connect_info);
763         smc->connect_info = NULL;
764         release_sock(&smc->sk);
765 }
766
767 static int smc_connect(struct socket *sock, struct sockaddr *addr,
768                        int alen, int flags)
769 {
770         struct sock *sk = sock->sk;
771         struct smc_sock *smc;
772         int rc = -EINVAL;
773
774         smc = smc_sk(sk);
775
776         /* separate smc parameter checking to be safe */
777         if (alen < sizeof(addr->sa_family))
778                 goto out_err;
779         if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
780                 goto out_err;
781
782         lock_sock(sk);
783         switch (sk->sk_state) {
784         default:
785                 goto out;
786         case SMC_ACTIVE:
787                 rc = -EISCONN;
788                 goto out;
789         case SMC_INIT:
790                 rc = 0;
791                 break;
792         }
793
794         smc_copy_sock_settings_to_clc(smc);
795         tcp_sk(smc->clcsock->sk)->syn_smc = 1;
796         if (flags & O_NONBLOCK) {
797                 if (smc->connect_info) {
798                         rc = -EALREADY;
799                         goto out;
800                 }
801                 smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL);
802                 if (!smc->connect_info) {
803                         rc = -ENOMEM;
804                         goto out;
805                 }
806                 smc->connect_info->alen = alen;
807                 smc->connect_info->flags = flags ^ O_NONBLOCK;
808                 memcpy(&smc->connect_info->addr, addr, alen);
809                 schedule_work(&smc->connect_work);
810                 rc = -EINPROGRESS;
811         } else {
812                 rc = kernel_connect(smc->clcsock, addr, alen, flags);
813                 if (rc)
814                         goto out;
815
816                 rc = __smc_connect(smc);
817                 if (rc < 0)
818                         goto out;
819                 else
820                         rc = 0; /* success cases including fallback */
821         }
822
823 out:
824         release_sock(sk);
825 out_err:
826         return rc;
827 }
828
829 static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
830 {
831         struct socket *new_clcsock = NULL;
832         struct sock *lsk = &lsmc->sk;
833         struct sock *new_sk;
834         int rc = -EINVAL;
835
836         release_sock(lsk);
837         new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
838         if (!new_sk) {
839                 rc = -ENOMEM;
840                 lsk->sk_err = ENOMEM;
841                 *new_smc = NULL;
842                 lock_sock(lsk);
843                 goto out;
844         }
845         *new_smc = smc_sk(new_sk);
846
847         mutex_lock(&lsmc->clcsock_release_lock);
848         if (lsmc->clcsock)
849                 rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
850         mutex_unlock(&lsmc->clcsock_release_lock);
851         lock_sock(lsk);
852         if  (rc < 0)
853                 lsk->sk_err = -rc;
854         if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
855                 if (new_clcsock)
856                         sock_release(new_clcsock);
857                 new_sk->sk_state = SMC_CLOSED;
858                 sock_set_flag(new_sk, SOCK_DEAD);
859                 new_sk->sk_prot->unhash(new_sk);
860                 sock_put(new_sk); /* final */
861                 *new_smc = NULL;
862                 goto out;
863         }
864
865         (*new_smc)->clcsock = new_clcsock;
866 out:
867         return rc;
868 }
869
870 /* add a just created sock to the accept queue of the listen sock as
871  * candidate for a following socket accept call from user space
872  */
873 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
874 {
875         struct smc_sock *par = smc_sk(parent);
876
877         sock_hold(sk); /* sock_put in smc_accept_unlink () */
878         spin_lock(&par->accept_q_lock);
879         list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
880         spin_unlock(&par->accept_q_lock);
881         sk_acceptq_added(parent);
882 }
883
884 /* remove a socket from the accept queue of its parental listening socket */
885 static void smc_accept_unlink(struct sock *sk)
886 {
887         struct smc_sock *par = smc_sk(sk)->listen_smc;
888
889         spin_lock(&par->accept_q_lock);
890         list_del_init(&smc_sk(sk)->accept_q);
891         spin_unlock(&par->accept_q_lock);
892         sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
893         sock_put(sk); /* sock_hold in smc_accept_enqueue */
894 }
895
896 /* remove a sock from the accept queue to bind it to a new socket created
897  * for a socket accept call from user space
898  */
899 struct sock *smc_accept_dequeue(struct sock *parent,
900                                 struct socket *new_sock)
901 {
902         struct smc_sock *isk, *n;
903         struct sock *new_sk;
904
905         list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
906                 new_sk = (struct sock *)isk;
907
908                 smc_accept_unlink(new_sk);
909                 if (new_sk->sk_state == SMC_CLOSED) {
910                         if (isk->clcsock) {
911                                 sock_release(isk->clcsock);
912                                 isk->clcsock = NULL;
913                         }
914                         new_sk->sk_prot->unhash(new_sk);
915                         sock_put(new_sk); /* final */
916                         continue;
917                 }
918                 if (new_sock)
919                         sock_graft(new_sk, new_sock);
920                 return new_sk;
921         }
922         return NULL;
923 }
924
925 /* clean up for a created but never accepted sock */
926 void smc_close_non_accepted(struct sock *sk)
927 {
928         struct smc_sock *smc = smc_sk(sk);
929
930         lock_sock(sk);
931         if (!sk->sk_lingertime)
932                 /* wait for peer closing */
933                 sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
934         if (!smc->use_fallback) {
935                 smc_close_active(smc);
936                 sock_set_flag(sk, SOCK_DEAD);
937                 sk->sk_shutdown |= SHUTDOWN_MASK;
938         }
939         if (smc->clcsock) {
940                 struct socket *tcp;
941
942                 tcp = smc->clcsock;
943                 smc->clcsock = NULL;
944                 sock_release(tcp);
945         }
946         if (smc->use_fallback) {
947                 sock_put(sk); /* passive closing */
948                 sk->sk_state = SMC_CLOSED;
949         } else {
950                 if (sk->sk_state == SMC_CLOSED)
951                         smc_conn_free(&smc->conn);
952         }
953         release_sock(sk);
954         sk->sk_prot->unhash(sk);
955         sock_put(sk); /* final sock_put */
956 }
957
958 static int smc_serv_conf_first_link(struct smc_sock *smc)
959 {
960         struct net *net = sock_net(smc->clcsock->sk);
961         struct smc_link_group *lgr = smc->conn.lgr;
962         struct smc_link *link;
963         int rest;
964         int rc;
965
966         link = &lgr->lnk[SMC_SINGLE_LINK];
967
968         if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
969                 return SMC_CLC_DECL_ERR_REGRMB;
970
971         /* send CONFIRM LINK request to client over the RoCE fabric */
972         rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
973         if (rc < 0)
974                 return SMC_CLC_DECL_TIMEOUT_CL;
975
976         /* receive CONFIRM LINK response from client over the RoCE fabric */
977         rest = wait_for_completion_interruptible_timeout(
978                 &link->llc_confirm_resp,
979                 SMC_LLC_WAIT_FIRST_TIME);
980         if (rest <= 0) {
981                 struct smc_clc_msg_decline dclc;
982
983                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
984                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
985                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
986         }
987
988         if (link->llc_confirm_resp_rc)
989                 return SMC_CLC_DECL_RMBE_EC;
990
991         /* send ADD LINK request to client over the RoCE fabric */
992         rc = smc_llc_send_add_link(link,
993                                    link->smcibdev->mac[link->ibport - 1],
994                                    link->gid, SMC_LLC_REQ);
995         if (rc < 0)
996                 return SMC_CLC_DECL_TIMEOUT_AL;
997
998         /* receive ADD LINK response from client over the RoCE fabric */
999         rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
1000                                                          SMC_LLC_WAIT_TIME);
1001         if (rest <= 0) {
1002                 struct smc_clc_msg_decline dclc;
1003
1004                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
1005                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1006                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
1007         }
1008
1009         smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
1010
1011         return 0;
1012 }
1013
1014 /* listen worker: finish */
1015 static void smc_listen_out(struct smc_sock *new_smc)
1016 {
1017         struct smc_sock *lsmc = new_smc->listen_smc;
1018         struct sock *newsmcsk = &new_smc->sk;
1019
1020         lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
1021         if (lsmc->sk.sk_state == SMC_LISTEN) {
1022                 smc_accept_enqueue(&lsmc->sk, newsmcsk);
1023         } else { /* no longer listening */
1024                 smc_close_non_accepted(newsmcsk);
1025         }
1026         release_sock(&lsmc->sk);
1027
1028         /* Wake up accept */
1029         lsmc->sk.sk_data_ready(&lsmc->sk);
1030         sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
1031 }
1032
1033 /* listen worker: finish in state connected */
1034 static void smc_listen_out_connected(struct smc_sock *new_smc)
1035 {
1036         struct sock *newsmcsk = &new_smc->sk;
1037
1038         sk_refcnt_debug_inc(newsmcsk);
1039         if (newsmcsk->sk_state == SMC_INIT)
1040                 newsmcsk->sk_state = SMC_ACTIVE;
1041
1042         smc_listen_out(new_smc);
1043 }
1044
1045 /* listen worker: finish in error state */
1046 static void smc_listen_out_err(struct smc_sock *new_smc)
1047 {
1048         struct sock *newsmcsk = &new_smc->sk;
1049
1050         if (newsmcsk->sk_state == SMC_INIT)
1051                 sock_put(&new_smc->sk); /* passive closing */
1052         newsmcsk->sk_state = SMC_CLOSED;
1053         smc_conn_free(&new_smc->conn);
1054
1055         smc_listen_out(new_smc);
1056 }
1057
1058 /* listen worker: decline and fall back if possible */
1059 static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
1060                                int local_contact)
1061 {
1062         /* RDMA setup failed, switch back to TCP */
1063         if (local_contact == SMC_FIRST_CONTACT)
1064                 smc_lgr_forget(new_smc->conn.lgr);
1065         if (reason_code < 0) { /* error, no fallback possible */
1066                 smc_listen_out_err(new_smc);
1067                 return;
1068         }
1069         smc_conn_free(&new_smc->conn);
1070         new_smc->use_fallback = true;
1071         new_smc->fallback_rsn = reason_code;
1072         if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
1073                 if (smc_clc_send_decline(new_smc, reason_code) < 0) {
1074                         smc_listen_out_err(new_smc);
1075                         return;
1076                 }
1077         }
1078         smc_listen_out_connected(new_smc);
1079 }
1080
1081 /* listen worker: check prefixes */
1082 static int smc_listen_rdma_check(struct smc_sock *new_smc,
1083                                  struct smc_clc_msg_proposal *pclc)
1084 {
1085         struct smc_clc_msg_proposal_prefix *pclc_prfx;
1086         struct socket *newclcsock = new_smc->clcsock;
1087
1088         pclc_prfx = smc_clc_proposal_get_prefix(pclc);
1089         if (smc_clc_prfx_match(newclcsock, pclc_prfx))
1090                 return SMC_CLC_DECL_CNFERR;
1091
1092         return 0;
1093 }
1094
1095 /* listen worker: initialize connection and buffers */
1096 static int smc_listen_rdma_init(struct smc_sock *new_smc,
1097                                 struct smc_clc_msg_proposal *pclc,
1098                                 struct smc_ib_device *ibdev, u8 ibport,
1099                                 int *local_contact)
1100 {
1101         /* allocate connection / link group */
1102         *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport, 0,
1103                                          &pclc->lcl, NULL, 0);
1104         if (*local_contact < 0) {
1105                 if (*local_contact == -ENOMEM)
1106                         return SMC_CLC_DECL_MEM;/* insufficient memory*/
1107                 return SMC_CLC_DECL_INTERR; /* other error */
1108         }
1109
1110         /* create send buffer and rmb */
1111         if (smc_buf_create(new_smc, false))
1112                 return SMC_CLC_DECL_MEM;
1113
1114         return 0;
1115 }
1116
1117 /* listen worker: initialize connection and buffers for SMC-D */
1118 static int smc_listen_ism_init(struct smc_sock *new_smc,
1119                                struct smc_clc_msg_proposal *pclc,
1120                                struct smcd_dev *ismdev,
1121                                int *local_contact)
1122 {
1123         struct smc_clc_msg_smcd *pclc_smcd;
1124
1125         pclc_smcd = smc_get_clc_msg_smcd(pclc);
1126         *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, 0, NULL,
1127                                          ismdev, pclc_smcd->gid);
1128         if (*local_contact < 0) {
1129                 if (*local_contact == -ENOMEM)
1130                         return SMC_CLC_DECL_MEM;/* insufficient memory*/
1131                 return SMC_CLC_DECL_INTERR; /* other error */
1132         }
1133
1134         /* Check if peer can be reached via ISM device */
1135         if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
1136                             new_smc->conn.lgr->vlan_id,
1137                             new_smc->conn.lgr->smcd)) {
1138                 if (*local_contact == SMC_FIRST_CONTACT)
1139                         smc_lgr_forget(new_smc->conn.lgr);
1140                 smc_conn_free(&new_smc->conn);
1141                 return SMC_CLC_DECL_CNFERR;
1142         }
1143
1144         /* Create send and receive buffers */
1145         if (smc_buf_create(new_smc, true)) {
1146                 if (*local_contact == SMC_FIRST_CONTACT)
1147                         smc_lgr_forget(new_smc->conn.lgr);
1148                 smc_conn_free(&new_smc->conn);
1149                 return SMC_CLC_DECL_MEM;
1150         }
1151
1152         return 0;
1153 }
1154
1155 /* listen worker: register buffers */
1156 static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
1157 {
1158         struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1159
1160         if (local_contact != SMC_FIRST_CONTACT) {
1161                 if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
1162                         return SMC_CLC_DECL_ERR_REGRMB;
1163         }
1164         smc_rmb_sync_sg_for_device(&new_smc->conn);
1165
1166         return 0;
1167 }
1168
1169 /* listen worker: finish RDMA setup */
1170 static int smc_listen_rdma_finish(struct smc_sock *new_smc,
1171                                   struct smc_clc_msg_accept_confirm *cclc,
1172                                   int local_contact)
1173 {
1174         struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1175         int reason_code = 0;
1176
1177         if (local_contact == SMC_FIRST_CONTACT)
1178                 smc_link_save_peer_info(link, cclc);
1179
1180         if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
1181                 reason_code = SMC_CLC_DECL_ERR_RTOK;
1182                 goto decline;
1183         }
1184
1185         if (local_contact == SMC_FIRST_CONTACT) {
1186                 if (smc_ib_ready_link(link)) {
1187                         reason_code = SMC_CLC_DECL_ERR_RDYLNK;
1188                         goto decline;
1189                 }
1190                 /* QP confirmation over RoCE fabric */
1191                 reason_code = smc_serv_conf_first_link(new_smc);
1192                 if (reason_code)
1193                         goto decline;
1194         }
1195         return 0;
1196
1197 decline:
1198         smc_listen_decline(new_smc, reason_code, local_contact);
1199         return reason_code;
1200 }
1201
1202 /* setup for RDMA connection of server */
1203 static void smc_listen_work(struct work_struct *work)
1204 {
1205         struct smc_sock *new_smc = container_of(work, struct smc_sock,
1206                                                 smc_listen_work);
1207         struct socket *newclcsock = new_smc->clcsock;
1208         struct smc_clc_msg_accept_confirm cclc;
1209         struct smc_clc_msg_proposal *pclc;
1210         struct smc_ib_device *ibdev;
1211         bool ism_supported = false;
1212         struct smcd_dev *ismdev;
1213         u8 buf[SMC_CLC_MAX_LEN];
1214         int local_contact = 0;
1215         unsigned short vlan;
1216         int reason_code = 0;
1217         int rc = 0;
1218         u8 ibport;
1219
1220         if (new_smc->use_fallback) {
1221                 smc_listen_out_connected(new_smc);
1222                 return;
1223         }
1224
1225         /* check if peer is smc capable */
1226         if (!tcp_sk(newclcsock->sk)->syn_smc) {
1227                 new_smc->use_fallback = true;
1228                 new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
1229                 smc_listen_out_connected(new_smc);
1230                 return;
1231         }
1232
1233         /* do inband token exchange -
1234          * wait for and receive SMC Proposal CLC message
1235          */
1236         pclc = (struct smc_clc_msg_proposal *)&buf;
1237         reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
1238                                        SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
1239         if (reason_code) {
1240                 smc_listen_decline(new_smc, reason_code, 0);
1241                 return;
1242         }
1243
1244         /* IPSec connections opt out of SMC-R optimizations */
1245         if (using_ipsec(new_smc)) {
1246                 smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0);
1247                 return;
1248         }
1249
1250         mutex_lock(&smc_create_lgr_pending);
1251         smc_close_init(new_smc);
1252         smc_rx_init(new_smc);
1253         smc_tx_init(new_smc);
1254
1255         /* check if ISM is available */
1256         if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) &&
1257             !smc_check_ism(new_smc, &ismdev) &&
1258             !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) {
1259                 ism_supported = true;
1260         }
1261
1262         /* check if RDMA is available */
1263         if (!ism_supported &&
1264             ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) ||
1265              smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) ||
1266              smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) ||
1267              smc_listen_rdma_check(new_smc, pclc) ||
1268              smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
1269                                   &local_contact) ||
1270              smc_listen_rdma_reg(new_smc, local_contact))) {
1271                 /* SMC not supported, decline */
1272                 mutex_unlock(&smc_create_lgr_pending);
1273                 smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP,
1274                                    local_contact);
1275                 return;
1276         }
1277
1278         /* send SMC Accept CLC message */
1279         rc = smc_clc_send_accept(new_smc, local_contact);
1280         if (rc) {
1281                 mutex_unlock(&smc_create_lgr_pending);
1282                 smc_listen_decline(new_smc, rc, local_contact);
1283                 return;
1284         }
1285
1286         /* receive SMC Confirm CLC message */
1287         reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
1288                                        SMC_CLC_CONFIRM, CLC_WAIT_TIME);
1289         if (reason_code) {
1290                 mutex_unlock(&smc_create_lgr_pending);
1291                 smc_listen_decline(new_smc, reason_code, local_contact);
1292                 return;
1293         }
1294
1295         /* finish worker */
1296         if (!ism_supported) {
1297                 if (smc_listen_rdma_finish(new_smc, &cclc, local_contact)) {
1298                         mutex_unlock(&smc_create_lgr_pending);
1299                         return;
1300                 }
1301         }
1302         smc_conn_save_peer_info(new_smc, &cclc);
1303         mutex_unlock(&smc_create_lgr_pending);
1304         smc_listen_out_connected(new_smc);
1305 }
1306
1307 static void smc_tcp_listen_work(struct work_struct *work)
1308 {
1309         struct smc_sock *lsmc = container_of(work, struct smc_sock,
1310                                              tcp_listen_work);
1311         struct sock *lsk = &lsmc->sk;
1312         struct smc_sock *new_smc;
1313         int rc = 0;
1314
1315         lock_sock(lsk);
1316         while (lsk->sk_state == SMC_LISTEN) {
1317                 rc = smc_clcsock_accept(lsmc, &new_smc);
1318                 if (rc)
1319                         goto out;
1320                 if (!new_smc)
1321                         continue;
1322
1323                 new_smc->listen_smc = lsmc;
1324                 new_smc->use_fallback = lsmc->use_fallback;
1325                 new_smc->fallback_rsn = lsmc->fallback_rsn;
1326                 sock_hold(lsk); /* sock_put in smc_listen_work */
1327                 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
1328                 smc_copy_sock_settings_to_smc(new_smc);
1329                 new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
1330                 new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
1331                 sock_hold(&new_smc->sk); /* sock_put in passive closing */
1332                 if (!schedule_work(&new_smc->smc_listen_work))
1333                         sock_put(&new_smc->sk);
1334         }
1335
1336 out:
1337         release_sock(lsk);
1338         sock_put(&lsmc->sk); /* sock_hold in smc_listen */
1339 }
1340
1341 static int smc_listen(struct socket *sock, int backlog)
1342 {
1343         struct sock *sk = sock->sk;
1344         struct smc_sock *smc;
1345         int rc;
1346
1347         smc = smc_sk(sk);
1348         lock_sock(sk);
1349
1350         rc = -EINVAL;
1351         if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
1352                 goto out;
1353
1354         rc = 0;
1355         if (sk->sk_state == SMC_LISTEN) {
1356                 sk->sk_max_ack_backlog = backlog;
1357                 goto out;
1358         }
1359         /* some socket options are handled in core, so we could not apply
1360          * them to the clc socket -- copy smc socket options to clc socket
1361          */
1362         smc_copy_sock_settings_to_clc(smc);
1363         if (!smc->use_fallback)
1364                 tcp_sk(smc->clcsock->sk)->syn_smc = 1;
1365
1366         rc = kernel_listen(smc->clcsock, backlog);
1367         if (rc)
1368                 goto out;
1369         sk->sk_max_ack_backlog = backlog;
1370         sk->sk_ack_backlog = 0;
1371         sk->sk_state = SMC_LISTEN;
1372         sock_hold(sk); /* sock_hold in tcp_listen_worker */
1373         if (!schedule_work(&smc->tcp_listen_work))
1374                 sock_put(sk);
1375
1376 out:
1377         release_sock(sk);
1378         return rc;
1379 }
1380
1381 static int smc_accept(struct socket *sock, struct socket *new_sock,
1382                       int flags, bool kern)
1383 {
1384         struct sock *sk = sock->sk, *nsk;
1385         DECLARE_WAITQUEUE(wait, current);
1386         struct smc_sock *lsmc;
1387         long timeo;
1388         int rc = 0;
1389
1390         lsmc = smc_sk(sk);
1391         sock_hold(sk); /* sock_put below */
1392         lock_sock(sk);
1393
1394         if (lsmc->sk.sk_state != SMC_LISTEN) {
1395                 rc = -EINVAL;
1396                 release_sock(sk);
1397                 goto out;
1398         }
1399
1400         /* Wait for an incoming connection */
1401         timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1402         add_wait_queue_exclusive(sk_sleep(sk), &wait);
1403         while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
1404                 set_current_state(TASK_INTERRUPTIBLE);
1405                 if (!timeo) {
1406                         rc = -EAGAIN;
1407                         break;
1408                 }
1409                 release_sock(sk);
1410                 timeo = schedule_timeout(timeo);
1411                 /* wakeup by sk_data_ready in smc_listen_work() */
1412                 sched_annotate_sleep();
1413                 lock_sock(sk);
1414                 if (signal_pending(current)) {
1415                         rc = sock_intr_errno(timeo);
1416                         break;
1417                 }
1418         }
1419         set_current_state(TASK_RUNNING);
1420         remove_wait_queue(sk_sleep(sk), &wait);
1421
1422         if (!rc)
1423                 rc = sock_error(nsk);
1424         release_sock(sk);
1425         if (rc)
1426                 goto out;
1427
1428         if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
1429                 /* wait till data arrives on the socket */
1430                 timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
1431                                                                 MSEC_PER_SEC);
1432                 if (smc_sk(nsk)->use_fallback) {
1433                         struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
1434
1435                         lock_sock(clcsk);
1436                         if (skb_queue_empty(&clcsk->sk_receive_queue))
1437                                 sk_wait_data(clcsk, &timeo, NULL);
1438                         release_sock(clcsk);
1439                 } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
1440                         lock_sock(nsk);
1441                         smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
1442                         release_sock(nsk);
1443                 }
1444         }
1445
1446 out:
1447         sock_put(sk); /* sock_hold above */
1448         return rc;
1449 }
1450
1451 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1452                        int peer)
1453 {
1454         struct smc_sock *smc;
1455
1456         if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1457             (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1458                 return -ENOTCONN;
1459
1460         smc = smc_sk(sock->sk);
1461
1462         return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
1463 }
1464
1465 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1466 {
1467         struct sock *sk = sock->sk;
1468         struct smc_sock *smc;
1469         int rc = -EPIPE;
1470
1471         smc = smc_sk(sk);
1472         lock_sock(sk);
1473         if ((sk->sk_state != SMC_ACTIVE) &&
1474             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1475             (sk->sk_state != SMC_INIT))
1476                 goto out;
1477
1478         if (msg->msg_flags & MSG_FASTOPEN) {
1479                 if (sk->sk_state == SMC_INIT) {
1480                         smc->use_fallback = true;
1481                         smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1482                 } else {
1483                         rc = -EINVAL;
1484                         goto out;
1485                 }
1486         }
1487
1488         if (smc->use_fallback)
1489                 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1490         else
1491                 rc = smc_tx_sendmsg(smc, msg, len);
1492 out:
1493         release_sock(sk);
1494         return rc;
1495 }
1496
1497 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1498                        int flags)
1499 {
1500         struct sock *sk = sock->sk;
1501         struct smc_sock *smc;
1502         int rc = -ENOTCONN;
1503
1504         smc = smc_sk(sk);
1505         lock_sock(sk);
1506         if ((sk->sk_state == SMC_INIT) ||
1507             (sk->sk_state == SMC_LISTEN) ||
1508             (sk->sk_state == SMC_CLOSED))
1509                 goto out;
1510
1511         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1512                 rc = 0;
1513                 goto out;
1514         }
1515
1516         if (smc->use_fallback) {
1517                 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1518         } else {
1519                 msg->msg_namelen = 0;
1520                 rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
1521         }
1522
1523 out:
1524         release_sock(sk);
1525         return rc;
1526 }
1527
1528 static __poll_t smc_accept_poll(struct sock *parent)
1529 {
1530         struct smc_sock *isk = smc_sk(parent);
1531         __poll_t mask = 0;
1532
1533         spin_lock(&isk->accept_q_lock);
1534         if (!list_empty(&isk->accept_q))
1535                 mask = EPOLLIN | EPOLLRDNORM;
1536         spin_unlock(&isk->accept_q_lock);
1537
1538         return mask;
1539 }
1540
1541 static __poll_t smc_poll(struct file *file, struct socket *sock,
1542                              poll_table *wait)
1543 {
1544         struct sock *sk = sock->sk;
1545         __poll_t mask = 0;
1546         struct smc_sock *smc;
1547
1548         if (!sk)
1549                 return EPOLLNVAL;
1550
1551         smc = smc_sk(sock->sk);
1552         if (smc->use_fallback) {
1553                 /* delegate to CLC child sock */
1554                 mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1555                 sk->sk_err = smc->clcsock->sk->sk_err;
1556                 if (sk->sk_err)
1557                         mask |= EPOLLERR;
1558         } else {
1559                 if (sk->sk_state != SMC_CLOSED)
1560                         sock_poll_wait(file, sock, wait);
1561                 if (sk->sk_err)
1562                         mask |= EPOLLERR;
1563                 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1564                     (sk->sk_state == SMC_CLOSED))
1565                         mask |= EPOLLHUP;
1566                 if (sk->sk_state == SMC_LISTEN) {
1567                         /* woken up by sk_data_ready in smc_listen_work() */
1568                         mask = smc_accept_poll(sk);
1569                 } else {
1570                         if (atomic_read(&smc->conn.sndbuf_space) ||
1571                             sk->sk_shutdown & SEND_SHUTDOWN) {
1572                                 mask |= EPOLLOUT | EPOLLWRNORM;
1573                         } else {
1574                                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1575                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1576                         }
1577                         if (atomic_read(&smc->conn.bytes_to_rcv))
1578                                 mask |= EPOLLIN | EPOLLRDNORM;
1579                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1580                                 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
1581                         if (sk->sk_state == SMC_APPCLOSEWAIT1)
1582                                 mask |= EPOLLIN;
1583                         if (smc->conn.urg_state == SMC_URG_VALID)
1584                                 mask |= EPOLLPRI;
1585                 }
1586         }
1587
1588         return mask;
1589 }
1590
1591 static int smc_shutdown(struct socket *sock, int how)
1592 {
1593         struct sock *sk = sock->sk;
1594         struct smc_sock *smc;
1595         int rc = -EINVAL;
1596         int rc1 = 0;
1597
1598         smc = smc_sk(sk);
1599
1600         if ((how < SHUT_RD) || (how > SHUT_RDWR))
1601                 return rc;
1602
1603         lock_sock(sk);
1604
1605         rc = -ENOTCONN;
1606         if ((sk->sk_state != SMC_ACTIVE) &&
1607             (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1608             (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1609             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1610             (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1611             (sk->sk_state != SMC_APPFINCLOSEWAIT))
1612                 goto out;
1613         if (smc->use_fallback) {
1614                 rc = kernel_sock_shutdown(smc->clcsock, how);
1615                 sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1616                 if (sk->sk_shutdown == SHUTDOWN_MASK)
1617                         sk->sk_state = SMC_CLOSED;
1618                 goto out;
1619         }
1620         switch (how) {
1621         case SHUT_RDWR:         /* shutdown in both directions */
1622                 rc = smc_close_active(smc);
1623                 break;
1624         case SHUT_WR:
1625                 rc = smc_close_shutdown_write(smc);
1626                 break;
1627         case SHUT_RD:
1628                 rc = 0;
1629                 /* nothing more to do because peer is not involved */
1630                 break;
1631         }
1632         if (smc->clcsock)
1633                 rc1 = kernel_sock_shutdown(smc->clcsock, how);
1634         /* map sock_shutdown_cmd constants to sk_shutdown value range */
1635         sk->sk_shutdown |= how + 1;
1636
1637 out:
1638         release_sock(sk);
1639         return rc ? rc : rc1;
1640 }
1641
1642 static int smc_setsockopt(struct socket *sock, int level, int optname,
1643                           char __user *optval, unsigned int optlen)
1644 {
1645         struct sock *sk = sock->sk;
1646         struct smc_sock *smc;
1647         int val, rc;
1648
1649         smc = smc_sk(sk);
1650
1651         /* generic setsockopts reaching us here always apply to the
1652          * CLC socket
1653          */
1654         rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1655                                            optval, optlen);
1656         if (smc->clcsock->sk->sk_err) {
1657                 sk->sk_err = smc->clcsock->sk->sk_err;
1658                 sk->sk_error_report(sk);
1659         }
1660         if (rc)
1661                 return rc;
1662
1663         if (optlen < sizeof(int))
1664                 return -EINVAL;
1665         if (get_user(val, (int __user *)optval))
1666                 return -EFAULT;
1667
1668         lock_sock(sk);
1669         switch (optname) {
1670         case TCP_ULP:
1671         case TCP_FASTOPEN:
1672         case TCP_FASTOPEN_CONNECT:
1673         case TCP_FASTOPEN_KEY:
1674         case TCP_FASTOPEN_NO_COOKIE:
1675                 /* option not supported by SMC */
1676                 if (sk->sk_state == SMC_INIT) {
1677                         smc->use_fallback = true;
1678                         smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1679                 } else {
1680                         if (!smc->use_fallback)
1681                                 rc = -EINVAL;
1682                 }
1683                 break;
1684         case TCP_NODELAY:
1685                 if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
1686                         if (val && !smc->use_fallback)
1687                                 mod_delayed_work(system_wq, &smc->conn.tx_work,
1688                                                  0);
1689                 }
1690                 break;
1691         case TCP_CORK:
1692                 if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
1693                         if (!val && !smc->use_fallback)
1694                                 mod_delayed_work(system_wq, &smc->conn.tx_work,
1695                                                  0);
1696                 }
1697                 break;
1698         case TCP_DEFER_ACCEPT:
1699                 smc->sockopt_defer_accept = val;
1700                 break;
1701         default:
1702                 break;
1703         }
1704         release_sock(sk);
1705
1706         return rc;
1707 }
1708
1709 static int smc_getsockopt(struct socket *sock, int level, int optname,
1710                           char __user *optval, int __user *optlen)
1711 {
1712         struct smc_sock *smc;
1713
1714         smc = smc_sk(sock->sk);
1715         /* socket options apply to the CLC socket */
1716         return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1717                                              optval, optlen);
1718 }
1719
1720 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1721                      unsigned long arg)
1722 {
1723         union smc_host_cursor cons, urg;
1724         struct smc_connection *conn;
1725         struct smc_sock *smc;
1726         int answ;
1727
1728         smc = smc_sk(sock->sk);
1729         conn = &smc->conn;
1730         lock_sock(&smc->sk);
1731         if (smc->use_fallback) {
1732                 if (!smc->clcsock) {
1733                         release_sock(&smc->sk);
1734                         return -EBADF;
1735                 }
1736                 answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1737                 release_sock(&smc->sk);
1738                 return answ;
1739         }
1740         switch (cmd) {
1741         case SIOCINQ: /* same as FIONREAD */
1742                 if (smc->sk.sk_state == SMC_LISTEN) {
1743                         release_sock(&smc->sk);
1744                         return -EINVAL;
1745                 }
1746                 if (smc->sk.sk_state == SMC_INIT ||
1747                     smc->sk.sk_state == SMC_CLOSED)
1748                         answ = 0;
1749                 else
1750                         answ = atomic_read(&smc->conn.bytes_to_rcv);
1751                 break;
1752         case SIOCOUTQ:
1753                 /* output queue size (not send + not acked) */
1754                 if (smc->sk.sk_state == SMC_LISTEN) {
1755                         release_sock(&smc->sk);
1756                         return -EINVAL;
1757                 }
1758                 if (smc->sk.sk_state == SMC_INIT ||
1759                     smc->sk.sk_state == SMC_CLOSED)
1760                         answ = 0;
1761                 else
1762                         answ = smc->conn.sndbuf_desc->len -
1763                                         atomic_read(&smc->conn.sndbuf_space);
1764                 break;
1765         case SIOCOUTQNSD:
1766                 /* output queue size (not send only) */
1767                 if (smc->sk.sk_state == SMC_LISTEN) {
1768                         release_sock(&smc->sk);
1769                         return -EINVAL;
1770                 }
1771                 if (smc->sk.sk_state == SMC_INIT ||
1772                     smc->sk.sk_state == SMC_CLOSED)
1773                         answ = 0;
1774                 else
1775                         answ = smc_tx_prepared_sends(&smc->conn);
1776                 break;
1777         case SIOCATMARK:
1778                 if (smc->sk.sk_state == SMC_LISTEN) {
1779                         release_sock(&smc->sk);
1780                         return -EINVAL;
1781                 }
1782                 if (smc->sk.sk_state == SMC_INIT ||
1783                     smc->sk.sk_state == SMC_CLOSED) {
1784                         answ = 0;
1785                 } else {
1786                         smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
1787                         smc_curs_copy(&urg, &conn->urg_curs, conn);
1788                         answ = smc_curs_diff(conn->rmb_desc->len,
1789                                              &cons, &urg) == 1;
1790                 }
1791                 break;
1792         default:
1793                 release_sock(&smc->sk);
1794                 return -ENOIOCTLCMD;
1795         }
1796         release_sock(&smc->sk);
1797
1798         return put_user(answ, (int __user *)arg);
1799 }
1800
1801 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1802                             int offset, size_t size, int flags)
1803 {
1804         struct sock *sk = sock->sk;
1805         struct smc_sock *smc;
1806         int rc = -EPIPE;
1807
1808         smc = smc_sk(sk);
1809         lock_sock(sk);
1810         if (sk->sk_state != SMC_ACTIVE) {
1811                 release_sock(sk);
1812                 goto out;
1813         }
1814         release_sock(sk);
1815         if (smc->use_fallback)
1816                 rc = kernel_sendpage(smc->clcsock, page, offset,
1817                                      size, flags);
1818         else
1819                 rc = sock_no_sendpage(sock, page, offset, size, flags);
1820
1821 out:
1822         return rc;
1823 }
1824
1825 /* Map the affected portions of the rmbe into an spd, note the number of bytes
1826  * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
1827  * updates till whenever a respective page has been fully processed.
1828  * Note that subsequent recv() calls have to wait till all splice() processing
1829  * completed.
1830  */
1831 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1832                                struct pipe_inode_info *pipe, size_t len,
1833                                unsigned int flags)
1834 {
1835         struct sock *sk = sock->sk;
1836         struct smc_sock *smc;
1837         int rc = -ENOTCONN;
1838
1839         smc = smc_sk(sk);
1840         lock_sock(sk);
1841
1842         if (sk->sk_state == SMC_INIT ||
1843             sk->sk_state == SMC_LISTEN ||
1844             sk->sk_state == SMC_CLOSED)
1845                 goto out;
1846
1847         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1848                 rc = 0;
1849                 goto out;
1850         }
1851
1852         if (smc->use_fallback) {
1853                 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1854                                                     pipe, len, flags);
1855         } else {
1856                 if (*ppos) {
1857                         rc = -ESPIPE;
1858                         goto out;
1859                 }
1860                 if (flags & SPLICE_F_NONBLOCK)
1861                         flags = MSG_DONTWAIT;
1862                 else
1863                         flags = 0;
1864                 rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
1865         }
1866 out:
1867         release_sock(sk);
1868
1869         return rc;
1870 }
1871
1872 /* must look like tcp */
1873 static const struct proto_ops smc_sock_ops = {
1874         .family         = PF_SMC,
1875         .owner          = THIS_MODULE,
1876         .release        = smc_release,
1877         .bind           = smc_bind,
1878         .connect        = smc_connect,
1879         .socketpair     = sock_no_socketpair,
1880         .accept         = smc_accept,
1881         .getname        = smc_getname,
1882         .poll           = smc_poll,
1883         .ioctl          = smc_ioctl,
1884         .listen         = smc_listen,
1885         .shutdown       = smc_shutdown,
1886         .setsockopt     = smc_setsockopt,
1887         .getsockopt     = smc_getsockopt,
1888         .sendmsg        = smc_sendmsg,
1889         .recvmsg        = smc_recvmsg,
1890         .mmap           = sock_no_mmap,
1891         .sendpage       = smc_sendpage,
1892         .splice_read    = smc_splice_read,
1893 };
1894
1895 static int smc_create(struct net *net, struct socket *sock, int protocol,
1896                       int kern)
1897 {
1898         int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
1899         struct smc_sock *smc;
1900         struct sock *sk;
1901         int rc;
1902
1903         rc = -ESOCKTNOSUPPORT;
1904         if (sock->type != SOCK_STREAM)
1905                 goto out;
1906
1907         rc = -EPROTONOSUPPORT;
1908         if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
1909                 goto out;
1910
1911         rc = -ENOBUFS;
1912         sock->ops = &smc_sock_ops;
1913         sk = smc_sock_alloc(net, sock, protocol);
1914         if (!sk)
1915                 goto out;
1916
1917         /* create internal TCP socket for CLC handshake and fallback */
1918         smc = smc_sk(sk);
1919         smc->use_fallback = false; /* assume rdma capability first */
1920         smc->fallback_rsn = 0;
1921         rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
1922                               &smc->clcsock);
1923         if (rc) {
1924                 sk_common_release(sk);
1925                 goto out;
1926         }
1927         smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1928         smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1929
1930 out:
1931         return rc;
1932 }
1933
1934 static const struct net_proto_family smc_sock_family_ops = {
1935         .family = PF_SMC,
1936         .owner  = THIS_MODULE,
1937         .create = smc_create,
1938 };
1939
1940 static int __init smc_init(void)
1941 {
1942         int rc;
1943
1944         rc = smc_pnet_init();
1945         if (rc)
1946                 return rc;
1947
1948         rc = smc_llc_init();
1949         if (rc) {
1950                 pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
1951                 goto out_pnet;
1952         }
1953
1954         rc = smc_cdc_init();
1955         if (rc) {
1956                 pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
1957                 goto out_pnet;
1958         }
1959
1960         rc = proto_register(&smc_proto, 1);
1961         if (rc) {
1962                 pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
1963                 goto out_pnet;
1964         }
1965
1966         rc = proto_register(&smc_proto6, 1);
1967         if (rc) {
1968                 pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
1969                 goto out_proto;
1970         }
1971
1972         rc = sock_register(&smc_sock_family_ops);
1973         if (rc) {
1974                 pr_err("%s: sock_register fails with %d\n", __func__, rc);
1975                 goto out_proto6;
1976         }
1977         INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
1978         INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
1979
1980         rc = smc_ib_register_client();
1981         if (rc) {
1982                 pr_err("%s: ib_register fails with %d\n", __func__, rc);
1983                 goto out_sock;
1984         }
1985
1986         static_branch_enable(&tcp_have_smc);
1987         return 0;
1988
1989 out_sock:
1990         sock_unregister(PF_SMC);
1991 out_proto6:
1992         proto_unregister(&smc_proto6);
1993 out_proto:
1994         proto_unregister(&smc_proto);
1995 out_pnet:
1996         smc_pnet_exit();
1997         return rc;
1998 }
1999
2000 static void __exit smc_exit(void)
2001 {
2002         smc_core_exit();
2003         static_branch_disable(&tcp_have_smc);
2004         smc_ib_unregister_client();
2005         sock_unregister(PF_SMC);
2006         proto_unregister(&smc_proto6);
2007         proto_unregister(&smc_proto);
2008         smc_pnet_exit();
2009 }
2010
2011 module_init(smc_init);
2012 module_exit(smc_exit);
2013
2014 MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
2015 MODULE_DESCRIPTION("smc socket address family");
2016 MODULE_LICENSE("GPL");
2017 MODULE_ALIAS_NETPROTO(PF_SMC);