mptcp: free resources when the port number is mismatched
net/mptcp/subflow.c
1 // SPDX-License-Identifier: GPL-2.0
2 /* Multipath TCP
3  *
4  * Copyright (c) 2017 - 2019, Intel Corporation.
5  */
6
7 #define pr_fmt(fmt) "MPTCP: " fmt
8
9 #include <linux/kernel.h>
10 #include <linux/module.h>
11 #include <linux/netdevice.h>
12 #include <crypto/algapi.h>
13 #include <crypto/sha2.h>
14 #include <net/sock.h>
15 #include <net/inet_common.h>
16 #include <net/inet_hashtables.h>
17 #include <net/protocol.h>
18 #include <net/tcp.h>
19 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
20 #include <net/ip6_route.h>
21 #include <net/transp_v6.h>
22 #endif
23 #include <net/mptcp.h>
24 #include <uapi/linux/mptcp.h>
25 #include "protocol.h"
26 #include "mib.h"
27
28 static void mptcp_subflow_ops_undo_override(struct sock *ssk);
29
30 static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
31                                   enum linux_mptcp_mib_field field)
32 {
33         MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
34 }
35
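/* release the msk reference and the MPTCP token held by this request
 * socket, then hand off to the plain TCP request destructor
 */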
36 static void subflow_req_destructor(struct request_sock *req)
37 {
38         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
39
40         pr_debug("subflow_req=%p", subflow_req);
41
42         if (subflow_req->msk)
43                 sock_put((struct sock *)subflow_req->msk);
44
45         mptcp_token_destroy_request(req);
46         tcp_request_sock_ops.destructor(req);
47 }
48
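/* HMAC-SHA256 over the two 32-bit nonces, keyed with key1/key2; callers
 * truncate the 32-byte digest as needed (e.g. to the 64-bit thmac)
 */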
49 static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
50                                   void *hmac)
51 {
52         u8 msg[8];
53
54         put_unaligned_be32(nonce1, &msg[0]);
55         put_unaligned_be32(nonce2, &msg[4]);
56
57         mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
58 }
59
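/* a new subflow is acceptable only once the MPTCP connection is fully
 * established and the path manager still allows additional subflows
 */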
60 static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
61 {
62         return mptcp_is_fully_established((void *)msk) &&
63                READ_ONCE(msk->pm.accept_subflow);
64 }
65
66 /* validate received token and create truncated hmac and nonce for SYN-ACK */
67 static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req)
68 {
69         struct mptcp_sock *msk = subflow_req->msk;
70         u8 hmac[SHA256_DIGEST_SIZE];
71
72         get_random_bytes(&subflow_req->local_nonce, sizeof(u32));
73
74         subflow_generate_hmac(msk->local_key, msk->remote_key,
75                               subflow_req->local_nonce,
76                               subflow_req->remote_nonce, hmac);
77
78         subflow_req->thmac = get_unaligned_be64(hmac);
79 }
80
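/* look up the msk owning the token carried by an MP_JOIN SYN and reserve a
 * local address id for the new subflow; returns NULL on failure
 */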
81 static struct mptcp_sock *subflow_token_join_request(struct request_sock *req)
82 {
83         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
84         struct mptcp_sock *msk;
85         int local_id;
86
87         msk = mptcp_token_get_sock(subflow_req->token);
88         if (!msk) {
89                 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
90                 return NULL;
91         }
92
93         local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
94         if (local_id < 0) {
95                 sock_put((struct sock *)msk);
96                 return NULL;
97         }
98         subflow_req->local_id = local_id;
99
100         return msk;
101 }
102
103 static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
104 {
105         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
106
107         subflow_req->mp_capable = 0;
108         subflow_req->mp_join = 0;
109         subflow_req->msk = NULL;
110         mptcp_token_init_request(req);
111 }
112
113 static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk)
114 {
115         return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport;
116 }
117
118 /* Init mptcp request socket.
119  *
120  * Returns an error code if a JOIN has failed and a TCP reset
121  * should be sent.
122  */
123 static int subflow_check_req(struct request_sock *req,
124                              const struct sock *sk_listener,
125                              struct sk_buff *skb)
126 {
127         struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
128         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
129         struct mptcp_options_received mp_opt;
130
131         pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
132
133 #ifdef CONFIG_TCP_MD5SIG
134         /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
135          * TCP option space.
136          */
137         if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
138                 return -EINVAL;
139 #endif
140
141         mptcp_get_options(skb, &mp_opt);
142
143         if (mp_opt.mp_capable) {
144                 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
145
146                 if (mp_opt.mp_join)
147                         return 0;
148         } else if (mp_opt.mp_join) {
149                 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
150         }
151
152         if (mp_opt.mp_capable && listener->request_mptcp) {
153                 int err, retries = 4;
154
155                 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
156 again:
157                 do {
158                         get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key));
159                 } while (subflow_req->local_key == 0);
160
161                 if (unlikely(req->syncookie)) {
162                         mptcp_crypto_key_sha(subflow_req->local_key,
163                                              &subflow_req->token,
164                                              &subflow_req->idsn);
165                         if (mptcp_token_exists(subflow_req->token)) {
166                                 if (retries-- > 0)
167                                         goto again;
168                         } else {
169                                 subflow_req->mp_capable = 1;
170                         }
171                         return 0;
172                 }
173
174                 err = mptcp_token_new_request(req);
175                 if (err == 0)
176                         subflow_req->mp_capable = 1;
177                 else if (retries-- > 0)
178                         goto again;
179
180         } else if (mp_opt.mp_join && listener->request_mptcp) {
181                 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
182                 subflow_req->mp_join = 1;
183                 subflow_req->backup = mp_opt.backup;
184                 subflow_req->remote_id = mp_opt.join_id;
185                 subflow_req->token = mp_opt.token;
186                 subflow_req->remote_nonce = mp_opt.nonce;
187                 subflow_req->msk = subflow_token_join_request(req);
188
189                 /* Can't fall back to TCP in this case. */
190                 if (!subflow_req->msk)
191                         return -EPERM;
192
193                 if (subflow_use_different_sport(subflow_req->msk, sk_listener)) {
194                         pr_debug("syn inet_sport=%d %d",
195                                  ntohs(inet_sk(sk_listener)->inet_sport),
196                                  ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport));
197                         if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) {
198                                 sock_put((struct sock *)subflow_req->msk);
199                                 mptcp_token_destroy_request(req);
200                                 tcp_request_sock_ops.destructor(req);
201                                 subflow_req->msk = NULL;
202                                 subflow_req->mp_join = 0;
203                                 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX);
204                                 return -EPERM;
205                         }
206                         SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX);
207                 }
208
209                 subflow_req_create_thmac(subflow_req);
210
211                 if (unlikely(req->syncookie)) {
212                         if (mptcp_can_accept_new_subflow(subflow_req->msk))
213                                 subflow_init_req_cookie_join_save(subflow_req, skb);
214                 }
215
216                 pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
217                          subflow_req->remote_nonce, subflow_req->msk);
218         }
219
220         return 0;
221 }
222
223 int mptcp_subflow_init_cookie_req(struct request_sock *req,
224                                   const struct sock *sk_listener,
225                                   struct sk_buff *skb)
226 {
227         struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
228         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
229         struct mptcp_options_received mp_opt;
230         int err;
231
232         subflow_init_req(req, sk_listener);
233         mptcp_get_options(skb, &mp_opt);
234
235         if (mp_opt.mp_capable && mp_opt.mp_join)
236                 return -EINVAL;
237
238         if (mp_opt.mp_capable && listener->request_mptcp) {
239                 if (mp_opt.sndr_key == 0)
240                         return -EINVAL;
241
242                 subflow_req->local_key = mp_opt.rcvr_key;
243                 err = mptcp_token_new_request(req);
244                 if (err)
245                         return err;
246
247                 subflow_req->mp_capable = 1;
248                 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
249         } else if (mp_opt.mp_join && listener->request_mptcp) {
250                 if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
251                         return -EINVAL;
252
253                 if (mptcp_can_accept_new_subflow(subflow_req->msk))
254                         subflow_req->mp_join = 1;
255
256                 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
257         }
258
259         return 0;
260 }
261 EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);
262
263 static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
264                                               struct sk_buff *skb,
265                                               struct flowi *fl,
266                                               struct request_sock *req)
267 {
268         struct dst_entry *dst;
269         int err;
270
271         tcp_rsk(req)->is_mptcp = 1;
272         subflow_init_req(req, sk);
273
274         dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req);
275         if (!dst)
276                 return NULL;
277
278         err = subflow_check_req(req, sk, skb);
279         if (err == 0)
280                 return dst;
281
282         dst_release(dst);
283         if (!req->syncookie)
284                 tcp_request_sock_ops.send_reset(sk, skb);
285         return NULL;
286 }
287
288 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
289 static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
290                                               struct sk_buff *skb,
291                                               struct flowi *fl,
292                                               struct request_sock *req)
293 {
294         struct dst_entry *dst;
295         int err;
296
297         tcp_rsk(req)->is_mptcp = 1;
298         subflow_init_req(req, sk);
299
300         dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req);
301         if (!dst)
302                 return NULL;
303
304         err = subflow_check_req(req, sk, skb);
305         if (err == 0)
306                 return dst;
307
308         dst_release(dst);
309         if (!req->syncookie)
310                 tcp6_request_sock_ops.send_reset(sk, skb);
311         return NULL;
312 }
313 #endif
314
315 /* validate received truncated hmac and create hmac for third ACK */
316 static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
317 {
318         u8 hmac[SHA256_DIGEST_SIZE];
319         u64 thmac;
320
321         subflow_generate_hmac(subflow->remote_key, subflow->local_key,
322                               subflow->remote_nonce, subflow->local_nonce,
323                               hmac);
324
325         thmac = get_unaligned_be64(hmac);
326         pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
327                  subflow, subflow->token,
328                  (unsigned long long)thmac,
329                  (unsigned long long)subflow->thmac);
330
331         return thmac == subflow->thmac;
332 }
333
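/* abort the subflow with a TCP reset; the msk worker, when scheduled, is in
 * charge of releasing the parent socket reference taken here
 */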
334 void mptcp_subflow_reset(struct sock *ssk)
335 {
336         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
337         struct sock *sk = subflow->conn;
338
339         /* must hold: tcp_done() could drop last reference on parent */
340         sock_hold(sk);
341
342         tcp_set_state(ssk, TCP_CLOSE);
343         tcp_send_active_reset(ssk, GFP_ATOMIC);
344         tcp_done(ssk);
345         if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
346             schedule_work(&mptcp_sk(sk)->work))
347                 return; /* worker will put sk for us */
348
349         sock_put(sk);
350 }
351
352 static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk)
353 {
354         return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport;
355 }
356
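/* SYN-ACK processing for actively opened subflows: complete the MP_CAPABLE
 * or MP_JOIN handshake, or fall back to plain TCP
 */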
357 static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
358 {
359         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
360         struct mptcp_options_received mp_opt;
361         struct sock *parent = subflow->conn;
362
363         subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
364
365         if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
366                 inet_sk_state_store(parent, TCP_ESTABLISHED);
367                 parent->sk_state_change(parent);
368         }
369
370         /* be sure no special action on any packet other than syn-ack */
371         if (subflow->conn_finished)
372                 return;
373
374         mptcp_propagate_sndbuf(parent, sk);
375         subflow->rel_write_seq = 1;
376         subflow->conn_finished = 1;
377         subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
378         pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
379
380         mptcp_get_options(skb, &mp_opt);
381         if (subflow->request_mptcp) {
382                 if (!mp_opt.mp_capable) {
383                         MPTCP_INC_STATS(sock_net(sk),
384                                         MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
385                         mptcp_do_fallback(sk);
386                         pr_fallback(mptcp_sk(subflow->conn));
387                         goto fallback;
388                 }
389
390                 subflow->mp_capable = 1;
391                 subflow->can_ack = 1;
392                 subflow->remote_key = mp_opt.sndr_key;
393                 pr_debug("subflow=%p, remote_key=%llu", subflow,
394                          subflow->remote_key);
395                 mptcp_finish_connect(sk);
396         } else if (subflow->request_join) {
397                 u8 hmac[SHA256_DIGEST_SIZE];
398
399                 if (!mp_opt.mp_join)
400                         goto do_reset;
401
402                 subflow->thmac = mp_opt.thmac;
403                 subflow->remote_nonce = mp_opt.nonce;
404                 pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
405                          subflow->thmac, subflow->remote_nonce);
406
407                 if (!subflow_thmac_valid(subflow)) {
408                         MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
409                         goto do_reset;
410                 }
411
412                 subflow_generate_hmac(subflow->local_key, subflow->remote_key,
413                                       subflow->local_nonce,
414                                       subflow->remote_nonce,
415                                       hmac);
416                 memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);
417
418                 if (!mptcp_finish_join(sk))
419                         goto do_reset;
420
421                 subflow->mp_join = 1;
422                 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
423
424                 if (subflow_use_different_dport(mptcp_sk(parent), sk)) {
425                         pr_debug("synack inet_dport=%d %d",
426                                  ntohs(inet_sk(sk)->inet_dport),
427                                  ntohs(inet_sk(parent)->inet_dport));
428                         MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX);
429                 }
430         } else if (mptcp_check_fallback(sk)) {
431 fallback:
432                 mptcp_rcv_space_init(mptcp_sk(parent), sk);
433         }
434         return;
435
436 do_reset:
437         mptcp_subflow_reset(sk);
438 }
439
440 struct request_sock_ops mptcp_subflow_request_sock_ops;
441 EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops);
442 static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;
443
444 static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
445 {
446         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
447
448         pr_debug("subflow=%p", subflow);
449
450         /* Never answer to SYNs sent to broadcast or multicast */
451         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
452                 goto drop;
453
454         return tcp_conn_request(&mptcp_subflow_request_sock_ops,
455                                 &subflow_request_sock_ipv4_ops,
456                                 sk, skb);
457 drop:
458         tcp_listendrop(sk);
459         return 0;
460 }
461
462 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
463 static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
464 static struct inet_connection_sock_af_ops subflow_v6_specific;
465 static struct inet_connection_sock_af_ops subflow_v6m_specific;
466 static struct proto tcpv6_prot_override;
467
468 static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
469 {
470         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
471
472         pr_debug("subflow=%p", subflow);
473
474         if (skb->protocol == htons(ETH_P_IP))
475                 return subflow_v4_conn_request(sk, skb);
476
477         if (!ipv6_unicast_destination(skb))
478                 goto drop;
479
480         return tcp_conn_request(&mptcp_subflow_request_sock_ops,
481                                 &subflow_request_sock_ipv6_ops, sk, skb);
482
483 drop:
484         tcp_listendrop(sk);
485         return 0; /* don't send reset */
486 }
487 #endif
488
489 /* validate hmac received in third ACK */
490 static bool subflow_hmac_valid(const struct request_sock *req,
491                                const struct mptcp_options_received *mp_opt)
492 {
493         const struct mptcp_subflow_request_sock *subflow_req;
494         u8 hmac[SHA256_DIGEST_SIZE];
495         struct mptcp_sock *msk;
496
497         subflow_req = mptcp_subflow_rsk(req);
498         msk = subflow_req->msk;
499         if (!msk)
500                 return false;
501
502         subflow_generate_hmac(msk->remote_key, msk->local_key,
503                               subflow_req->remote_nonce,
504                               subflow_req->local_nonce, hmac);
505
506         return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
507 }
508
509 static void mptcp_sock_destruct(struct sock *sk)
510 {
511         /* if the new mptcp socket isn't accepted, it is freed
512          * from the tcp listener socket's request queue, linked
513          * from req->sk.  The tcp socket is released.
514          * This calls the ULP release function which will
515          * also remove the mptcp socket, via
516          * sock_put(ctx->conn).
517          *
518          * Problem is that the mptcp socket will be in
519          * ESTABLISHED state and will not have the SOCK_DEAD flag.
520          * Both result in warnings from inet_sock_destruct.
521          */
522
523         if (sk->sk_state == TCP_ESTABLISHED) {
524                 sk->sk_state = TCP_CLOSE;
525                 WARN_ON_ONCE(sk->sk_socket);
526                 sock_orphan(sk);
527         }
528
529         mptcp_destroy_common(mptcp_sk(sk));
530         inet_sock_destruct(sk);
531 }
532
533 static void mptcp_force_close(struct sock *sk)
534 {
535         inet_sk_state_store(sk, TCP_CLOSE);
536         sk_common_release(sk);
537 }
538
539 static void subflow_ulp_fallback(struct sock *sk,
540                                  struct mptcp_subflow_context *old_ctx)
541 {
542         struct inet_connection_sock *icsk = inet_csk(sk);
543
544         mptcp_subflow_tcp_fallback(sk, old_ctx);
545         icsk->icsk_ulp_ops = NULL;
546         rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
547         tcp_sk(sk)->is_mptcp = 0;
548
549         mptcp_subflow_ops_undo_override(sk);
550 }
551
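/* detach and free the subflow context, reverting the socket to plain TCP */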
552 static void subflow_drop_ctx(struct sock *ssk)
553 {
554         struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
555
556         if (!ctx)
557                 return;
558
559         subflow_ulp_fallback(ssk, ctx);
560         if (ctx->conn)
561                 sock_put(ctx->conn);
562
563         kfree_rcu(ctx, rcu);
564 }
565
566 void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
567                                      struct mptcp_options_received *mp_opt)
568 {
569         struct mptcp_sock *msk = mptcp_sk(subflow->conn);
570
571         subflow->remote_key = mp_opt->sndr_key;
572         subflow->fully_established = 1;
573         subflow->can_ack = 1;
574         WRITE_ONCE(msk->fully_established, true);
575 }
576
577 static struct sock *subflow_syn_recv_sock(const struct sock *sk,
578                                           struct sk_buff *skb,
579                                           struct request_sock *req,
580                                           struct dst_entry *dst,
581                                           struct request_sock *req_unhash,
582                                           bool *own_req)
583 {
584         struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
585         struct mptcp_subflow_request_sock *subflow_req;
586         struct mptcp_options_received mp_opt;
587         bool fallback, fallback_is_fatal;
588         struct sock *new_msk = NULL;
589         struct sock *child;
590
591         pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
592
593         /* After child creation we must look for 'mp_capable' even when options
594          * are not parsed
595          */
596         mp_opt.mp_capable = 0;
597
598         /* hopefully temporary handling for MP_JOIN+syncookie */
599         subflow_req = mptcp_subflow_rsk(req);
600         fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join;
601         fallback = !tcp_rsk(req)->is_mptcp;
602         if (fallback)
603                 goto create_child;
604
605         /* if the sk is MP_CAPABLE, we try to fetch the client key */
606         if (subflow_req->mp_capable) {
607                 if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
608                         /* here we can receive and accept an in-window,
609                          * out-of-order pkt, which will not carry the MP_CAPABLE
610                          * opt even on mptcp-enabled paths
611                          */
612                         goto create_msk;
613                 }
614
615                 mptcp_get_options(skb, &mp_opt);
616                 if (!mp_opt.mp_capable) {
617                         fallback = true;
618                         goto create_child;
619                 }
620
621 create_msk:
622                 new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);
623                 if (!new_msk)
624                         fallback = true;
625         } else if (subflow_req->mp_join) {
626                 mptcp_get_options(skb, &mp_opt);
627                 if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt) ||
628                     !mptcp_can_accept_new_subflow(subflow_req->msk)) {
629                         SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
630                         fallback = true;
631                 }
632         }
633
634 create_child:
635         child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
636                                                      req_unhash, own_req);
637
638         if (child && *own_req) {
639                 struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);
640
641                 tcp_rsk(req)->drop_req = false;
642
643                 /* we need to fall back on ctx allocation failure and on the
644                  * prerequisite checks above. In the latter scenario we additionally
645                  * need to reset the context to non-MPTCP status.
646                  */
647                 if (!ctx || fallback) {
648                         if (fallback_is_fatal)
649                                 goto dispose_child;
650
651                         subflow_drop_ctx(child);
652                         goto out;
653                 }
654
655                 if (ctx->mp_capable) {
656                         /* this can't race with mptcp_close(), as the msk is
657                          * not yet exposed to user-space
658                          */
659                         inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED);
660
661                         /* record the newly created socket as the first msk
662                          * subflow, but don't link it yet into conn_list
663                          */
664                         WRITE_ONCE(mptcp_sk(new_msk)->first, child);
665
666                         /* new mpc subflow takes ownership of the newly
667                          * created mptcp socket
668                          */
669                         new_msk->sk_destruct = mptcp_sock_destruct;
670                         mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1);
671                         mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
672                         ctx->conn = new_msk;
673                         new_msk = NULL;
674
675                         /* with OoO packets we can reach here without ingress
676                          * mpc option
677                          */
678                         if (mp_opt.mp_capable)
679                                 mptcp_subflow_fully_established(ctx, &mp_opt);
680                 } else if (ctx->mp_join) {
681                         struct mptcp_sock *owner;
682
683                         owner = subflow_req->msk;
684                         if (!owner)
685                                 goto dispose_child;
686
687                         /* move the msk reference ownership to the subflow */
688                         subflow_req->msk = NULL;
689                         ctx->conn = (struct sock *)owner;
690
691                         if (subflow_use_different_sport(owner, sk)) {
692                                 pr_debug("ack inet_sport=%d %d",
693                                          ntohs(inet_sk(sk)->inet_sport),
694                                          ntohs(inet_sk((struct sock *)owner)->inet_sport));
695                                 if (!mptcp_pm_sport_in_anno_list(owner, sk)) {
696                                         SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX);
697                                         goto dispose_child;
698                                 }
699                                 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX);
700                         }
701
702                         if (!mptcp_finish_join(child))
703                                 goto dispose_child;
704
705                         SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
706                         tcp_rsk(req)->drop_req = true;
707                 }
708         }
709
710 out:
711         /* dispose of the left over mptcp master, if any */
712         if (unlikely(new_msk))
713                 mptcp_force_close(new_msk);
714
715         /* check for expected invariant - should never trigger, just helps
716          * catching earlier subtle bugs
717          */
718         WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
719                      (!mptcp_subflow_ctx(child) ||
720                       !mptcp_subflow_ctx(child)->conn));
721         return child;
722
723 dispose_child:
724         subflow_drop_ctx(child);
725         tcp_rsk(req)->drop_req = true;
726         inet_csk_prepare_for_destroy_sock(child);
727         tcp_done(child);
728         req->rsk_ops->send_reset(sk, skb);
729
730         /* The last child reference will be released by the caller */
731         return child;
732 }
733
734 static struct inet_connection_sock_af_ops subflow_specific;
735 static struct proto tcp_prot_override;
736
737 enum mapping_status {
738         MAPPING_OK,
739         MAPPING_INVALID,
740         MAPPING_EMPTY,
741         MAPPING_DATA_FIN,
742         MAPPING_DUMMY
743 };
744
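/* expand a 32-bit DSS sequence number to 64 bits, assuming the new map
 * follows the one previously announced on this subflow
 */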
745 static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
746 {
747         if ((u32)seq == (u32)old_seq)
748                 return old_seq;
749
750         /* Assume map covers data not mapped yet. */
751         return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
752 }
753
754 static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
755 {
756         WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
757                   ssn, subflow->map_subflow_seq, subflow->map_data_len);
758 }
759
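/* true if all the not yet consumed data in @skb fits inside the currently
 * installed mapping
 */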
760 static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
761 {
762         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
763         unsigned int skb_consumed;
764
765         skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
766         if (WARN_ON_ONCE(skb_consumed >= skb->len))
767                 return true;
768
769         return skb->len - skb_consumed <= subflow->map_data_len -
770                                           mptcp_subflow_get_map_offset(subflow);
771 }
772
773 static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
774 {
775         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
776         u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
777
778         if (unlikely(before(ssn, subflow->map_subflow_seq))) {
779                 /* Mapping covers data later in the subflow stream,
780                  * currently unsupported.
781                  */
782                 warn_bad_map(subflow, ssn);
783                 return false;
784         }
785         if (unlikely(!before(ssn, subflow->map_subflow_seq +
786                                   subflow->map_data_len))) {
787                 /* Mapping only covers past subflow data, invalid */
788                 warn_bad_map(subflow, ssn + skb->len);
789                 return false;
790         }
791         return true;
792 }
793
794 static enum mapping_status get_mapping_status(struct sock *ssk,
795                                               struct mptcp_sock *msk)
796 {
797         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
798         struct mptcp_ext *mpext;
799         struct sk_buff *skb;
800         u16 data_len;
801         u64 map_seq;
802
803         skb = skb_peek(&ssk->sk_receive_queue);
804         if (!skb)
805                 return MAPPING_EMPTY;
806
807         if (mptcp_check_fallback(ssk))
808                 return MAPPING_DUMMY;
809
810         mpext = mptcp_get_ext(skb);
811         if (!mpext || !mpext->use_map) {
812                 if (!subflow->map_valid && !skb->len) {
813                         /* the TCP stack delivers 0-len FIN pkts to the receive
814                          * queue; those are the only 0-len pkts ever expected here,
815                          * and a missing mapping is admissible only for them
816                          */
817                         if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
818                                 WARN_ONCE(1, "0len seq %d:%d flags %x",
819                                           TCP_SKB_CB(skb)->seq,
820                                           TCP_SKB_CB(skb)->end_seq,
821                                           TCP_SKB_CB(skb)->tcp_flags);
822                         sk_eat_skb(ssk, skb);
823                         return MAPPING_EMPTY;
824                 }
825
826                 if (!subflow->map_valid)
827                         return MAPPING_INVALID;
828
829                 goto validate_seq;
830         }
831
832         pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
833                  mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
834                  mpext->data_len, mpext->data_fin);
835
836         data_len = mpext->data_len;
837         if (data_len == 0) {
838                 pr_err("Infinite mapping not handled");
839                 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
840                 return MAPPING_INVALID;
841         }
842
843         if (mpext->data_fin == 1) {
844                 if (data_len == 1) {
845                         bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq,
846                                                                  mpext->dsn64);
847                         pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
848                         if (subflow->map_valid) {
849                                 /* A DATA_FIN might arrive in a DSS
850                                  * option before the previous mapping
851                                  * has been fully consumed. Continue
852                                  * handling the existing mapping.
853                                  */
854                                 skb_ext_del(skb, SKB_EXT_MPTCP);
855                                 return MAPPING_OK;
856                         } else {
857                                 if (updated && schedule_work(&msk->work))
858                                         sock_hold((struct sock *)msk);
859
860                                 return MAPPING_DATA_FIN;
861                         }
862                 } else {
863                         u64 data_fin_seq = mpext->data_seq + data_len - 1;
864
865                         /* If mpext->data_seq is a 32-bit value, data_fin_seq
866                          * must also be limited to 32 bits.
867                          */
868                         if (!mpext->dsn64)
869                                 data_fin_seq &= GENMASK_ULL(31, 0);
870
871                         mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
872                         pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d",
873                                  data_fin_seq, mpext->dsn64);
874                 }
875
876                 /* Adjust for DATA_FIN using 1 byte of sequence space */
877                 data_len--;
878         }
879
880         if (!mpext->dsn64) {
881                 map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
882                                      mpext->data_seq);
883                 pr_debug("expanded seq=%llu", subflow->map_seq);
884         } else {
885                 map_seq = mpext->data_seq;
886         }
887         WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64);
888
889         if (subflow->map_valid) {
890                 /* Allow replacing only with an identical map */
891                 if (subflow->map_seq == map_seq &&
892                     subflow->map_subflow_seq == mpext->subflow_seq &&
893                     subflow->map_data_len == data_len) {
894                         skb_ext_del(skb, SKB_EXT_MPTCP);
895                         return MAPPING_OK;
896                 }
897
898                 /* If this skb's data is fully covered by the current mapping,
899                  * the new map would need caching, which is not supported
900                  */
901                 if (skb_is_fully_mapped(ssk, skb)) {
902                         MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
903                         return MAPPING_INVALID;
904                 }
905
906                 /* will validate the next map after consuming the current one */
907                 return MAPPING_OK;
908         }
909
910         subflow->map_seq = map_seq;
911         subflow->map_subflow_seq = mpext->subflow_seq;
912         subflow->map_data_len = data_len;
913         subflow->map_valid = 1;
914         subflow->mpc_map = mpext->mpc_map;
915         pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
916                  subflow->map_seq, subflow->map_subflow_seq,
917                  subflow->map_data_len);
918
919 validate_seq:
920         /* we revalidate the existing mapping on each new skb, because we must
921          * ensure the current skb is completely covered by the available mapping
922          */
923         if (!validate_mapping(ssk, skb))
924                 return MAPPING_INVALID;
925
926         skb_ext_del(skb, SKB_EXT_MPTCP);
927         return MAPPING_OK;
928 }
929
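/* drop data covered by an old mapping (spurious retransmission): advance
 * copied_seq past the duplicate bytes and free the skb once fully consumed
 */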
930 static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
931                                        u64 limit)
932 {
933         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
934         bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
935         u32 incr;
936
937         incr = limit >= skb->len ? skb->len + fin : limit;
938
939         pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
940                  subflow->map_subflow_seq);
941         MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
942         tcp_sk(ssk)->copied_seq += incr;
943         if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
944                 sk_eat_skb(ssk, skb);
945         if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
946                 subflow->map_valid = 0;
947 }
948
949 /* sched mptcp worker to remove the subflow if no more data is pending */
950 static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk)
951 {
952         struct sock *sk = (struct sock *)msk;
953
954         if (likely(ssk->sk_state != TCP_CLOSE))
955                 return;
956
957         if (skb_queue_empty(&ssk->sk_receive_queue) &&
958             !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) {
959                 sock_hold(sk);
960                 if (!schedule_work(&msk->work))
961                         sock_put(sk);
962         }
963 }
964
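/* fetch and validate the next mapping, reporting whether the subflow has
 * in-sequence or out-of-order MPTCP-level data available
 */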
965 static bool subflow_check_data_avail(struct sock *ssk)
966 {
967         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
968         enum mapping_status status;
969         struct mptcp_sock *msk;
970         struct sk_buff *skb;
971
972         pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
973                  subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
974         if (!skb_peek(&ssk->sk_receive_queue))
975                 subflow->data_avail = 0;
976         if (subflow->data_avail)
977                 return true;
978
979         msk = mptcp_sk(subflow->conn);
980         for (;;) {
981                 u64 ack_seq;
982                 u64 old_ack;
983
984                 status = get_mapping_status(ssk, msk);
985                 pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
986                 if (status == MAPPING_INVALID) {
987                         ssk->sk_err = EBADMSG;
988                         goto fatal;
989                 }
990                 if (status == MAPPING_DUMMY) {
991                         __mptcp_do_fallback(msk);
992                         skb = skb_peek(&ssk->sk_receive_queue);
993                         subflow->map_valid = 1;
994                         subflow->map_seq = READ_ONCE(msk->ack_seq);
995                         subflow->map_data_len = skb->len;
996                         subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
997                                                    subflow->ssn_offset;
998                         subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
999                         return true;
1000                 }
1001
1002                 if (status != MAPPING_OK)
1003                         goto no_data;
1004
1005                 skb = skb_peek(&ssk->sk_receive_queue);
1006                 if (WARN_ON_ONCE(!skb))
1007                         goto no_data;
1008
1009                 /* if msk lacks the remote key, this subflow must provide an
1010                  * MP_CAPABLE-based mapping
1011                  */
1012                 if (unlikely(!READ_ONCE(msk->can_ack))) {
1013                         if (!subflow->mpc_map) {
1014                                 ssk->sk_err = EBADMSG;
1015                                 goto fatal;
1016                         }
1017                         WRITE_ONCE(msk->remote_key, subflow->remote_key);
1018                         WRITE_ONCE(msk->ack_seq, subflow->map_seq);
1019                         WRITE_ONCE(msk->can_ack, true);
1020                 }
1021
1022                 old_ack = READ_ONCE(msk->ack_seq);
1023                 ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
1024                 pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
1025                          ack_seq);
1026                 if (ack_seq == old_ack) {
1027                         subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
1028                         break;
1029                 } else if (after64(ack_seq, old_ack)) {
1030                         subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA;
1031                         break;
1032                 }
1033
1034                 /* only accept in-sequence mappings. Old values are spurious
1035                  * retransmissions
1036                  */
1037                 mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
1038         }
1039         return true;
1040
1041 no_data:
1042         subflow_sched_work_if_closed(msk, ssk);
1043         return false;
1044 fatal:
1045         /* fatal protocol error, close the socket */
1046         /* This barrier is coupled with smp_rmb() in tcp_poll() */
1047         smp_wmb();
1048         ssk->sk_error_report(ssk);
1049         tcp_set_state(ssk, TCP_CLOSE);
1050         tcp_send_active_reset(ssk, GFP_ATOMIC);
1051         subflow->data_avail = 0;
1052         return false;
1053 }
1054
1055 bool mptcp_subflow_data_available(struct sock *sk)
1056 {
1057         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
1058
1059         /* check if current mapping is still valid */
1060         if (subflow->map_valid &&
1061             mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
1062                 subflow->map_valid = 0;
1063                 subflow->data_avail = 0;
1064
1065                 pr_debug("Done with mapping: seq=%u data_len=%u",
1066                          subflow->map_subflow_seq,
1067                          subflow->map_data_len);
1068         }
1069
1070         return subflow_check_data_avail(sk);
1071 }
1072
1073 /* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
1074  * not the ssk one.
1075  *
1076  * In mptcp, rwin is about the mptcp-level connection data.
1077  *
1078  * Data that is still on the ssk rx queue can thus be ignored,
1079  * as far as the mptcp peer is concerned, that data is still in flight.
1080  * DSS ACK is updated when skb is moved to the mptcp rx queue.
1081  */
1082 void mptcp_space(const struct sock *ssk, int *space, int *full_space)
1083 {
1084         const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
1085         const struct sock *sk = subflow->conn;
1086
1087         *space = __mptcp_space(sk);
1088         *full_space = tcp_full_space(sk);
1089 }
1090
1091 static void subflow_data_ready(struct sock *sk)
1092 {
1093         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
1094         u16 state = 1 << inet_sk_state_load(sk);
1095         struct sock *parent = subflow->conn;
1096         struct mptcp_sock *msk;
1097
1098         msk = mptcp_sk(parent);
1099         if (state & TCPF_LISTEN) {
1100                 /* MPJ subflows are removed from the accept queue before reaching
1101                  * here; avoid stray wakeups
1102                  */
1103                 if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue))
1104                         return;
1105
1106                 set_bit(MPTCP_DATA_READY, &msk->flags);
1107                 parent->sk_data_ready(parent);
1108                 return;
1109         }
1110
1111         WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
1112                      !subflow->mp_join && !(state & TCPF_CLOSE));
1113
1114         if (mptcp_subflow_data_available(sk))
1115                 mptcp_data_ready(parent, sk);
1116 }
1117
1118 static void subflow_write_space(struct sock *ssk)
1119 {
1120         struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
1121
1122         mptcp_propagate_sndbuf(sk, ssk);
1123         mptcp_write_space(sk);
1124 }
1125
1126 void __mptcp_error_report(struct sock *sk)
1127 {
1128         struct mptcp_subflow_context *subflow;
1129         struct mptcp_sock *msk = mptcp_sk(sk);
1130
1131         mptcp_for_each_subflow(msk, subflow) {
1132                 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
1133                 int err = sock_error(ssk);
1134
1135                 if (!err)
1136                         continue;
1137
1138                 /* only propagate errors on fallen-back sockets or
1139                  * on MPC connect
1140                  */
1141                 if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk))
1142                         continue;
1143
1144                 inet_sk_state_store(sk, inet_sk_state_load(ssk));
1145                 sk->sk_err = -err;
1146
1147                 /* This barrier is coupled with smp_rmb() in mptcp_poll() */
1148                 smp_wmb();
1149                 sk->sk_error_report(sk);
1150                 break;
1151         }
1152 }
1153
1154 static void subflow_error_report(struct sock *ssk)
1155 {
1156         struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
1157
1158         mptcp_data_lock(sk);
1159         if (!sock_owned_by_user(sk))
1160                 __mptcp_error_report(sk);
1161         else
1162                 set_bit(MPTCP_ERROR_REPORT,  &mptcp_sk(sk)->flags);
1163         mptcp_data_unlock(sk);
1164 }
1165
1166 static struct inet_connection_sock_af_ops *
1167 subflow_default_af_ops(struct sock *sk)
1168 {
1169 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1170         if (sk->sk_family == AF_INET6)
1171                 return &subflow_v6_specific;
1172 #endif
1173         return &subflow_specific;
1174 }
1175
1176 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1177 void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
1178 {
1179         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
1180         struct inet_connection_sock *icsk = inet_csk(sk);
1181         struct inet_connection_sock_af_ops *target;
1182
1183         target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);
1184
1185         pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
1186                  subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);
1187
1188         if (likely(icsk->icsk_af_ops == target))
1189                 return;
1190
1191         subflow->icsk_af_ops = icsk->icsk_af_ops;
1192         icsk->icsk_af_ops = target;
1193 }
1194 #endif
1195
1196 void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
1197                          struct sockaddr_storage *addr,
1198                          unsigned short family)
1199 {
1200         memset(addr, 0, sizeof(*addr));
1201         addr->ss_family = family;
1202         if (addr->ss_family == AF_INET) {
1203                 struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;
1204
1205                 if (info->family == AF_INET)
1206                         in_addr->sin_addr = info->addr;
1207 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1208                 else if (ipv6_addr_v4mapped(&info->addr6))
1209                         in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3];
1210 #endif
1211                 in_addr->sin_port = info->port;
1212         }
1213 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1214         else if (addr->ss_family == AF_INET6) {
1215                 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;
1216
1217                 if (info->family == AF_INET)
1218                         ipv6_addr_set_v4mapped(info->addr.s_addr,
1219                                                &in6_addr->sin6_addr);
1220                 else
1221                         in6_addr->sin6_addr = info->addr6;
1222                 in6_addr->sin6_port = info->port;
1223         }
1224 #endif
1225 }
1226
1227 int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
1228                             const struct mptcp_addr_info *remote)
1229 {
1230         struct mptcp_sock *msk = mptcp_sk(sk);
1231         struct mptcp_subflow_context *subflow;
1232         struct sockaddr_storage addr;
1233         int remote_id = remote->id;
1234         int local_id = loc->id;
1235         struct socket *sf;
1236         struct sock *ssk;
1237         u32 remote_token;
1238         int addrlen;
1239         int err;
1240
1241         if (!mptcp_is_fully_established(sk))
1242                 return -ENOTCONN;
1243
1244         err = mptcp_subflow_create_socket(sk, &sf);
1245         if (err)
1246                 return err;
1247
1248         ssk = sf->sk;
1249         subflow = mptcp_subflow_ctx(ssk);
1250         do {
1251                 get_random_bytes(&subflow->local_nonce, sizeof(u32));
1252         } while (!subflow->local_nonce);
1253
1254         if (!local_id) {
1255                 err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk);
1256                 if (err < 0)
1257                         goto failed;
1258
1259                 local_id = err;
1260         }
1261
1262         subflow->remote_key = msk->remote_key;
1263         subflow->local_key = msk->local_key;
1264         subflow->token = msk->token;
1265         mptcp_info2sockaddr(loc, &addr, ssk->sk_family);
1266
1267         addrlen = sizeof(struct sockaddr_in);
1268 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1269         if (addr.ss_family == AF_INET6)
1270                 addrlen = sizeof(struct sockaddr_in6);
1271 #endif
1272         ssk->sk_bound_dev_if = loc->ifindex;
1273         err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
1274         if (err)
1275                 goto failed;
1276
1277         mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
1278         pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk,
1279                  remote_token, local_id, remote_id);
1280         subflow->remote_token = remote_token;
1281         subflow->local_id = local_id;
1282         subflow->remote_id = remote_id;
1283         subflow->request_join = 1;
1284         subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
1285         mptcp_info2sockaddr(remote, &addr, ssk->sk_family);
1286
1287         mptcp_add_pending_subflow(msk, subflow);
1288         err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
1289         if (err && err != -EINPROGRESS)
1290                 goto failed_unlink;
1291
1292         /* discard the subflow socket */
1293         mptcp_sock_graft(ssk, sk->sk_socket);
1294         iput(SOCK_INODE(sf));
1295         return err;
1296
1297 failed_unlink:
1298         spin_lock_bh(&msk->join_list_lock);
1299         list_del(&subflow->node);
1300         spin_unlock_bh(&msk->join_list_lock);
1301         sock_put(mptcp_subflow_tcp_sock(subflow));
1302
1303 failed:
1304         subflow->disposable = 1;
1305         sock_release(sf);
1306         return err;
1307 }
1308
1309 static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
1310 {
1311 #ifdef CONFIG_SOCK_CGROUP_DATA
1312         struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data,
1313                                 *child_skcd = &child->sk_cgrp_data;
1314
1315         /* only the additional subflows created by kworkers have to be modified */
1316         if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
1317             cgroup_id(sock_cgroup_ptr(child_skcd))) {
1318 #ifdef CONFIG_MEMCG
1319                 struct mem_cgroup *memcg = parent->sk_memcg;
1320
1321                 mem_cgroup_sk_free(child);
1322                 if (memcg && css_tryget(&memcg->css))
1323                         child->sk_memcg = memcg;
1324 #endif /* CONFIG_MEMCG */
1325
1326                 cgroup_sk_free(child_skcd);
1327                 *child_skcd = *parent_skcd;
1328                 cgroup_sk_clone(child_skcd);
1329         }
1330 #endif /* CONFIG_SOCK_CGROUP_DATA */
1331 }
1332
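/* point the subflow at the MPTCP proto overrides; undone by
 * mptcp_subflow_ops_undo_override()
 */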
1333 static void mptcp_subflow_ops_override(struct sock *ssk)
1334 {
1335 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1336         if (ssk->sk_prot == &tcpv6_prot)
1337                 ssk->sk_prot = &tcpv6_prot_override;
1338         else
1339 #endif
1340                 ssk->sk_prot = &tcp_prot_override;
1341 }
1342
1343 static void mptcp_subflow_ops_undo_override(struct sock *ssk)
1344 {
1345 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1346         if (ssk->sk_prot == &tcpv6_prot_override)
1347                 ssk->sk_prot = &tcpv6_prot;
1348         else
1349 #endif
1350                 ssk->sk_prot = &tcp_prot;
1351 }
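
/* create a kernel TCP socket acting as a subflow of @sk: attach the "mptcp"
 * ULP, inherit cgroup and inode identity, and take a reference on @sk
 */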
1352 int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
1353 {
1354         struct mptcp_subflow_context *subflow;
1355         struct net *net = sock_net(sk);
1356         struct socket *sf;
1357         int err;
1358
1359         /* un-accepted server sockets can reach here - on bad configuration
1360          * bail early to avoid greater trouble later
1361          */
1362         if (unlikely(!sk->sk_socket))
1363                 return -EINVAL;
1364
1365         err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
1366                                &sf);
1367         if (err)
1368                 return err;
1369
1370         lock_sock(sf->sk);
1371
1372         /* the newly created socket has to be in the same cgroup as its parent */
1373         mptcp_attach_cgroup(sk, sf->sk);
1374
1375         /* kernel sockets do not by default acquire net ref, but TCP timer
1376          * needs it.
1377          */
1378         sf->sk->sk_net_refcnt = 1;
1379         get_net(net);
1380 #ifdef CONFIG_PROC_FS
1381         this_cpu_add(*net->core.sock_inuse, 1);
1382 #endif
1383         err = tcp_set_ulp(sf->sk, "mptcp");
1384         release_sock(sf->sk);
1385
1386         if (err) {
1387                 sock_release(sf);
1388                 return err;
1389         }
1390
1391         /* the newly created socket really belongs to the owning MPTCP master
1392          * socket, even if for additional subflows the allocation is performed
1393          * by a kernel workqueue. Adjust inode references, so that the
1394          * procfs/diag interfaces really show this one belonging to the correct
1395          * user.
1396          */
1397         SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
1398         SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
1399         SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;
1400
1401         subflow = mptcp_subflow_ctx(sf->sk);
1402         pr_debug("subflow=%p", subflow);
1403
1404         *new_sock = sf;
1405         sock_hold(sk);
1406         subflow->conn = sk;
1407         mptcp_subflow_ops_override(sf->sk);
1408
1409         return 0;
1410 }
1411
1412 static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
1413                                                         gfp_t priority)
1414 {
1415         struct inet_connection_sock *icsk = inet_csk(sk);
1416         struct mptcp_subflow_context *ctx;
1417
1418         ctx = kzalloc(sizeof(*ctx), priority);
1419         if (!ctx)
1420                 return NULL;
1421
1422         rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
1423         INIT_LIST_HEAD(&ctx->node);
1424         INIT_LIST_HEAD(&ctx->delegated_node);
1425
1426         pr_debug("subflow=%p", ctx);
1427
1428         ctx->tcp_sock = sk;
1429
1430         return ctx;
1431 }
1432
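     /* Wake up anybody sleeping on the subflow's wait queue. */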
1433 static void __subflow_state_change(struct sock *sk)
1434 {
1435         struct socket_wq *wq;
1436
1437         rcu_read_lock();
1438         wq = rcu_dereference(sk->sk_wq);
1439         if (skwq_has_sleeper(wq))
1440                 wake_up_interruptible_all(&wq->wait);
1441         rcu_read_unlock();
1442 }
1443
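     /* True once no more data can be received on this subflow. */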
1444 static bool subflow_is_done(const struct sock *sk)
1445 {
1446         return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
1447 }
1448
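     /* sk_state_change callback for subflows: wake up waiters, handle the
      * fallback on simultaneous connect and propagate data availability,
      * subflow closure and EOF to the MPTCP-level socket.
      */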
1449 static void subflow_state_change(struct sock *sk)
1450 {
1451         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
1452         struct sock *parent = subflow->conn;
1453
1454         __subflow_state_change(sk);
1455
1456         if (subflow_simultaneous_connect(sk)) {
1457                 mptcp_propagate_sndbuf(parent, sk);
1458                 mptcp_do_fallback(sk);
1459                 mptcp_rcv_space_init(mptcp_sk(parent), sk);
1460                 pr_fallback(mptcp_sk(parent));
1461                 subflow->conn_finished = 1;
1462                 if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
1463                         inet_sk_state_store(parent, TCP_ESTABLISHED);
1464                         parent->sk_state_change(parent);
1465                 }
1466         }
1467
1468         /* as recvmsg() does not acquire the subflow socket for ssk selection
1469          * a fin packet carrying a DSS can go unnoticed if we don't trigger
1470          * the data available machinery here.
1471          */
1472         if (mptcp_subflow_data_available(sk))
1473                 mptcp_data_ready(parent, sk);
1474
1475         subflow_sched_work_if_closed(mptcp_sk(parent), sk);
1476
1477         if (__mptcp_check_fallback(mptcp_sk(parent)) &&
1478             !subflow->rx_eof && subflow_is_done(sk)) {
1479                 subflow->rx_eof = 1;
1480                 mptcp_subflow_eof(parent);
1481         }
1482 }
1483
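     /* ULP init hook, invoked when the "mptcp" ULP is attached to a kernel
      * socket: allocate the subflow context, save the original af_ops and
      * socket callbacks and install the subflow-aware replacements.
      */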
1484 static int subflow_ulp_init(struct sock *sk)
1485 {
1486         struct inet_connection_sock *icsk = inet_csk(sk);
1487         struct mptcp_subflow_context *ctx;
1488         struct tcp_sock *tp = tcp_sk(sk);
1489         int err = 0;
1490
1491         /* disallow attaching ULP to a socket unless it has been
1492          * created with sock_create_kern()
1493          */
1494         if (!sk->sk_kern_sock) {
1495                 err = -EOPNOTSUPP;
1496                 goto out;
1497         }
1498
1499         ctx = subflow_create_ctx(sk, GFP_KERNEL);
1500         if (!ctx) {
1501                 err = -ENOMEM;
1502                 goto out;
1503         }
1504
1505         pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);
1506
1507         tp->is_mptcp = 1;
1508         ctx->icsk_af_ops = icsk->icsk_af_ops;
1509         icsk->icsk_af_ops = subflow_default_af_ops(sk);
1510         ctx->tcp_data_ready = sk->sk_data_ready;
1511         ctx->tcp_state_change = sk->sk_state_change;
1512         ctx->tcp_write_space = sk->sk_write_space;
1513         ctx->tcp_error_report = sk->sk_error_report;
1514         sk->sk_data_ready = subflow_data_ready;
1515         sk->sk_write_space = subflow_write_space;
1516         sk->sk_state_change = subflow_state_change;
1517         sk->sk_error_report = subflow_error_report;
1518 out:
1519         return err;
1520 }
1521
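     /* ULP release hook: undo the proto ops override, drop the reference on
      * the owning msk and free the context, unless an unaccepted subflow is
      * still linked to the msk and __mptcp_close_ssk() will free it instead.
      */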
1522 static void subflow_ulp_release(struct sock *ssk)
1523 {
1524         struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
1525         bool release = true;
1526         struct sock *sk;
1527
1528         if (!ctx)
1529                 return;
1530
1531         sk = ctx->conn;
1532         if (sk) {
1533                 /* if the msk has been orphaned and the subflow is still
1534                  * unaccepted, keep the ctx alive; it will be freed
1535                  * by __mptcp_close_ssk()
1536                  */
1537                 release = ctx->disposable || list_empty(&ctx->node);
1538                 sock_put(sk);
1539         }
1540
1541         mptcp_subflow_ops_undo_override(ssk);
1542         if (release)
1543                 kfree_rcu(ctx, rcu);
1544 }
1545
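     /* ULP clone hook, run when a request sock is promoted to a full child
      * socket: create a context for the child and copy over the MP_CAPABLE or
      * MP_JOIN state negotiated during the handshake, falling back to plain
      * TCP when no MPTCP option was negotiated or the allocation fails.
      */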
1546 static void subflow_ulp_clone(const struct request_sock *req,
1547                               struct sock *newsk,
1548                               const gfp_t priority)
1549 {
1550         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
1551         struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
1552         struct mptcp_subflow_context *new_ctx;
1553
1554         if (!tcp_rsk(req)->is_mptcp ||
1555             (!subflow_req->mp_capable && !subflow_req->mp_join)) {
1556                 subflow_ulp_fallback(newsk, old_ctx);
1557                 return;
1558         }
1559
1560         new_ctx = subflow_create_ctx(newsk, priority);
1561         if (!new_ctx) {
1562                 subflow_ulp_fallback(newsk, old_ctx);
1563                 return;
1564         }
1565
1566         new_ctx->conn_finished = 1;
1567         new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
1568         new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
1569         new_ctx->tcp_state_change = old_ctx->tcp_state_change;
1570         new_ctx->tcp_write_space = old_ctx->tcp_write_space;
1571         new_ctx->tcp_error_report = old_ctx->tcp_error_report;
1572         new_ctx->rel_write_seq = 1;
1573         new_ctx->tcp_sock = newsk;
1574
1575         if (subflow_req->mp_capable) {
1576                 /* see comments in subflow_syn_recv_sock(): the MPTCP connection
1577                  * is fully established only after we receive the remote key
1578                  */
1579                 new_ctx->mp_capable = 1;
1580                 new_ctx->local_key = subflow_req->local_key;
1581                 new_ctx->token = subflow_req->token;
1582                 new_ctx->ssn_offset = subflow_req->ssn_offset;
1583                 new_ctx->idsn = subflow_req->idsn;
1584         } else if (subflow_req->mp_join) {
1585                 new_ctx->ssn_offset = subflow_req->ssn_offset;
1586                 new_ctx->mp_join = 1;
1587                 new_ctx->fully_established = 1;
1588                 new_ctx->backup = subflow_req->backup;
1589                 new_ctx->local_id = subflow_req->local_id;
1590                 new_ctx->remote_id = subflow_req->remote_id;
1591                 new_ctx->token = subflow_req->token;
1592                 new_ctx->thmac = subflow_req->thmac;
1593         }
1594 }
1595
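     /* release_cb used by the override proto ops: run any pending delegated
      * MPTCP actions before the regular TCP release callback.
      */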
1596 static void tcp_release_cb_override(struct sock *ssk)
1597 {
1598         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
1599
1600         if (mptcp_subflow_has_delegated_action(subflow))
1601                 mptcp_subflow_process_delegated(ssk);
1602
1603         tcp_release_cb(ssk);
1604 }
1605
1606 static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
1607         .name           = "mptcp",
1608         .owner          = THIS_MODULE,
1609         .init           = subflow_ulp_init,
1610         .release        = subflow_ulp_release,
1611         .clone          = subflow_ulp_clone,
1612 };
1613
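     /* Set up the request_sock_ops used for subflow requests: a larger object
      * size, a dedicated slab cache and the MPTCP-aware destructor.
      */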
1614 static int subflow_ops_init(struct request_sock_ops *subflow_ops)
1615 {
1616         subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
1617         subflow_ops->slab_name = "request_sock_subflow";
1618
1619         subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
1620                                               subflow_ops->obj_size, 0,
1621                                               SLAB_ACCOUNT |
1622                                               SLAB_TYPESAFE_BY_RCU,
1623                                               NULL);
1624         if (!subflow_ops->slab)
1625                 return -ENOMEM;
1626
1627         subflow_ops->destructor = subflow_req_destructor;
1628
1629         return 0;
1630 }
1631
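     /* Boot-time initialization: build the MPTCP flavours of the request sock,
      * af-specific and proto ops tables for IPv4 (and IPv6 when enabled), then
      * register the "mptcp" ULP.
      */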
1632 void __init mptcp_subflow_init(void)
1633 {
1634         mptcp_subflow_request_sock_ops = tcp_request_sock_ops;
1635         if (subflow_ops_init(&mptcp_subflow_request_sock_ops) != 0)
1636                 panic("MPTCP: failed to init subflow request sock ops\n");
1637
1638         subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
1639         subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req;
1640
1641         subflow_specific = ipv4_specific;
1642         subflow_specific.conn_request = subflow_v4_conn_request;
1643         subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
1644         subflow_specific.sk_rx_dst_set = subflow_finish_connect;
1645
1646         tcp_prot_override = tcp_prot;
1647         tcp_prot_override.release_cb = tcp_release_cb_override;
1648
1649 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1650         subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
1651         subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;
1652
1653         subflow_v6_specific = ipv6_specific;
1654         subflow_v6_specific.conn_request = subflow_v6_conn_request;
1655         subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
1656         subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
1657
1658         subflow_v6m_specific = subflow_v6_specific;
1659         subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
1660         subflow_v6m_specific.send_check = ipv4_specific.send_check;
1661         subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
1662         subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
1663         subflow_v6m_specific.net_frag_header_len = 0;
1664
1665         tcpv6_prot_override = tcpv6_prot;
1666         tcpv6_prot_override.release_cb = tcp_release_cb_override;
1667 #endif
1668
1669         mptcp_diag_subflow_init(&subflow_ulp_ops);
1670
1671         if (tcp_register_ulp(&subflow_ulp_ops) != 0)
1672                 panic("MPTCP: failed to register subflows to ULP\n");
1673 }