net/mptcp/subflow.c
1 // SPDX-License-Identifier: GPL-2.0
2 /* Multipath TCP
3  *
4  * Copyright (c) 2017 - 2019, Intel Corporation.
5  */
6
7 #define pr_fmt(fmt) "MPTCP: " fmt
8
9 #include <linux/kernel.h>
10 #include <linux/module.h>
11 #include <linux/netdevice.h>
12 #include <crypto/algapi.h>
13 #include <crypto/sha.h>
14 #include <net/sock.h>
15 #include <net/inet_common.h>
16 #include <net/inet_hashtables.h>
17 #include <net/protocol.h>
18 #include <net/tcp.h>
19 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
20 #include <net/ip6_route.h>
21 #endif
22 #include <net/mptcp.h>
23 #include "protocol.h"
24 #include "mib.h"
25
26 static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
27                                   enum linux_mptcp_mib_field field)
28 {
29         MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
30 }
31
32 static int subflow_rebuild_header(struct sock *sk)
33 {
34         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
35         int local_id, err = 0;
36
37         if (subflow->request_mptcp && !subflow->token) {
38                 pr_debug("subflow=%p", sk);
39                 err = mptcp_token_new_connect(sk);
40         } else if (subflow->request_join && !subflow->local_nonce) {
41                 struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn;
42
43                 pr_debug("subflow=%p", sk);
44
45                 do {
46                         get_random_bytes(&subflow->local_nonce, sizeof(u32));
47                 } while (!subflow->local_nonce);
48
49                 if (subflow->local_id)
50                         goto out;
51
52                 local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
53                 if (local_id < 0)
54                         return -EINVAL;
55
56                 subflow->local_id = local_id;
57         }
58
59 out:
60         if (err)
61                 return err;
62
63         return subflow->icsk_af_ops->rebuild_header(sk);
64 }
65
66 static void subflow_req_destructor(struct request_sock *req)
67 {
68         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
69
70         pr_debug("subflow_req=%p", subflow_req);
71
72         if (subflow_req->mp_capable)
73                 mptcp_token_destroy_request(subflow_req->token);
74         tcp_request_sock_ops.destructor(req);
75 }
76
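/* pack both nonces big-endian into an 8-byte message and compute the HMAC
 * keyed with key1/key2; callers truncate the SHA256 digest as needed
 * (64 bits for thmac, MPTCPOPT_HMAC_LEN bytes for the third ACK hmac)
 */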
77 static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
78                                   void *hmac)
79 {
80         u8 msg[8];
81
82         put_unaligned_be32(nonce1, &msg[0]);
83         put_unaligned_be32(nonce2, &msg[4]);
84
85         mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
86 }
87
88 /* validate received token and create truncated hmac and nonce for SYN-ACK */
89 static bool subflow_token_join_request(struct request_sock *req,
90                                        const struct sk_buff *skb)
91 {
92         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
93         u8 hmac[SHA256_DIGEST_SIZE];
94         struct mptcp_sock *msk;
95         int local_id;
96
97         msk = mptcp_token_get_sock(subflow_req->token);
98         if (!msk) {
99                 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
100                 return false;
101         }
102
103         local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
104         if (local_id < 0) {
105                 sock_put((struct sock *)msk);
106                 return false;
107         }
108         subflow_req->local_id = local_id;
109
110         get_random_bytes(&subflow_req->local_nonce, sizeof(u32));
111
112         subflow_generate_hmac(msk->local_key, msk->remote_key,
113                               subflow_req->local_nonce,
114                               subflow_req->remote_nonce, hmac);
115
116         subflow_req->thmac = get_unaligned_be64(hmac);
117
118         sock_put((struct sock *)msk);
119         return true;
120 }
121
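/* initialize the MPTCP-specific fields of the request socket from the SYN
 * options: MP_CAPABLE requests get a new token, MP_JOIN requests are
 * validated against an existing msk via subflow_token_join_request()
 */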
122 static void subflow_init_req(struct request_sock *req,
123                              const struct sock *sk_listener,
124                              struct sk_buff *skb)
125 {
126         struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
127         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
128         struct mptcp_options_received mp_opt;
129
130         pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
131
132         mptcp_get_options(skb, &mp_opt);
133
134         subflow_req->mp_capable = 0;
135         subflow_req->mp_join = 0;
136
137 #ifdef CONFIG_TCP_MD5SIG
138         /* no MPTCP if MD5SIG is enabled on this socket, or we may run out of
139          * TCP option space.
140          */
141         if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
142                 return;
143 #endif
144
145         if (mp_opt.mp_capable) {
146                 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
147
148                 if (mp_opt.mp_join)
149                         return;
150         } else if (mp_opt.mp_join) {
151                 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
152         }
153
154         if (mp_opt.mp_capable && listener->request_mptcp) {
155                 int err;
156
157                 err = mptcp_token_new_request(req);
158                 if (err == 0)
159                         subflow_req->mp_capable = 1;
160
161                 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
162         } else if (mp_opt.mp_join && listener->request_mptcp) {
163                 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
164                 subflow_req->mp_join = 1;
165                 subflow_req->backup = mp_opt.backup;
166                 subflow_req->remote_id = mp_opt.join_id;
167                 subflow_req->token = mp_opt.token;
168                 subflow_req->remote_nonce = mp_opt.nonce;
169                 pr_debug("token=%u, remote_nonce=%u", subflow_req->token,
170                          subflow_req->remote_nonce);
171                 if (!subflow_token_join_request(req, skb)) {
172                         subflow_req->mp_join = 0;
173                         // @@ need to trigger RST
174                 }
175         }
176 }
177
178 static void subflow_v4_init_req(struct request_sock *req,
179                                 const struct sock *sk_listener,
180                                 struct sk_buff *skb)
181 {
182         tcp_rsk(req)->is_mptcp = 1;
183
184         tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);
185
186         subflow_init_req(req, sk_listener, skb);
187 }
188
189 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
190 static void subflow_v6_init_req(struct request_sock *req,
191                                 const struct sock *sk_listener,
192                                 struct sk_buff *skb)
193 {
194         tcp_rsk(req)->is_mptcp = 1;
195
196         tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);
197
198         subflow_init_req(req, sk_listener, skb);
199 }
200 #endif
201
202 /* validate received truncated hmac and create hmac for third ACK */
203 static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
204 {
205         u8 hmac[SHA256_DIGEST_SIZE];
206         u64 thmac;
207
208         subflow_generate_hmac(subflow->remote_key, subflow->local_key,
209                               subflow->remote_nonce, subflow->local_nonce,
210                               hmac);
211
212         thmac = get_unaligned_be64(hmac);
213         pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
214                  subflow, subflow->token,
215                  (unsigned long long)thmac,
216                  (unsigned long long)subflow->thmac);
217
218         return thmac == subflow->thmac;
219 }
220
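/* process the SYN-ACK on an active subflow: complete the MP_CAPABLE or
 * MP_JOIN handshake, fall back to plain TCP, or reset the subflow
 */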
221 static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
222 {
223         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
224         struct mptcp_options_received mp_opt;
225         struct sock *parent = subflow->conn;
226         struct tcp_sock *tp = tcp_sk(sk);
227
228         subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
229
230         if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
231                 inet_sk_state_store(parent, TCP_ESTABLISHED);
232                 parent->sk_state_change(parent);
233         }
234
235         /* be sure no special action on any packet other than syn-ack */
236         if (subflow->conn_finished)
237                 return;
238
239         subflow->conn_finished = 1;
240
241         mptcp_get_options(skb, &mp_opt);
242         if (subflow->request_mptcp && mp_opt.mp_capable) {
243                 subflow->mp_capable = 1;
244                 subflow->can_ack = 1;
245                 subflow->remote_key = mp_opt.sndr_key;
246                 pr_debug("subflow=%p, remote_key=%llu", subflow,
247                          subflow->remote_key);
248         } else if (subflow->request_join && mp_opt.mp_join) {
249                 subflow->mp_join = 1;
250                 subflow->thmac = mp_opt.thmac;
251                 subflow->remote_nonce = mp_opt.nonce;
252                 pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
253                          subflow->thmac, subflow->remote_nonce);
254         } else if (subflow->request_mptcp) {
255                 tp->is_mptcp = 0;
256         }
257
258         if (!tp->is_mptcp)
259                 return;
260
261         if (subflow->mp_capable) {
262                 pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
263                          subflow->remote_key);
264                 mptcp_finish_connect(sk);
265
266                 if (skb) {
267                         pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
268                         subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
269                 }
270         } else if (subflow->mp_join) {
271                 u8 hmac[SHA256_DIGEST_SIZE];
272
273                 pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
274                          subflow, subflow->thmac,
275                          subflow->remote_nonce);
276                 if (!subflow_thmac_valid(subflow)) {
277                         MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
278                         subflow->mp_join = 0;
279                         goto do_reset;
280                 }
281
282                 subflow_generate_hmac(subflow->local_key, subflow->remote_key,
283                                       subflow->local_nonce,
284                                       subflow->remote_nonce,
285                                       hmac);
286
287                 memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);
288
289                 if (skb)
290                         subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
291
292                 if (!mptcp_finish_join(sk))
293                         goto do_reset;
294
295                 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
296         } else {
297 do_reset:
298                 tcp_send_active_reset(sk, GFP_ATOMIC);
299                 tcp_done(sk);
300         }
301 }
302
303 static struct request_sock_ops subflow_request_sock_ops;
304 static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;
305
306 static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
307 {
308         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
309
310         pr_debug("subflow=%p", subflow);
311
312         /* Never answer to SYNs sent to broadcast or multicast */
313         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
314                 goto drop;
315
316         return tcp_conn_request(&subflow_request_sock_ops,
317                                 &subflow_request_sock_ipv4_ops,
318                                 sk, skb);
319 drop:
320         tcp_listendrop(sk);
321         return 0;
322 }
323
324 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
325 static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
326 static struct inet_connection_sock_af_ops subflow_v6_specific;
327 static struct inet_connection_sock_af_ops subflow_v6m_specific;
328
329 static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
330 {
331         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
332
333         pr_debug("subflow=%p", subflow);
334
335         if (skb->protocol == htons(ETH_P_IP))
336                 return subflow_v4_conn_request(sk, skb);
337
338         if (!ipv6_unicast_destination(skb))
339                 goto drop;
340
341         return tcp_conn_request(&subflow_request_sock_ops,
342                                 &subflow_request_sock_ipv6_ops, sk, skb);
343
344 drop:
345         tcp_listendrop(sk);
346         return 0; /* don't send reset */
347 }
348 #endif
349
350 /* validate hmac received in third ACK */
351 static bool subflow_hmac_valid(const struct request_sock *req,
352                                const struct mptcp_options_received *mp_opt)
353 {
354         const struct mptcp_subflow_request_sock *subflow_req;
355         u8 hmac[SHA256_DIGEST_SIZE];
356         struct mptcp_sock *msk;
357         bool ret;
358
359         subflow_req = mptcp_subflow_rsk(req);
360         msk = mptcp_token_get_sock(subflow_req->token);
361         if (!msk)
362                 return false;
363
364         subflow_generate_hmac(msk->remote_key, msk->local_key,
365                               subflow_req->remote_nonce,
366                               subflow_req->local_nonce, hmac);
367
368         ret = true;
369         if (crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN))
370                 ret = false;
371
372         sock_put((struct sock *)msk);
373         return ret;
374 }
375
376 static void mptcp_sock_destruct(struct sock *sk)
377 {
378         /* if the new mptcp socket isn't accepted, it is freed
379          * from the tcp listener socket's request queue, linked
380          * from req->sk.  The tcp socket is released.
381          * This calls the ULP release function which will
382          * also remove the mptcp socket, via
383          * sock_put(ctx->conn).
384          *
385          * Problem is that the mptcp socket will not be in
386          * SYN_RECV state and doesn't have SOCK_DEAD flag.
387          * Both result in warnings from inet_sock_destruct.
388          */
389
390         if (sk->sk_state == TCP_SYN_RECV) {
391                 sk->sk_state = TCP_CLOSE;
392                 WARN_ON_ONCE(sk->sk_socket);
393                 sock_orphan(sk);
394         }
395
396         inet_sock_destruct(sk);
397 }
398
399 static void mptcp_force_close(struct sock *sk)
400 {
401         inet_sk_state_store(sk, TCP_CLOSE);
402         sk_common_release(sk);
403 }
404
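/* detach the MPTCP ULP context so the socket behaves as plain TCP again */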
405 static void subflow_ulp_fallback(struct sock *sk,
406                                  struct mptcp_subflow_context *old_ctx)
407 {
408         struct inet_connection_sock *icsk = inet_csk(sk);
409
410         mptcp_subflow_tcp_fallback(sk, old_ctx);
411         icsk->icsk_ulp_ops = NULL;
412         rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
413         tcp_sk(sk)->is_mptcp = 0;
414 }
415
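/* build the child socket for an incoming connection: attach the freshly
 * cloned msk for MP_CAPABLE requests, or link to the existing msk (and
 * complete the join) for MP_JOIN requests
 */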
416 static struct sock *subflow_syn_recv_sock(const struct sock *sk,
417                                           struct sk_buff *skb,
418                                           struct request_sock *req,
419                                           struct dst_entry *dst,
420                                           struct request_sock *req_unhash,
421                                           bool *own_req)
422 {
423         struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
424         struct mptcp_subflow_request_sock *subflow_req;
425         struct mptcp_options_received mp_opt;
426         bool fallback_is_fatal = false;
427         struct sock *new_msk = NULL;
428         bool fallback = false;
429         struct sock *child;
430
431         pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
432
433         /* we need a valid 'mp_capable' value later on, even when the
434          * options are not parsed
435          */
436         mp_opt.mp_capable = 0;
437         if (tcp_rsk(req)->is_mptcp == 0)
438                 goto create_child;
439
440         /* if the sk is MP_CAPABLE, we try to fetch the client key */
441         subflow_req = mptcp_subflow_rsk(req);
442         if (subflow_req->mp_capable) {
443                 if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
444                         /* here we can receive and accept an in-window,
445                          * out-of-order pkt, which will not carry the MP_CAPABLE
446                          * opt even on mptcp enabled paths
447                          */
448                         goto create_msk;
449                 }
450
451                 mptcp_get_options(skb, &mp_opt);
452                 if (!mp_opt.mp_capable) {
453                         fallback = true;
454                         goto create_child;
455                 }
456
457 create_msk:
458                 new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);
459                 if (!new_msk)
460                         fallback = true;
461         } else if (subflow_req->mp_join) {
462                 fallback_is_fatal = true;
463                 mptcp_get_options(skb, &mp_opt);
464                 if (!mp_opt.mp_join ||
465                     !subflow_hmac_valid(req, &mp_opt)) {
466                         SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
467                         return NULL;
468                 }
469         }
470
471 create_child:
472         child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
473                                                      req_unhash, own_req);
474
475         if (child && *own_req) {
476                 struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);
477
478                 /* we need to fall back on ctx allocation failure and on the
479                  * prerequisite checks above. In the latter scenario we additionally
480                  * need to reset the context to non-MPTCP status.
481                  */
482                 if (!ctx || fallback) {
483                         if (fallback_is_fatal)
484                                 goto close_child;
485
486                         if (ctx) {
487                                 subflow_ulp_fallback(child, ctx);
488                                 kfree_rcu(ctx, rcu);
489                         }
490                         goto out;
491                 }
492
493                 if (ctx->mp_capable) {
494                         /* new mpc subflow takes ownership of the newly
495                          * created mptcp socket
496                          */
497                         new_msk->sk_destruct = mptcp_sock_destruct;
498                         mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
499                         ctx->conn = new_msk;
500                         new_msk = NULL;
501
502                         /* with OoO packets we can reach here without ingress
503                          * mpc option
504                          */
505                         ctx->remote_key = mp_opt.sndr_key;
506                         ctx->fully_established = mp_opt.mp_capable;
507                         ctx->can_ack = mp_opt.mp_capable;
508                 } else if (ctx->mp_join) {
509                         struct mptcp_sock *owner;
510
511                         owner = mptcp_token_get_sock(ctx->token);
512                         if (!owner)
513                                 goto close_child;
514
515                         ctx->conn = (struct sock *)owner;
516                         if (!mptcp_finish_join(child))
517                                 goto close_child;
518
519                         SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
520                 }
521         }
522
523 out:
524         /* dispose of the leftover mptcp master, if any */
525         if (unlikely(new_msk))
526                 mptcp_force_close(new_msk);
527
528         /* check for an expected invariant - should never trigger, just helps
529          * catch subtle bugs earlier
530          */
531         WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
532                      (!mptcp_subflow_ctx(child) ||
533                       !mptcp_subflow_ctx(child)->conn));
534         return child;
535
536 close_child:
537         tcp_send_active_reset(child, GFP_ATOMIC);
538         inet_csk_prepare_forced_close(child);
539         tcp_done(child);
540         return NULL;
541 }
542
543 static struct inet_connection_sock_af_ops subflow_specific;
544
545 enum mapping_status {
546         MAPPING_OK,
547         MAPPING_INVALID,
548         MAPPING_EMPTY,
549         MAPPING_DATA_FIN
550 };
551
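/* expand a 32-bit DSS sequence number to 64 bits, using the previous
 * mapping (old_seq/old_data_len) as the reference point
 */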
552 static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
553 {
554         if ((u32)seq == (u32)old_seq)
555                 return old_seq;
556
557         /* Assume map covers data not mapped yet. */
558         return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
559 }
560
561 static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
562 {
563         WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
564                   ssn, subflow->map_subflow_seq, subflow->map_data_len);
565 }
566
567 static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
568 {
569         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
570         unsigned int skb_consumed;
571
572         skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
573         if (WARN_ON_ONCE(skb_consumed >= skb->len))
574                 return true;
575
576         return skb->len - skb_consumed <= subflow->map_data_len -
577                                           mptcp_subflow_get_map_offset(subflow);
578 }
579
580 static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
581 {
582         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
583         u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
584
585         if (unlikely(before(ssn, subflow->map_subflow_seq))) {
586                 /* Mapping covers data later in the subflow stream,
587                  * currently unsupported.
588                  */
589                 warn_bad_map(subflow, ssn);
590                 return false;
591         }
592         if (unlikely(!before(ssn, subflow->map_subflow_seq +
593                                   subflow->map_data_len))) {
594                 /* Mapping covers only past subflow data, invalid */
595                 warn_bad_map(subflow, ssn + skb->len);
596                 return false;
597         }
598         return true;
599 }
600
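/* parse the DSS mapping carried by the skb at the head of the receive
 * queue and install it as (or validate it against) the current mapping
 */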
601 static enum mapping_status get_mapping_status(struct sock *ssk)
602 {
603         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
604         struct mptcp_ext *mpext;
605         struct sk_buff *skb;
606         u16 data_len;
607         u64 map_seq;
608
609         skb = skb_peek(&ssk->sk_receive_queue);
610         if (!skb)
611                 return MAPPING_EMPTY;
612
613         mpext = mptcp_get_ext(skb);
614         if (!mpext || !mpext->use_map) {
615                 if (!subflow->map_valid && !skb->len) {
616                         /* the TCP stack delivers 0-len FIN pkts to the receive
617                          * queue; those are the only 0-len pkts ever expected here,
618                          * and we can admit a missing mapping only for 0-len pkts
619                          */
620                         if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
621                                 WARN_ONCE(1, "0len seq %d:%d flags %x",
622                                           TCP_SKB_CB(skb)->seq,
623                                           TCP_SKB_CB(skb)->end_seq,
624                                           TCP_SKB_CB(skb)->tcp_flags);
625                         sk_eat_skb(ssk, skb);
626                         return MAPPING_EMPTY;
627                 }
628
629                 if (!subflow->map_valid)
630                         return MAPPING_INVALID;
631
632                 goto validate_seq;
633         }
634
635         pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
636                  mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
637                  mpext->data_len, mpext->data_fin);
638
639         data_len = mpext->data_len;
640         if (data_len == 0) {
641                 pr_err("Infinite mapping not handled");
642                 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
643                 return MAPPING_INVALID;
644         }
645
646         if (mpext->data_fin == 1) {
647                 if (data_len == 1) {
648                         pr_debug("DATA_FIN with no payload");
649                         if (subflow->map_valid) {
650                                 /* A DATA_FIN might arrive in a DSS
651                                  * option before the previous mapping
652                                  * has been fully consumed. Continue
653                                  * handling the existing mapping.
654                                  */
655                                 skb_ext_del(skb, SKB_EXT_MPTCP);
656                                 return MAPPING_OK;
657                         } else {
658                                 return MAPPING_DATA_FIN;
659                         }
660                 }
661
662                 /* Adjust for DATA_FIN using 1 byte of sequence space */
663                 data_len--;
664         }
665
666         if (!mpext->dsn64) {
667                 map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
668                                      mpext->data_seq);
669                 pr_debug("expanded seq=%llu", subflow->map_seq);
670         } else {
671                 map_seq = mpext->data_seq;
672         }
673
674         if (subflow->map_valid) {
675                 /* Allow replacing only with an identical map */
676                 if (subflow->map_seq == map_seq &&
677                     subflow->map_subflow_seq == mpext->subflow_seq &&
678                     subflow->map_data_len == data_len) {
679                         skb_ext_del(skb, SKB_EXT_MPTCP);
680                         return MAPPING_OK;
681                 }
682
683                 /* If this skb's data is fully covered by the current mapping,
684                  * the new map would need caching, which is not supported
685                  */
686                 if (skb_is_fully_mapped(ssk, skb)) {
687                         MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
688                         return MAPPING_INVALID;
689                 }
690
691                 /* will validate the next map after consuming the current one */
692                 return MAPPING_OK;
693         }
694
695         subflow->map_seq = map_seq;
696         subflow->map_subflow_seq = mpext->subflow_seq;
697         subflow->map_data_len = data_len;
698         subflow->map_valid = 1;
699         subflow->mpc_map = mpext->mpc_map;
700         pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
701                  subflow->map_seq, subflow->map_subflow_seq,
702                  subflow->map_data_len);
703
704 validate_seq:
705         /* we revalidate the current mapping on each new skb, because we must
706          * ensure the current skb is completely covered by the available mapping
707          */
708         if (!validate_mapping(ssk, skb))
709                 return MAPPING_INVALID;
710
711         skb_ext_del(skb, SKB_EXT_MPTCP);
712         return MAPPING_OK;
713 }
714
715 static int subflow_read_actor(read_descriptor_t *desc,
716                               struct sk_buff *skb,
717                               unsigned int offset, size_t len)
718 {
719         size_t copy_len = min(desc->count, len);
720
721         desc->count -= copy_len;
722
723         pr_debug("flushed %zu bytes, %zu left", copy_len, desc->count);
724         return copy_len;
725 }
726
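/* check whether in-sequence MPTCP-level data is available on this subflow;
 * data not matching the msk-level ack_seq is discarded
 */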
727 static bool subflow_check_data_avail(struct sock *ssk)
728 {
729         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
730         enum mapping_status status;
731         struct mptcp_sock *msk;
732         struct sk_buff *skb;
733
734         pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
735                  subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
736         if (subflow->data_avail)
737                 return true;
738
739         msk = mptcp_sk(subflow->conn);
740         for (;;) {
741                 u32 map_remaining;
742                 size_t delta;
743                 u64 ack_seq;
744                 u64 old_ack;
745
746                 status = get_mapping_status(ssk);
747                 pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
748                 if (status == MAPPING_INVALID) {
749                         ssk->sk_err = EBADMSG;
750                         goto fatal;
751                 }
752
753                 if (status != MAPPING_OK)
754                         return false;
755
756                 skb = skb_peek(&ssk->sk_receive_queue);
757                 if (WARN_ON_ONCE(!skb))
758                         return false;
759
760                 /* if msk lacks the remote key, this subflow must provide an
761                  * MP_CAPABLE-based mapping
762                  */
763                 if (unlikely(!READ_ONCE(msk->can_ack))) {
764                         if (!subflow->mpc_map) {
765                                 ssk->sk_err = EBADMSG;
766                                 goto fatal;
767                         }
768                         WRITE_ONCE(msk->remote_key, subflow->remote_key);
769                         WRITE_ONCE(msk->ack_seq, subflow->map_seq);
770                         WRITE_ONCE(msk->can_ack, true);
771                 }
772
773                 old_ack = READ_ONCE(msk->ack_seq);
774                 ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
775                 pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
776                          ack_seq);
777                 if (ack_seq == old_ack)
778                         break;
779
780                 /* only accept in-sequence mappings. Old values are spurious
781                  * retransmissions; we can hit "future" values on an active backup
782                  * subflow switch, and we rely on retransmissions to get
783                  * in-sequence data.
784                  * Concurrent subflow support will require subflow data
785                  * reordering
786                  */
787                 map_remaining = subflow->map_data_len -
788                                 mptcp_subflow_get_map_offset(subflow);
789                 if (before64(ack_seq, old_ack))
790                         delta = min_t(size_t, old_ack - ack_seq, map_remaining);
791                 else
792                         delta = min_t(size_t, ack_seq - old_ack, map_remaining);
793
794                 /* discard mapped data */
795                 pr_debug("discarding %zu bytes, current map len=%d", delta,
796                          map_remaining);
797                 if (delta) {
798                         read_descriptor_t desc = {
799                                 .count = delta,
800                         };
801                         int ret;
802
803                         ret = tcp_read_sock(ssk, &desc, subflow_read_actor);
804                         if (ret < 0) {
805                                 ssk->sk_err = -ret;
806                                 goto fatal;
807                         }
808                         if (ret < delta)
809                                 return false;
810                         if (delta == map_remaining)
811                                 subflow->map_valid = 0;
812                 }
813         }
814         return true;
815
816 fatal:
817         /* fatal protocol error, close the socket */
818         /* This barrier is coupled with smp_rmb() in tcp_poll() */
819         smp_wmb();
820         ssk->sk_error_report(ssk);
821         tcp_set_state(ssk, TCP_CLOSE);
822         tcp_send_active_reset(ssk, GFP_ATOMIC);
823         return false;
824 }
825
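/* returns true when MPTCP-level data is ready to be read from this subflow;
 * also drops mappings that have been fully consumed
 */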
826 bool mptcp_subflow_data_available(struct sock *sk)
827 {
828         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
829         struct sk_buff *skb;
830
831         /* check if current mapping is still valid */
832         if (subflow->map_valid &&
833             mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
834                 subflow->map_valid = 0;
835                 subflow->data_avail = 0;
836
837                 pr_debug("Done with mapping: seq=%u data_len=%u",
838                          subflow->map_subflow_seq,
839                          subflow->map_data_len);
840         }
841
842         if (!subflow_check_data_avail(sk)) {
843                 subflow->data_avail = 0;
844                 return false;
845         }
846
847         skb = skb_peek(&sk->sk_receive_queue);
848         subflow->data_avail = skb &&
849                        before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
850         return subflow->data_avail;
851 }
852
853 static void subflow_data_ready(struct sock *sk)
854 {
855         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
856         struct sock *parent = subflow->conn;
857
858         if (!subflow->mp_capable && !subflow->mp_join) {
859                 subflow->tcp_data_ready(sk);
860
861                 parent->sk_data_ready(parent);
862                 return;
863         }
864
865         if (mptcp_subflow_data_available(sk))
866                 mptcp_data_ready(parent, sk);
867 }
868
869 static void subflow_write_space(struct sock *sk)
870 {
871         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
872         struct sock *parent = subflow->conn;
873
874         sk_stream_write_space(sk);
875         if (sk_stream_is_writeable(sk)) {
876                 set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
877                 smp_mb__after_atomic();
878                 /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
879                 sk_stream_write_space(parent);
880         }
881 }
882
883 static struct inet_connection_sock_af_ops *
884 subflow_default_af_ops(struct sock *sk)
885 {
886 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
887         if (sk->sk_family == AF_INET6)
888                 return &subflow_v6_specific;
889 #endif
890         return &subflow_specific;
891 }
892
893 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
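/* switch the af_ops of an IPv4-mapped IPv6 subflow: when 'mapped' is true
 * use the v6m variant, which transmits through the IPv4 code paths
 */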
894 void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
895 {
896         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
897         struct inet_connection_sock *icsk = inet_csk(sk);
898         struct inet_connection_sock_af_ops *target;
899
900         target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);
901
902         pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
903                  subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);
904
905         if (likely(icsk->icsk_af_ops == target))
906                 return;
907
908         subflow->icsk_af_ops = icsk->icsk_af_ops;
909         icsk->icsk_af_ops = target;
910 }
911 #endif
912
913 static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
914                                 struct sockaddr_storage *addr)
915 {
916         memset(addr, 0, sizeof(*addr));
917         addr->ss_family = info->family;
918         if (addr->ss_family == AF_INET) {
919                 struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;
920
921                 in_addr->sin_addr = info->addr;
922                 in_addr->sin_port = info->port;
923         }
924 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
925         else if (addr->ss_family == AF_INET6) {
926                 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;
927
928                 in6_addr->sin6_addr = info->addr6;
929                 in6_addr->sin6_port = info->port;
930         }
931 #endif
932 }
933
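/* create a new subflow socket bound to 'loc', connect it to 'remote' as an
 * MP_JOIN attempt and add it to the msk join list
 */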
934 int __mptcp_subflow_connect(struct sock *sk, int ifindex,
935                             const struct mptcp_addr_info *loc,
936                             const struct mptcp_addr_info *remote)
937 {
938         struct mptcp_sock *msk = mptcp_sk(sk);
939         struct mptcp_subflow_context *subflow;
940         struct sockaddr_storage addr;
941         struct socket *sf;
942         u32 remote_token;
943         int addrlen;
944         int err;
945
946         if (sk->sk_state != TCP_ESTABLISHED)
947                 return -ENOTCONN;
948
949         err = mptcp_subflow_create_socket(sk, &sf);
950         if (err)
951                 return err;
952
953         subflow = mptcp_subflow_ctx(sf->sk);
954         subflow->remote_key = msk->remote_key;
955         subflow->local_key = msk->local_key;
956         subflow->token = msk->token;
957         mptcp_info2sockaddr(loc, &addr);
958
959         addrlen = sizeof(struct sockaddr_in);
960 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
961         if (loc->family == AF_INET6)
962                 addrlen = sizeof(struct sockaddr_in6);
963 #endif
964         sf->sk->sk_bound_dev_if = ifindex;
965         err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
966         if (err)
967                 goto failed;
968
969         mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
970         pr_debug("msk=%p remote_token=%u", msk, remote_token);
971         subflow->remote_token = remote_token;
972         subflow->local_id = loc->id;
973         subflow->request_join = 1;
974         subflow->request_bkup = 1;
975         mptcp_info2sockaddr(remote, &addr);
976
977         err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
978         if (err && err != -EINPROGRESS)
979                 goto failed;
980
981         spin_lock_bh(&msk->join_list_lock);
982         list_add_tail(&subflow->node, &msk->join_list);
983         spin_unlock_bh(&msk->join_list_lock);
984
985         return err;
986
987 failed:
988         sock_release(sf);
989         return err;
990 }
991
992 int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
993 {
994         struct mptcp_subflow_context *subflow;
995         struct net *net = sock_net(sk);
996         struct socket *sf;
997         int err;
998
999         err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
1000                                &sf);
1001         if (err)
1002                 return err;
1003
1004         lock_sock(sf->sk);
1005
1006         /* kernel sockets do not acquire a net ref by default, but the TCP
1007          * timer needs one.
1008          */
1009         sf->sk->sk_net_refcnt = 1;
1010         get_net(net);
1011 #ifdef CONFIG_PROC_FS
1012         this_cpu_add(*net->core.sock_inuse, 1);
1013 #endif
1014         err = tcp_set_ulp(sf->sk, "mptcp");
1015         release_sock(sf->sk);
1016
1017         if (err)
1018                 return err;
1019
1020         /* the newly created socket really belongs to the owning MPTCP master
1021          * socket, even if for additional subflows the allocation is performed
1022          * by a kernel workqueue. Adjust inode references, so that the
1023          * procfs/diag interfaces really show this one belonging to the correct
1024          * user.
1025          */
1026         SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
1027         SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
1028         SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;
1029
1030         subflow = mptcp_subflow_ctx(sf->sk);
1031         pr_debug("subflow=%p", subflow);
1032
1033         *new_sock = sf;
1034         sock_hold(sk);
1035         subflow->conn = sk;
1036
1037         return 0;
1038 }
1039
1040 static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
1041                                                         gfp_t priority)
1042 {
1043         struct inet_connection_sock *icsk = inet_csk(sk);
1044         struct mptcp_subflow_context *ctx;
1045
1046         ctx = kzalloc(sizeof(*ctx), priority);
1047         if (!ctx)
1048                 return NULL;
1049
1050         rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
1051         INIT_LIST_HEAD(&ctx->node);
1052
1053         pr_debug("subflow=%p", ctx);
1054
1055         ctx->tcp_sock = sk;
1056
1057         return ctx;
1058 }
1059
1060 static void __subflow_state_change(struct sock *sk)
1061 {
1062         struct socket_wq *wq;
1063
1064         rcu_read_lock();
1065         wq = rcu_dereference(sk->sk_wq);
1066         if (skwq_has_sleeper(wq))
1067                 wake_up_interruptible_all(&wq->wait);
1068         rcu_read_unlock();
1069 }
1070
1071 static bool subflow_is_done(const struct sock *sk)
1072 {
1073         return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
1074 }
1075
1076 static void subflow_state_change(struct sock *sk)
1077 {
1078         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
1079         struct sock *parent = subflow->conn;
1080
1081         __subflow_state_change(sk);
1082
1083         /* as recvmsg() does not acquire the subflow socket for ssk selection,
1084          * a fin packet carrying a DSS can go unnoticed if we don't trigger
1085          * the data available machinery here.
1086          */
1087         if (subflow->mp_capable && mptcp_subflow_data_available(sk))
1088                 mptcp_data_ready(parent, sk);
1089
1090         if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&
1091             !subflow->rx_eof && subflow_is_done(sk)) {
1092                 subflow->rx_eof = 1;
1093                 mptcp_subflow_eof(parent);
1094         }
1095 }
1096
1097 static int subflow_ulp_init(struct sock *sk)
1098 {
1099         struct inet_connection_sock *icsk = inet_csk(sk);
1100         struct mptcp_subflow_context *ctx;
1101         struct tcp_sock *tp = tcp_sk(sk);
1102         int err = 0;
1103
1104         /* disallow attaching ULP to a socket unless it has been
1105          * created with sock_create_kern()
1106          */
1107         if (!sk->sk_kern_sock) {
1108                 err = -EOPNOTSUPP;
1109                 goto out;
1110         }
1111
1112         ctx = subflow_create_ctx(sk, GFP_KERNEL);
1113         if (!ctx) {
1114                 err = -ENOMEM;
1115                 goto out;
1116         }
1117
1118         pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);
1119
1120         tp->is_mptcp = 1;
1121         ctx->icsk_af_ops = icsk->icsk_af_ops;
1122         icsk->icsk_af_ops = subflow_default_af_ops(sk);
1123         ctx->tcp_data_ready = sk->sk_data_ready;
1124         ctx->tcp_state_change = sk->sk_state_change;
1125         ctx->tcp_write_space = sk->sk_write_space;
1126         sk->sk_data_ready = subflow_data_ready;
1127         sk->sk_write_space = subflow_write_space;
1128         sk->sk_state_change = subflow_state_change;
1129 out:
1130         return err;
1131 }
1132
1133 static void subflow_ulp_release(struct sock *sk)
1134 {
1135         struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);
1136
1137         if (!ctx)
1138                 return;
1139
1140         if (ctx->conn)
1141                 sock_put(ctx->conn);
1142
1143         kfree_rcu(ctx, rcu);
1144 }
1145
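/* propagate the MPTCP state from the request socket to the newly accepted
 * subflow, or fall back to plain TCP when MPTCP was not negotiated
 */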
1146 static void subflow_ulp_clone(const struct request_sock *req,
1147                               struct sock *newsk,
1148                               const gfp_t priority)
1149 {
1150         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
1151         struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
1152         struct mptcp_subflow_context *new_ctx;
1153
1154         if (!tcp_rsk(req)->is_mptcp ||
1155             (!subflow_req->mp_capable && !subflow_req->mp_join)) {
1156                 subflow_ulp_fallback(newsk, old_ctx);
1157                 return;
1158         }
1159
1160         new_ctx = subflow_create_ctx(newsk, priority);
1161         if (!new_ctx) {
1162                 subflow_ulp_fallback(newsk, old_ctx);
1163                 return;
1164         }
1165
1166         new_ctx->conn_finished = 1;
1167         new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
1168         new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
1169         new_ctx->tcp_state_change = old_ctx->tcp_state_change;
1170         new_ctx->tcp_write_space = old_ctx->tcp_write_space;
1171         new_ctx->rel_write_seq = 1;
1172         new_ctx->tcp_sock = newsk;
1173
1174         if (subflow_req->mp_capable) {
1175                 /* see comments in subflow_syn_recv_sock(), MPTCP connection
1176                  * is fully established only after we receive the remote key
1177                  */
1178                 new_ctx->mp_capable = 1;
1179                 new_ctx->local_key = subflow_req->local_key;
1180                 new_ctx->token = subflow_req->token;
1181                 new_ctx->ssn_offset = subflow_req->ssn_offset;
1182                 new_ctx->idsn = subflow_req->idsn;
1183         } else if (subflow_req->mp_join) {
1184                 new_ctx->ssn_offset = subflow_req->ssn_offset;
1185                 new_ctx->mp_join = 1;
1186                 new_ctx->fully_established = 1;
1187                 new_ctx->backup = subflow_req->backup;
1188                 new_ctx->local_id = subflow_req->local_id;
1189                 new_ctx->token = subflow_req->token;
1190                 new_ctx->thmac = subflow_req->thmac;
1191         }
1192 }
1193
1194 static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
1195         .name           = "mptcp",
1196         .owner          = THIS_MODULE,
1197         .init           = subflow_ulp_init,
1198         .release        = subflow_ulp_release,
1199         .clone          = subflow_ulp_clone,
1200 };
1201
1202 static int subflow_ops_init(struct request_sock_ops *subflow_ops)
1203 {
1204         subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
1205         subflow_ops->slab_name = "request_sock_subflow";
1206
1207         subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
1208                                               subflow_ops->obj_size, 0,
1209                                               SLAB_ACCOUNT |
1210                                               SLAB_TYPESAFE_BY_RCU,
1211                                               NULL);
1212         if (!subflow_ops->slab)
1213                 return -ENOMEM;
1214
1215         subflow_ops->destructor = subflow_req_destructor;
1216
1217         return 0;
1218 }
1219
1220 void mptcp_subflow_init(void)
1221 {
1222         subflow_request_sock_ops = tcp_request_sock_ops;
1223         if (subflow_ops_init(&subflow_request_sock_ops) != 0)
1224                 panic("MPTCP: failed to init subflow request sock ops\n");
1225
1226         subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
1227         subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;
1228
1229         subflow_specific = ipv4_specific;
1230         subflow_specific.conn_request = subflow_v4_conn_request;
1231         subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
1232         subflow_specific.sk_rx_dst_set = subflow_finish_connect;
1233         subflow_specific.rebuild_header = subflow_rebuild_header;
1234
1235 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1236         subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
1237         subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;
1238
1239         subflow_v6_specific = ipv6_specific;
1240         subflow_v6_specific.conn_request = subflow_v6_conn_request;
1241         subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
1242         subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
1243         subflow_v6_specific.rebuild_header = subflow_rebuild_header;
1244
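        /* af_ops for IPv4-mapped IPv6 subflows: IPv6 socket, IPv4 transmit paths */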
1245         subflow_v6m_specific = subflow_v6_specific;
1246         subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
1247         subflow_v6m_specific.send_check = ipv4_specific.send_check;
1248         subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
1249         subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
1250         subflow_v6m_specific.net_frag_header_len = 0;
1251 #endif
1252
1253         mptcp_diag_subflow_init(&subflow_ulp_ops);
1254
1255         if (tcp_register_ulp(&subflow_ulp_ops) != 0)
1256                 panic("MPTCP: failed to register subflows to ULP\n");
1257 }