// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/algapi.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"

static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
				  enum linux_mptcp_mib_field field)
{
	MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}

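/* Hook on the connect path: allocate the MPTCP token (MP_CAPABLE) or the
 * local nonce and address id (MP_JOIN) before the underlying af_ops
 * rebuild_header() emits the SYN.
 */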
static int subflow_rebuild_header(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	int local_id, err = 0;

	if (subflow->request_mptcp && !subflow->token) {
		pr_debug("subflow=%p", sk);
		err = mptcp_token_new_connect(sk);
	} else if (subflow->request_join && !subflow->local_nonce) {
		struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn;

		pr_debug("subflow=%p", sk);

		do {
			get_random_bytes(&subflow->local_nonce, sizeof(u32));
		} while (!subflow->local_nonce);

		if (subflow->local_id)
			goto out;

		local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
		if (local_id < 0)
			return -EINVAL;

		subflow->local_id = local_id;
	}

out:
	if (err)
		return err;

	return subflow->icsk_af_ops->rebuild_header(sk);
}

static void subflow_req_destructor(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	pr_debug("subflow_req=%p", subflow_req);

	if (subflow_req->mp_capable)
		mptcp_token_destroy_request(subflow_req->token);
	tcp_request_sock_ops.destructor(req);
}

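/* Pack the two 32-bit nonces into an 8-byte message and compute the
 * MP_JOIN handshake HMAC over it, keyed by key1/key2.
 */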
static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
				  void *hmac)
{
	u8 msg[8];

	put_unaligned_be32(nonce1, &msg[0]);
	put_unaligned_be32(nonce2, &msg[4]);

	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}

/* validate received token and create truncated hmac and nonce for SYN-ACK */
static bool subflow_token_join_request(struct request_sock *req,
				       const struct sk_buff *skb)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	u8 hmac[MPTCPOPT_HMAC_LEN];
	struct mptcp_sock *msk;
	int local_id;

	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
		return false;
	}

	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
	if (local_id < 0) {
		sock_put((struct sock *)msk);
		return false;
	}
	subflow_req->local_id = local_id;

	get_random_bytes(&subflow_req->local_nonce, sizeof(u32));

	subflow_generate_hmac(msk->local_key, msk->remote_key,
			      subflow_req->local_nonce,
			      subflow_req->remote_nonce, hmac);

	subflow_req->thmac = get_unaligned_be64(hmac);

	sock_put((struct sock *)msk);
	return true;
}

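/* Parse MPTCP options on an incoming SYN and flag the request sock as
 * MP_CAPABLE and/or MP_JOIN; anything that does not validate simply
 * falls back to plain TCP.
 */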
static void subflow_init_req(struct request_sock *req,
			     const struct sock *sk_listener,
			     struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct tcp_options_received rx_opt;

	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);

	memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp));
	mptcp_get_options(skb, &rx_opt);

	subflow_req->mp_capable = 0;
	subflow_req->mp_join = 0;
	subflow_req->remote_key_valid = 0;

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
		return;
#endif

	if (rx_opt.mptcp.mp_capable) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

		if (rx_opt.mptcp.mp_join)
			return;
	} else if (rx_opt.mptcp.mp_join) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
	}

	if (rx_opt.mptcp.mp_capable && listener->request_mptcp) {
		int err;

		err = mptcp_token_new_request(req);
		if (err == 0)
			subflow_req->mp_capable = 1;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
	} else if (rx_opt.mptcp.mp_join && listener->request_mptcp) {
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
		subflow_req->mp_join = 1;
		subflow_req->backup = rx_opt.mptcp.backup;
		subflow_req->remote_id = rx_opt.mptcp.join_id;
		subflow_req->token = rx_opt.mptcp.token;
		subflow_req->remote_nonce = rx_opt.mptcp.nonce;
		pr_debug("token=%u, remote_nonce=%u", subflow_req->token,
			 subflow_req->remote_nonce);
		if (!subflow_token_join_request(req, skb)) {
			subflow_req->mp_join = 0;
			// @@ need to trigger RST
		}
	}
}

static void subflow_v4_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static void subflow_v6_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}
#endif

/* validate received truncated hmac and create hmac for third ACK */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
	u8 hmac[MPTCPOPT_HMAC_LEN];
	u64 thmac;

	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
			      subflow->remote_nonce, subflow->local_nonce,
			      hmac);

	thmac = get_unaligned_be64(hmac);
	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
		 subflow, subflow->token,
		 (unsigned long long)thmac,
		 (unsigned long long)subflow->thmac);

	return thmac == subflow->thmac;
}

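/* Client side: process the SYN-ACK, completing either the MP_CAPABLE or
 * the MP_JOIN handshake; an invalid truncated HMAC resets the subflow.
 */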
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

	if (inet_sk_state_load(parent) != TCP_ESTABLISHED) {
		inet_sk_state_store(parent, TCP_ESTABLISHED);
		parent->sk_state_change(parent);
	}

	if (subflow->conn_finished || !tcp_sk(sk)->is_mptcp)
		return;

	if (subflow->mp_capable) {
		pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
			 subflow->remote_key);
		mptcp_finish_connect(sk);
		subflow->conn_finished = 1;

		if (skb) {
			pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
		}
	} else if (subflow->mp_join) {
		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
			 subflow, subflow->thmac,
			 subflow->remote_nonce);
		if (!subflow_thmac_valid(subflow)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
			subflow->mp_join = 0;
			goto do_reset;
		}

		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
				      subflow->local_nonce,
				      subflow->remote_nonce,
				      subflow->hmac);

		if (skb)
			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;

		if (!mptcp_finish_join(sk))
			goto do_reset;

		subflow->conn_finished = 1;
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
	} else {
do_reset:
		tcp_send_active_reset(sk, GFP_ATOMIC);
		tcp_done(sk);
	}
}

static struct request_sock_ops subflow_request_sock_ops;
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;

static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&subflow_request_sock_ops,
				&subflow_request_sock_ipv4_ops,
				sk, skb);
drop:
	tcp_listendrop(sk);
	return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;

static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	if (skb->protocol == htons(ETH_P_IP))
		return subflow_v4_conn_request(sk, skb);

	if (!ipv6_unicast_destination(skb))
		goto drop;

	return tcp_conn_request(&subflow_request_sock_ops,
				&subflow_request_sock_ipv6_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0; /* don't send reset */
}
#endif

/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
			       const struct tcp_options_received *rx_opt)
{
	const struct mptcp_subflow_request_sock *subflow_req;
	u8 hmac[MPTCPOPT_HMAC_LEN];
	struct mptcp_sock *msk;
	bool ret;

	subflow_req = mptcp_subflow_rsk(req);
	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk)
		return false;

	subflow_generate_hmac(msk->remote_key, msk->local_key,
			      subflow_req->remote_nonce,
			      subflow_req->local_nonce, hmac);

	ret = true;
	if (crypto_memneq(hmac, rx_opt->mptcp.hmac, sizeof(hmac)))
		ret = false;

	sock_put((struct sock *)msk);
	return ret;
}

static void mptcp_sock_destruct(struct sock *sk)
{
	/* if new mptcp socket isn't accepted, it is freed
	 * from the tcp listener sockets request queue, linked
	 * from req->sk.  The tcp socket is released.
	 * This calls the ULP release function which will
	 * also remove the mptcp socket, via
	 * sock_put(ctx->conn).
	 *
	 * Problem is that the mptcp socket will not be in
	 * SYN_RECV state and doesn't have SOCK_DEAD flag.
	 * Both result in warnings from inet_sock_destruct.
	 */
	if (sk->sk_state == TCP_SYN_RECV) {
		sk->sk_state = TCP_CLOSE;
		WARN_ON_ONCE(sk->sk_socket);
		sock_orphan(sk);
	}

	inet_sock_destruct(sk);
}

static void mptcp_force_close(struct sock *sk)
{
	inet_sk_state_store(sk, TCP_CLOSE);
	sk_common_release(sk);
}

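/* Server side: handle the third ACK and create the child socket. An
 * MP_CAPABLE child takes ownership of a freshly cloned MPTCP socket,
 * an MP_JOIN child is attached to the existing socket owning the token;
 * TCP fallback is fatal only for the MP_JOIN case.
 */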
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req,
					  struct dst_entry *dst,
					  struct request_sock *req_unhash,
					  bool *own_req)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_request_sock *subflow_req;
	struct tcp_options_received opt_rx;
	bool fallback_is_fatal = false;
	struct sock *new_msk = NULL;
	struct sock *child;

	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);

	if (tcp_rsk(req)->is_mptcp == 0)
		goto create_child;

	/* if the sk is MP_CAPABLE, we try to fetch the client key */
	subflow_req = mptcp_subflow_rsk(req);
	if (subflow_req->mp_capable) {
		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
			/* here we can receive and accept an in-window,
			 * out-of-order pkt, which will not carry the MP_CAPABLE
			 * opt even on mptcp enabled paths
			 */
			goto create_msk;
		}

		opt_rx.mptcp.mp_capable = 0;
		mptcp_get_options(skb, &opt_rx);
		if (opt_rx.mptcp.mp_capable) {
			subflow_req->remote_key = opt_rx.mptcp.sndr_key;
			subflow_req->remote_key_valid = 1;
		} else {
			subflow_req->mp_capable = 0;
			goto create_child;
		}

create_msk:
		new_msk = mptcp_sk_clone(listener->conn, req);
		if (!new_msk)
			subflow_req->mp_capable = 0;
	} else if (subflow_req->mp_join) {
		fallback_is_fatal = true;
		opt_rx.mptcp.mp_join = 0;
		mptcp_get_options(skb, &opt_rx);
		if (!opt_rx.mptcp.mp_join ||
		    !subflow_hmac_valid(req, &opt_rx)) {
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
			return NULL;
		}
	}

create_child:
	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
						     req_unhash, own_req);

	if (child && *own_req) {
		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

		/* we have null ctx on TCP fallback, which is fatal on
		 * MPJ handshake
		 */
		if (!ctx) {
			if (fallback_is_fatal)
				goto close_child;
			goto out;
		}

		if (ctx->mp_capable) {
			/* new mpc subflow takes ownership of the newly
			 * created mptcp socket
			 */
			new_msk->sk_destruct = mptcp_sock_destruct;
			mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
			ctx->conn = new_msk;
			new_msk = NULL;
		} else if (ctx->mp_join) {
			struct mptcp_sock *owner;

			owner = mptcp_token_get_sock(ctx->token);
			if (!owner)
				goto close_child;

			ctx->conn = (struct sock *)owner;
			if (!mptcp_finish_join(child))
				goto close_child;

			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
		}
	}

out:
	/* dispose of the left over mptcp master, if any */
	if (unlikely(new_msk))
		mptcp_force_close(new_msk);
	return child;

close_child:
	tcp_send_active_reset(child, GFP_ATOMIC);
	inet_csk_prepare_forced_close(child);
	tcp_done(child);
	return NULL;
}

static struct inet_connection_sock_af_ops subflow_specific;

enum mapping_status {
	MAPPING_OK,
	MAPPING_INVALID,
	MAPPING_EMPTY,
	MAPPING_DATA_FIN
};

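/* Expand a 32-bit data sequence number to 64 bits, borrowing the upper
 * word from the expected next sequence (end of the old map + 1); assumes
 * the new map covers data not mapped yet.
 */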
static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
{
	if ((u32)seq == (u32)old_seq)
		return old_seq;

	/* Assume map covers data not mapped yet. */
	return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
}

static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
	WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
		  ssn, subflow->map_subflow_seq, subflow->map_data_len);
}

static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned int skb_consumed;

	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
	if (WARN_ON_ONCE(skb_consumed >= skb->len))
		return true;

	return skb->len - skb_consumed <= subflow->map_data_len -
					  mptcp_subflow_get_map_offset(subflow);
}

static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
		/* Mapping covers data later in the subflow stream,
		 * currently unsupported.
		 */
		warn_bad_map(subflow, ssn);
		return false;
	}
	if (unlikely(!before(ssn, subflow->map_subflow_seq +
				  subflow->map_data_len))) {
		/* Mapping covers only past subflow data, invalid */
		warn_bad_map(subflow, ssn + skb->len);
		return false;
	}
	return true;
}

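/* Extract and validate the DSS mapping carried by the skb at the head of
 * the receive queue and update the subflow mapping state. A valid
 * mapping can be replaced only by an identical one; caching a second
 * mapping is not supported.
 */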
static enum mapping_status get_mapping_status(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_ext *mpext;
	struct sk_buff *skb;
	u16 data_len;
	u64 map_seq;

	skb = skb_peek(&ssk->sk_receive_queue);
	if (!skb)
		return MAPPING_EMPTY;

	mpext = mptcp_get_ext(skb);
	if (!mpext || !mpext->use_map) {
		if (!subflow->map_valid && !skb->len) {
			/* the TCP stack delivers 0 len FIN pkts to the receive
			 * queue, those are the only 0len pkts ever expected
			 * here, and we can admit no mapping only for 0 len pkts
			 */
			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				WARN_ONCE(1, "0len seq %d:%d flags %x",
					  TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  TCP_SKB_CB(skb)->tcp_flags);
			sk_eat_skb(ssk, skb);
			return MAPPING_EMPTY;
		}

		if (!subflow->map_valid)
			return MAPPING_INVALID;

		goto validate_seq;
	}

	pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
		 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
		 mpext->data_len, mpext->data_fin);

	data_len = mpext->data_len;
	if (data_len == 0) {
		pr_err("Infinite mapping not handled");
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
		return MAPPING_INVALID;
	}

	if (mpext->data_fin == 1) {
		if (data_len == 1) {
			pr_debug("DATA_FIN with no payload");
			if (subflow->map_valid) {
				/* A DATA_FIN might arrive in a DSS
				 * option before the previous mapping
				 * has been fully consumed. Continue
				 * handling the existing mapping.
				 */
				skb_ext_del(skb, SKB_EXT_MPTCP);
				return MAPPING_OK;
			} else {
				return MAPPING_DATA_FIN;
			}
		}

		/* Adjust for DATA_FIN using 1 byte of sequence space */
		data_len--;
	}

	if (!mpext->dsn64) {
		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
				     mpext->data_seq);
		pr_debug("expanded seq=%llu", subflow->map_seq);
	} else {
		map_seq = mpext->data_seq;
	}

	if (subflow->map_valid) {
		/* Allow replacing only with an identical map */
		if (subflow->map_seq == map_seq &&
		    subflow->map_subflow_seq == mpext->subflow_seq &&
		    subflow->map_data_len == data_len) {
			skb_ext_del(skb, SKB_EXT_MPTCP);
			return MAPPING_OK;
		}

		/* If this skb data are fully covered by the current mapping,
		 * the new map would need caching, which is not supported
		 */
		if (skb_is_fully_mapped(ssk, skb)) {
			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
			return MAPPING_INVALID;
		}

		/* will validate the next map after consuming the current one */
		return MAPPING_OK;
	}

	subflow->map_seq = map_seq;
	subflow->map_subflow_seq = mpext->subflow_seq;
	subflow->map_data_len = data_len;
	subflow->map_valid = 1;
	subflow->mpc_map = mpext->mpc_map;
	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
		 subflow->map_seq, subflow->map_subflow_seq,
		 subflow->map_data_len);

validate_seq:
	/* we revalidate valid mapping on new skb, because we must ensure
	 * the current skb is completely covered by the available mapping
	 */
	if (!validate_mapping(ssk, skb))
		return MAPPING_INVALID;

	skb_ext_del(skb, SKB_EXT_MPTCP);
	return MAPPING_OK;
}

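/* tcp_read_sock() actor that copies nothing: it just consumes up to
 * desc->count bytes, so already-mapped data can be dropped from the
 * subflow receive queue.
 */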
static int subflow_read_actor(read_descriptor_t *desc,
			      struct sk_buff *skb,
			      unsigned int offset, size_t len)
{
	size_t copy_len = min(desc->count, len);

	desc->count -= copy_len;

	pr_debug("flushed %zu bytes, %zu left", copy_len, desc->count);
	return copy_len;
}

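/* Check whether in-sequence data (DSN-wise) is ready on this subflow,
 * discarding mapped data carrying an old or future DSN along the way;
 * fatal protocol errors reset the subflow.
 */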
static bool subflow_check_data_avail(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	enum mapping_status status;
	struct mptcp_sock *msk;
	struct sk_buff *skb;

	pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
		 subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
	if (subflow->data_avail)
		return true;

	msk = mptcp_sk(subflow->conn);
	for (;;) {
		u32 map_remaining;
		size_t delta;
		u64 ack_seq;
		u64 old_ack;

		status = get_mapping_status(ssk);
		pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
		if (status == MAPPING_INVALID) {
			ssk->sk_err = EBADMSG;
			goto fatal;
		}

		if (status != MAPPING_OK)
			return false;

		skb = skb_peek(&ssk->sk_receive_queue);
		if (WARN_ON_ONCE(!skb))
			return false;

		/* if msk lacks the remote key, this subflow must provide an
		 * MP_CAPABLE-based mapping
		 */
		if (unlikely(!READ_ONCE(msk->can_ack))) {
			if (!subflow->mpc_map) {
				ssk->sk_err = EBADMSG;
				goto fatal;
			}
			WRITE_ONCE(msk->remote_key, subflow->remote_key);
			WRITE_ONCE(msk->ack_seq, subflow->map_seq);
			WRITE_ONCE(msk->can_ack, true);
		}

		old_ack = READ_ONCE(msk->ack_seq);
		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
			 ack_seq);
		if (ack_seq == old_ack)
			break;

		/* only accept in-sequence mapping. Old values are spurious
		 * retransmission; we can hit "future" values on active backup
		 * subflow switch, we rely on retransmissions to get
		 * in-sequence data.
		 * Concurrent subflows support will require subflow data
		 * buffering
		 */
		map_remaining = subflow->map_data_len -
				mptcp_subflow_get_map_offset(subflow);
		if (before64(ack_seq, old_ack))
			delta = min_t(size_t, old_ack - ack_seq, map_remaining);
		else
			delta = min_t(size_t, ack_seq - old_ack, map_remaining);

		/* discard mapped data */
		pr_debug("discarding %zu bytes, current map len=%d", delta,
			 map_remaining);
		if (delta) {
			read_descriptor_t desc = {
				.count = delta,
			};
			int ret;

			ret = tcp_read_sock(ssk, &desc, subflow_read_actor);
			if (ret < 0) {
				ssk->sk_err = -ret;
				goto fatal;
			}
			if (ret < delta)
				return false;
			if (delta == map_remaining)
				subflow->map_valid = 0;
		}
	}
	return true;

fatal:
	/* fatal protocol error, close the socket */
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();
	ssk->sk_error_report(ssk);
	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	return false;
}

bool mptcp_subflow_data_available(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sk_buff *skb;

	/* check if current mapping is still valid */
	if (subflow->map_valid &&
	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
		subflow->map_valid = 0;
		subflow->data_avail = 0;

		pr_debug("Done with mapping: seq=%u data_len=%u",
			 subflow->map_subflow_seq,
			 subflow->map_data_len);
	}

	if (!subflow_check_data_avail(sk)) {
		subflow->data_avail = 0;
		return false;
	}

	skb = skb_peek(&sk->sk_receive_queue);
	subflow->data_avail = skb &&
		       before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
	return subflow->data_avail;
}

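/* Data-ready callback installed on every subflow: plain TCP (fallback)
 * subflows wake both the subflow and the parent socket directly, MPTCP
 * ones go through the msk-level data-ready machinery.
 */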
static void subflow_data_ready(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	if (!subflow->mp_capable && !subflow->mp_join) {
		subflow->tcp_data_ready(sk);

		parent->sk_data_ready(parent);
		return;
	}

	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
}

static void subflow_write_space(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	sk_stream_write_space(sk);
	if (sk_stream_is_writeable(sk)) {
		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
		smp_mb__after_atomic();
		/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
		sk_stream_write_space(parent);
	}
}

static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (sk->sk_family == AF_INET6)
		return &subflow_v6_specific;
#endif
	return &subflow_specific;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_connection_sock_af_ops *target;

	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

	if (likely(icsk->icsk_af_ops == target))
		return;

	subflow->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = target;
}
#endif

static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
				struct sockaddr_storage *addr)
{
	memset(addr, 0, sizeof(*addr));
	addr->ss_family = info->family;
	if (addr->ss_family == AF_INET) {
		struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;

		in_addr->sin_addr = info->addr;
		in_addr->sin_port = info->port;
	}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->ss_family == AF_INET6) {
		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;

		in6_addr->sin6_addr = info->addr6;
		in6_addr->sin6_port = info->port;
	}
#endif
}

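/* Create and connect an additional subflow towards @remote, bound to
 * @loc and @ifindex, and start the MP_JOIN handshake on it; on success
 * the new subflow is queued on the msk join_list.
 */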
int __mptcp_subflow_connect(struct sock *sk, int ifindex,
			    const struct mptcp_addr_info *loc,
			    const struct mptcp_addr_info *remote)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_subflow_context *subflow;
	struct sockaddr_storage addr;
	struct socket *sf;
	u32 remote_token;
	int addrlen;
	int err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	err = mptcp_subflow_create_socket(sk, &sf);
	if (err)
		return err;

	subflow = mptcp_subflow_ctx(sf->sk);
	subflow->remote_key = msk->remote_key;
	subflow->local_key = msk->local_key;
	subflow->token = msk->token;
	mptcp_info2sockaddr(loc, &addr);

	addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (loc->family == AF_INET6)
		addrlen = sizeof(struct sockaddr_in6);
#endif
	sf->sk->sk_bound_dev_if = ifindex;
	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
	if (err)
		goto failed;

	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
	pr_debug("msk=%p remote_token=%u", msk, remote_token);
	subflow->remote_token = remote_token;
	subflow->local_id = loc->id;
	subflow->request_join = 1;
	subflow->request_bkup = 1;
	mptcp_info2sockaddr(remote, &addr);

	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
	if (err && err != -EINPROGRESS)
		goto failed;

	spin_lock_bh(&msk->join_list_lock);
	list_add_tail(&subflow->node, &msk->join_list);
	spin_unlock_bh(&msk->join_list_lock);

	return err;

failed:
	sock_release(sf);
	return err;
}

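/* Create a kernel TCP socket and attach the "mptcp" ULP to it, so that
 * the new socket carries a mptcp_subflow_context tied back to @sk.
 */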
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
	struct mptcp_subflow_context *subflow;
	struct net *net = sock_net(sk);
	struct socket *sf;
	int err;

	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
			       &sf);
	if (err)
		return err;

	lock_sock(sf->sk);

	/* kernel sockets do not by default acquire net ref, but TCP timer
	 * needs it.
	 */
	sf->sk->sk_net_refcnt = 1;
	get_net(net);
#ifdef CONFIG_PROC_FS
	this_cpu_add(*net->core.sock_inuse, 1);
#endif
	err = tcp_set_ulp(sf->sk, "mptcp");
	release_sock(sf->sk);

	if (err)
		return err;

	subflow = mptcp_subflow_ctx(sf->sk);
	pr_debug("subflow=%p", subflow);

	*new_sock = sf;
	sock_hold(sk);
	subflow->conn = sk;

	return 0;
}

static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
							gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;

	ctx = kzalloc(sizeof(*ctx), priority);
	if (!ctx)
		return NULL;

	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
	INIT_LIST_HEAD(&ctx->node);

	pr_debug("subflow=%p", ctx);

	ctx->tcp_sock = sk;

	return ctx;
}

static void __subflow_state_change(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static bool subflow_is_done(const struct sock *sk)
{
	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}

static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	__subflow_state_change(sk);

	/* as recvmsg() does not acquire the subflow socket for ssk selection
	 * a FIN packet carrying a DSS can go unnoticed if we don't trigger
	 * the data available machinery here.
	 */
	if (subflow->mp_capable && mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);

	if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&
	    !subflow->rx_eof && subflow_is_done(sk)) {
		subflow->rx_eof = 1;
		mptcp_subflow_eof(parent);
	}
}

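/* ULP init: attach a subflow context to a kernel TCP socket and divert
 * its af_ops and event callbacks through the MPTCP hooks above.
 */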
static int subflow_ulp_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;

	/* disallow attaching ULP to a socket unless it has been
	 * created with sock_create_kern()
	 */
	if (!sk->sk_kern_sock) {
		err = -EOPNOTSUPP;
		goto out;
	}

	ctx = subflow_create_ctx(sk, GFP_KERNEL);
	if (!ctx) {
		err = -ENOMEM;
		goto out;
	}

	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);

	tp->is_mptcp = 1;
	ctx->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = subflow_default_af_ops(sk);
	ctx->tcp_data_ready = sk->sk_data_ready;
	ctx->tcp_state_change = sk->sk_state_change;
	ctx->tcp_write_space = sk->sk_write_space;
	sk->sk_data_ready = subflow_data_ready;
	sk->sk_write_space = subflow_write_space;
	sk->sk_state_change = subflow_state_change;
out:
	return err;
}

static void subflow_ulp_release(struct sock *sk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);

	if (!ctx)
		return;

	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}

static void subflow_ulp_fallback(struct sock *sk,
				 struct mptcp_subflow_context *old_ctx)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	mptcp_subflow_tcp_fallback(sk, old_ctx);
	icsk->icsk_ulp_ops = NULL;
	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
	tcp_sk(sk)->is_mptcp = 0;
}

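/* Called when a request sock graduates to a full socket: inherit the
 * listener subflow context, or fall back to plain TCP when the
 * handshake did not negotiate MPTCP.
 */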
static void subflow_ulp_clone(const struct request_sock *req,
			      struct sock *newsk,
			      const gfp_t priority)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
	struct mptcp_subflow_context *new_ctx;

	if (!tcp_rsk(req)->is_mptcp ||
	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx = subflow_create_ctx(newsk, priority);
	if (!new_ctx) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx->conn_finished = 1;
	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
	new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
	new_ctx->tcp_write_space = old_ctx->tcp_write_space;
	new_ctx->rel_write_seq = 1;
	new_ctx->tcp_sock = newsk;

	if (subflow_req->mp_capable) {
		/* see comments in subflow_syn_recv_sock(), MPTCP connection
		 * is fully established only after we receive the remote key
		 */
		new_ctx->mp_capable = 1;
		new_ctx->fully_established = subflow_req->remote_key_valid;
		new_ctx->can_ack = subflow_req->remote_key_valid;
		new_ctx->remote_key = subflow_req->remote_key;
		new_ctx->local_key = subflow_req->local_key;
		new_ctx->token = subflow_req->token;
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->idsn = subflow_req->idsn;
	} else if (subflow_req->mp_join) {
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->mp_join = 1;
		new_ctx->fully_established = 1;
		new_ctx->backup = subflow_req->backup;
		new_ctx->local_id = subflow_req->local_id;
		new_ctx->token = subflow_req->token;
		new_ctx->thmac = subflow_req->thmac;
	}
}

static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
	.name		= "mptcp",
	.owner		= THIS_MODULE,
	.init		= subflow_ulp_init,
	.release	= subflow_ulp_release,
	.clone		= subflow_ulp_clone,
};

static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
	subflow_ops->slab_name = "request_sock_subflow";

	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
					      subflow_ops->obj_size, 0,
					      SLAB_ACCOUNT |
					      SLAB_TYPESAFE_BY_RCU,
					      NULL);
	if (!subflow_ops->slab)
		return -ENOMEM;

	subflow_ops->destructor = subflow_req_destructor;

	return 0;
}

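/* One-time setup: clone the TCP request sock and af_ops tables, patch in
 * the MPTCP-specific handlers and register the subflow ULP; any failure
 * here is fatal.
 */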
void mptcp_subflow_init(void)
{
	subflow_request_sock_ops = tcp_request_sock_ops;
	if (subflow_ops_init(&subflow_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow request sock ops\n");

	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
	subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;

	subflow_specific = ipv4_specific;
	subflow_specific.conn_request = subflow_v4_conn_request;
	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_specific.sk_rx_dst_set = subflow_finish_connect;
	subflow_specific.rebuild_header = subflow_rebuild_header;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
	subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;

	subflow_v6_specific = ipv6_specific;
	subflow_v6_specific.conn_request = subflow_v6_conn_request;
	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
	subflow_v6_specific.rebuild_header = subflow_rebuild_header;

	subflow_v6m_specific = subflow_v6_specific;
	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
	subflow_v6m_specific.send_check = ipv4_specific.send_check;
	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
	subflow_v6m_specific.net_frag_header_len = 0;
#endif

	mptcp_diag_subflow_init(&subflow_ulp_ops);

	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
		panic("MPTCP: failed to register subflows to ULP\n");
}