1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol(TCP).
9 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
84 #include <trace/events/tcp.h>
// Forward declaration of the MD5 header-hash helper. It is defined further
// down in this file but is called earlier by the RST and ACK senders. Only
// present when CONFIG_TCP_MD5SIG is enabled.
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
// Hash-table state passed to the established and listener socket lookups in
// this file; exported so code outside this file can use it as well.
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
// Return the value of secure_tcp_seq() computed from the IP and TCP header
// fields of the incoming segment @skb.
// NOTE(review): only the destination address and source port arguments are
// visible here; confirm the full argument list against secure_tcp_seq().
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
96 return secure_tcp_seq(ip_hdr(skb)->daddr,
99 tcp_hdr(skb)->source);
// Return the timestamp offset produced by secure_tcp_ts_off() for namespace
// @net, keyed on the IPv4 destination and source addresses of @skb.
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
104 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
// Decide whether the TIME-WAIT socket @sktw can be taken over by the new
// socket @sk.
// @sk:   socket that wants to reuse the address/port pair
// @sktw: the TIME-WAIT socket currently occupying it
// @twp:  when NULL, the tcp_tw_reuse sysctl and the timestamp age test are
//        bypassed (see the condition on tw_ts_recent_stamp below)
// On the reuse path, and unless @sk is under TCP repair, write_seq and the
// ts_recent fields of @sk are seeded from the TIME-WAIT socket.
// NOTE(review): the return statements are not visible in this excerpt;
// confirm the return convention with the callers.
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 struct tcp_sock *tp = tcp_sk(sk);
// Per-namespace tcp_tw_reuse setting; it gates the timestamp age test below.
112 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
115 /* Still does not detect *everything* that goes through
116 * lo, since we require a loopback src or dst address
117 * or direct binding to 'lo' interface.
119 bool loopback = false;
// Loopback test 1: the TIME-WAIT socket is bound to the loopback device.
120 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
122 #if IS_ENABLED(CONFIG_IPV6)
// Loopback test 2: IPv6 socket whose peer or local address is the IPv6
// loopback address or a v4-mapped loopback address.
123 if (tw->tw_family == AF_INET6) {
124 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
// Loopback test 3: IPv4 peer or local address is a loopback address.
132 if (ipv4_is_loopback(tw->tw_daddr) ||
133 ipv4_is_loopback(tw->tw_rcv_saddr))
140 /* With PAWS, it is safe from the viewpoint
141 of data integrity. Even without PAWS it is safe provided sequence
142 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
144 Actually, the idea is close to VJ's one, only timestamp cache is
145 held not per host, but per port pair and TW bucket is used as state
148 If TW bucket has been already destroyed we fall back to VJ's scheme
149 and use initial timestamp retrieved from peer table.
// Reuse requires a recorded peer timestamp. With a non-NULL @twp it also
// requires reuse to be enabled and the current second to be later than the
// second in which that timestamp was recorded.
151 if (tcptw->tw_ts_recent_stamp &&
152 (!twp || (reuse && time_after32(ktime_get_seconds(),
153 tcptw->tw_ts_recent_stamp)))) {
154 /* In case of repair and re-using TIME-WAIT sockets we still
155 * want to be sure that it is safe as above but honor the
156 * sequence numbers and time stamps set as part of the repair
159 * Without this check re-using a TIME-WAIT socket with TCP
160 * repair would accumulate a -1 on the repair assigned
161 * sequence number. The first time it is reused the sequence
162 * is -1, the second time -2, etc. This fixes that issue
163 * without appearing to create any others.
165 if (likely(!tp->repair)) {
// Start the new sequence space past the last sequence sent by the old
// connection. NOTE(review): 65535 is presumably the largest unscaled
// window - confirm.
166 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
170 WRITE_ONCE(tp->write_seq, seq);
// Inherit the peer timestamp state from the TIME-WAIT socket.
171 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
172 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
// Pre-connect hook for IPv4 TCP sockets.
// Tests @addr_len against sizeof(struct sockaddr_in), asserts through
// sock_owned_by_me() that the caller owns the socket lock, then runs the
// cgroup BPF INET4 connect program on @uaddr and returns its result.
// NOTE(review): the statement taken when the length is too short is not
// visible here; presumably an error return - confirm.
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 /* This check is replicated from tcp_v4_connect() and intended to
186 * prevent BPF program called below from accessing bytes that are out
187 * of the bound specified by user in addr_len.
189 if (addr_len < sizeof(struct sockaddr_in))
192 sock_owned_by_me(sk);
194 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 /* This will initiate an outgoing connection. */
// Start an active open on @sk toward the IPv4 address in @uaddr.
// @sk:       socket to connect; the rcu_dereference_protected() call below
//            relies on the socket lock being held
// @uaddr:    destination, interpreted as struct sockaddr_in
// @addr_len: length of @uaddr as supplied by the caller
// Visible steps: validate the address, resolve a route (honouring an srr IP
// option), record source and destination on the socket, move to SYN-SENT
// and hash the socket with inet_hash_connect(), re-route with the final
// ports, choose the initial sequence number and timestamp offset unless in
// repair mode, then call tcp_connect() unless fast open defers it.
// Returns -EAFNOSUPPORT when sin_family is not AF_INET.
// NOTE(review): the other return statements and the error labels are not
// visible in this excerpt; confirm them before relying on this summary.
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
200 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 struct inet_sock *inet = inet_sk(sk);
202 struct tcp_sock *tp = tcp_sk(sk);
203 __be16 orig_sport, orig_dport;
204 __be32 daddr, nexthop;
208 struct ip_options_rcu *inet_opt;
209 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
211 if (addr_len < sizeof(struct sockaddr_in))
214 if (usin->sin_family != AF_INET)
215 return -EAFNOSUPPORT;
// By default the route lookup targets the user-supplied destination.
217 nexthop = daddr = usin->sin_addr.s_addr;
218 inet_opt = rcu_dereference_protected(inet->inet_opt,
219 lockdep_sock_is_held(sk));
// With an srr option present, route toward opt.faddr instead.
220 if (inet_opt && inet_opt->opt.srr) {
223 nexthop = inet_opt->opt.faddr;
// Remember the ports used for the first lookup so that the route can be
// refreshed with ip_route_newports() once the final source port is known.
226 orig_sport = inet->inet_sport;
227 orig_dport = usin->sin_port;
228 fl4 = &inet->cork.fl.u.ip4;
229 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
232 orig_sport, orig_dport, sk);
// A failed lookup with -ENETUNREACH is counted as an output no-route event.
235 if (err == -ENETUNREACH)
236 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
// NOTE(review): the body of this multicast/broadcast test is not visible;
// presumably such routes are rejected - confirm.
240 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
245 if (!inet_opt || !inet_opt->opt.srr)
// Adopt the source address chosen by routing if none was set yet.
248 if (!inet->inet_saddr)
249 inet->inet_saddr = fl4->saddr;
250 sk_rcv_saddr_set(sk, inet->inet_saddr);
// Timestamp state left over from a previous peer must not leak into a
// connection to a different destination.
252 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 /* Reset inherited state */
254 tp->rx_opt.ts_recent = 0;
255 tp->rx_opt.ts_recent_stamp = 0;
256 if (likely(!tp->repair))
257 WRITE_ONCE(tp->write_seq, 0);
260 inet->inet_dport = usin->sin_port;
261 sk_daddr_set(sk, daddr);
// Extension header length is zero, or the IP option length when options
// are present (the guarding condition is not visible here).
263 inet_csk(sk)->icsk_ext_hdr_len = 0;
265 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
267 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
269 /* Socket identity is still unknown (sport may be zero).
270 * However we set state to SYN-SENT and not releasing socket
271 * lock select source port, enter ourselves into the hash tables and
272 * complete initialization after this.
274 tcp_set_state(sk, TCP_SYN_SENT);
275 err = inet_hash_connect(tcp_death_row, sk);
// Refresh the route now that the final source and destination ports are set.
281 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 inet->inet_sport, inet->inet_dport, sk);
288 /* OK, now commit destination to socket. */
289 sk->sk_gso_type = SKB_GSO_TCPV4;
290 sk_setup_caps(sk, &rt->dst);
// Sequence number and timestamp offset are generated here only when the
// socket is not under TCP repair.
293 if (likely(!tp->repair)) {
295 WRITE_ONCE(tp->write_seq,
296 secure_tcp_seq(inet->inet_saddr,
300 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
// Random starting value for the IP identification field of this socket.
305 inet->inet_id = prandom_u32();
// Fast open may postpone sending the SYN; err is filled in by the helper.
307 if (tcp_fastopen_defer_connect(sk, &err))
312 err = tcp_connect(sk);
321 * This unhashes the socket and releases the local port,
// Failure path: back to CLOSE and forget the route capabilities and
// destination port recorded above.
324 tcp_set_state(sk, TCP_CLOSE);
326 sk->sk_route_caps = 0;
327 inet->inet_dport = 0;
330 EXPORT_SYMBOL(tcp_v4_connect);
333 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334 * It can be called through tcp_release_cb() if socket was owned by user
335 * at the time tcp_v4_err() was called to handle ICMP message.
// Apply a reduced path MTU, read from tcp_sk(sk)->mtu_info, to @sk.
// LISTEN and CLOSE sockets are singled out by the state test below.
// NOTE(review): the statement guarded by that test is not visible;
// presumably an early return - confirm.
// The cached route is updated through inet_csk_update_pmtu(). EMSGSIZE is
// recorded as a soft error when the new MTU is below the route MTU and
// ip_dont_fragment() holds. When pmtudisc is not IP_PMTUDISC_DONT,
// ip_sk_accept_pmtu() agrees and the current icsk_pmtu_cookie exceeds the
// new MTU, the MSS is resynchronised and the data is retransmitted at once.
337 void tcp_v4_mtu_reduced(struct sock *sk)
339 struct inet_sock *inet = inet_sk(sk);
340 struct dst_entry *dst;
343 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
// mtu_info is written with WRITE_ONCE() by the ICMP handler, hence READ_ONCE().
345 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
346 dst = inet_csk_update_pmtu(sk, mtu);
350 /* Something is about to be wrong... Remember soft error
351 * for the case, if this connection will not able to recover.
353 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 sk->sk_err_soft = EMSGSIZE;
358 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359 ip_sk_accept_pmtu(sk) &&
360 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 tcp_sync_mss(sk, mtu);
363 /* Resend the TCP packet because it's
364 * clear that the old packet has been
365 * dropped. This is the new "fast" path mtu
368 tcp_simple_retransmit(sk);
369 } /* else let the usual retransmit timer handle it */
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
// Pass the ICMP redirect in @skb to the redirect operation of the route
// currently cached on @sk, as returned by __sk_dst_check().
// NOTE(review): a check that dst is non-NULL presumably sits between these
// two lines but is not visible here - confirm.
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
375 struct dst_entry *dst = __sk_dst_check(sk, 0);
378 dst->ops->redirect(dst, sk, skb);
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
// Handle an ICMP error that matched a request socket.
// @sk:    the request socket, passed as struct sock and converted with
//         inet_reqsk()
// @seq:   sequence number to compare against the snt_isn of the request
// @abort: NOTE(review): presumably selects whether the request is dropped;
//         the test on it is not visible here - confirm
// A @seq that differs from snt_isn is counted as LINUX_MIB_OUTOFWINDOWICMPS.
// On the drop path the request is removed from the queue of its listener
// and a listen drop is accounted on that listener.
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
385 struct request_sock *req = inet_reqsk(sk);
386 struct net *net = sock_net(sk);
388 /* ICMPs are not backlogged, hence we cannot get
389 * an established socket here.
391 if (seq != tcp_rsk(req)->snt_isn) {
392 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
395 * Still in SYN_RECV, just remove it silently.
396 * There is no good way to pass the error to the newly
397 * created socket, and POSIX does not want network
398 * errors returned from accept().
400 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401 tcp_listendrop(req->rsk_listener);
405 EXPORT_SYMBOL(tcp_req_err);
407 /* TCP-LD (RFC 6069) logic */
// Undo one step of retransmission timer backoff for @sk in response to an
// ICMP that refers to sequence number @seq.
// Guards: socket owned by user context, @seq not equal to snd_una, no
// retransmits outstanding, and an empty retransmit queue (with a one-time
// warning).
// NOTE(review): the guarded statements and the rest of the second condition
// are not visible here; presumably early returns - confirm.
// Otherwise icsk_backoff is decremented and icsk_rto recomputed, from srtt
// when one exists and from TCP_TIMEOUT_INIT if not, with the remaining
// backoff applied and capped at TCP_RTO_MAX. The retransmit timer is then
// re-armed for the time still left, or the retransmit handler is run
// directly.
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
410 struct inet_connection_sock *icsk = inet_csk(sk);
411 struct tcp_sock *tp = tcp_sk(sk);
416 if (sock_owned_by_user(sk))
419 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
423 skb = tcp_rtx_queue_head(sk);
424 if (WARN_ON_ONCE(!skb))
427 icsk->icsk_backoff--;
428 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
// delta_us: microseconds between the timestamp of the head of the
// retransmit queue and the refreshed clock.
431 tcp_mstamp_refresh(tp);
432 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
// remaining: what is left of the new RTO after that elapsed time.
433 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
436 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 remaining, TCP_RTO_MAX);
439 /* RTO revert clocked out retransmission.
440 * Will retransmit now.
442 tcp_retransmit_timer(sk);
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
448 * This routine is called by the ICMP module when it gets some
449 * sort of error condition. If err < 0 then the socket should
450 * be closed and the error returned to the user. If err > 0
451 * it's just the icmp type << 8 | icmp code. After adjustment
452 * header points to the first 8 bytes of the tcp header. We need
453 * to find the appropriate port.
455 * The locking strategy used here is very "optimistic". When
456 * someone else accesses the socket the ICMP is just dropped
457 * and for some paths there is no check at all.
458 * A more general error queue to queue errors for later handling
459 * is probably better.
// ICMP error handler for TCP over IPv4.
// @skb:  the ICMP packet; skb->data is read as the IPv4 header of the
//        offending segment, with its TCP header ihl*4 bytes further on
// @info: extra value from the ICMP; stored as the new MTU for
//        ICMP_FRAG_NEEDED and handed to ip_icmp_error()
// The socket is looked up from the addresses and ports of the embedded
// segment, then the ICMP type and code decide the action. TIME-WAIT and
// request sockets are handled first. For other sockets the embedded
// sequence number must lie between snd_una and snd_nxt unless the socket is
// listening; otherwise the ICMP is counted as out of window.
// NOTE(review): many statements (returns, gotos, labels, the switch header
// and several case labels) are not visible in this excerpt; the comments
// below describe only what is shown.
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
465 const struct iphdr *iph = (const struct iphdr *)skb->data;
466 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
468 struct inet_sock *inet;
469 const int type = icmp_hdr(skb)->type;
470 const int code = icmp_hdr(skb)->code;
472 struct request_sock *fastopen;
475 struct net *net = dev_net(skb->dev);
// Look the socket up by the address and port fields of the embedded headers.
477 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 th->dest, iph->saddr, ntohs(th->source),
481 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
// TIME-WAIT socket: the reference taken by the lookup is released.
484 if (sk->sk_state == TCP_TIME_WAIT) {
485 inet_twsk_put(inet_twsk(sk));
488 seq = ntohl(th->seq);
// Request socket: the abort argument is true for PARAMETERPROB,
// TIME_EXCEEDED and for net or host unreachable.
489 if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 type == ICMP_TIME_EXCEEDED ||
492 (type == ICMP_DEST_UNREACH &&
493 (code == ICMP_NET_UNREACH ||
494 code == ICMP_HOST_UNREACH)));
499 /* If too many ICMPs get dropped on busy
500 * servers this needs to be solved differently.
501 * We do take care of PMTU discovery (RFC1191) special case :
502 * we can receive locally generated ICMP messages while socket is held.
// While the socket is owned by user context, every ICMP except
// FRAG_NEEDED is counted as dropped because of the lock.
504 if (sock_owned_by_user(sk)) {
505 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
508 if (sk->sk_state == TCP_CLOSE)
// Minimum TTL filter; only evaluated when the ip4_min_ttl static branch is on.
511 if (static_branch_unlikely(&ip4_min_ttl)) {
512 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
513 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
514 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
520 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
// With a fast open request pending, the lower bound of the window check is
// the snt_isn of that request instead of snd_una.
521 fastopen = rcu_dereference(tp->fastopen_rsk);
522 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
523 if (sk->sk_state != TCP_LISTEN &&
524 !between(seq, snd_una, tp->snd_nxt)) {
525 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
// Redirects are only acted on when the socket is not owned by user context.
531 if (!sock_owned_by_user(sk))
532 do_redirect(skb, sk);
534 case ICMP_SOURCE_QUENCH:
535 /* Just silently ignore these. */
537 case ICMP_PARAMETERPROB:
540 case ICMP_DEST_UNREACH:
541 if (code > NR_ICMP_UNREACH)
544 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
545 /* We are not interested in TCP_LISTEN and open_requests
546 * (SYN-ACKs send out by Linux are always <576bytes so
547 * they should go through unfragmented).
549 if (sk->sk_state == TCP_LISTEN)
// Record the MTU, then apply it now, or set the deferred flag so it is
// applied later when the socket is currently owned by user context.
552 WRITE_ONCE(tp->mtu_info, info);
553 if (!sock_owned_by_user(sk)) {
554 tcp_v4_mtu_reduced(sk);
556 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
// Translate the unreachable code into an errno through icmp_err_convert.
562 err = icmp_err_convert[code].errno;
563 /* check if this ICMP message allows revert of backoff.
567 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
568 tcp_ld_RTO_revert(sk, seq);
570 case ICMP_TIME_EXCEEDED:
// Second dispatch, on the socket state, decides how err is reported.
577 switch (sk->sk_state) {
580 /* Only in fast or simultaneous open. If a fast open socket is
581 * already accepted it is treated as a connected one below.
583 if (fastopen && !fastopen->sk)
586 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
588 if (!sock_owned_by_user(sk)) {
595 sk->sk_err_soft = err;
600 /* If we've already connected we will keep trying
601 * until we time out, or the user gives up.
603 * rfc1122 4.2.3.9 allows to consider as hard errors
604 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
605 * but it is obsoleted by pmtu discovery).
607 * Note, that in modern internet, where routing is unreliable
608 * and in each dark corner broken firewalls sit, sending random
609 * errors ordered by their masters even this two messages finally lose
610 * their original sense (even Linux sends invalid PORT_UNREACHs)
612 * Now we are in compliance with RFCs.
// The first branch requires the socket to be free of user context and
// recverr to be set; otherwise err is only kept as a soft error.
617 if (!sock_owned_by_user(sk) && inet->recverr) {
620 } else { /* Only an error on timeout */
621 sk->sk_err_soft = err;
// Prepare @skb for deferred TCP checksumming.
// @saddr/@daddr: IPv4 addresses fed to tcp_v4_check()
// Stores the complemented result of tcp_v4_check() in the check field, and
// sets csum_start and csum_offset to the transport header offset and the
// offset of the check field.
// NOTE(review): the checksum is presumably completed later by hardware or
// the stack from csum_start and csum_offset - confirm.
630 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
632 struct tcphdr *th = tcp_hdr(skb);
634 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
635 skb->csum_start = skb_transport_header(skb) - skb->head;
636 skb->csum_offset = offsetof(struct tcphdr, check);
639 /* This routine computes an IPv4 TCP checksum. */
// Socket-level wrapper: sets up the checksum fields of @skb using the source
// and destination addresses stored on @sk.
640 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
642 const struct inet_sock *inet = inet_sk(sk);
644 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
646 EXPORT_SYMBOL(tcp_v4_send_check);
649 * This routine will send an RST to the other tcp.
651 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
653 * Answer: if a packet caused RST, it is not for a socket
654 * existing in our system, if it is matched to a socket,
655 * it is just duplicate segment or bug in other side's TCP.
656 * So we build the reply based only on the parameters that
657 * arrived with the segment.
658 * Exception: precedence violation. We do not implement it in any case.
// Size in bytes of the option area of the RST reply: room for the aligned
// MD5 signature option when MD5 support is built in, one 32-bit word if not.
661 #ifdef CONFIG_TCP_MD5SIG
662 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
664 #define OPTION_BYTES sizeof(__be32)
// Build and send a RST in reply to @skb.
// @sk:  socket the segment matched, or NULL; both cases are tested below
// @skb: the offending segment; its ports are swapped and the reply sequence
//       numbers are derived from its header
// With CONFIG_TCP_MD5SIG an MD5 signature option is appended when a key is
// found. When opt[0] is still zero afterwards, the value returned by
// mptcp_reset_option() is considered as an extra option.
// The reply goes out through the per-cpu control socket with
// ip_send_unicast_reply() and is counted in OUTSEGS and OUTRSTS.
// NOTE(review): several guards, returns and labels are not visible in this
// excerpt; the comments below describe only what is shown.
667 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
669 const struct tcphdr *th = tcp_hdr(skb);
672 __be32 opt[OPTION_BYTES / sizeof(__be32)];
674 struct ip_reply_arg arg;
675 #ifdef CONFIG_TCP_MD5SIG
676 struct tcp_md5sig_key *key = NULL;
677 const __u8 *hash_location = NULL;
678 unsigned char newhash[16];
680 struct sock *sk1 = NULL;
682 u64 transmit_time = 0;
686 /* Never send a reset in response to a reset. */
690 /* If sk not NULL, it means we did a successful lookup and incoming
691 * route had to be correct. prequeue might have dropped our dst.
693 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
696 /* Swap the send and the receive. */
697 memset(&rep, 0, sizeof(rep));
698 rep.th.dest = th->source;
699 rep.th.source = th->dest;
700 rep.th.doff = sizeof(struct tcphdr) / 4;
// Reply sequence number: the acknowledgment number of the incoming segment.
// NOTE(review): the condition selecting between this and the ack_seq form
// below is not visible here.
704 rep.th.seq = th->ack_seq;
// Reply acknowledgment number: incoming seq plus the sequence space the
// segment occupies (SYN and FIN flags plus payload length).
707 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
708 skb->len - (th->doff << 2));
711 memset(&arg, 0, sizeof(arg));
712 arg.iov[0].iov_base = (unsigned char *)&rep;
713 arg.iov[0].iov_len = sizeof(rep.th);
// Namespace comes from the socket when there is one, else from the device
// of the route attached to the segment.
715 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
716 #ifdef CONFIG_TCP_MD5SIG
718 hash_location = tcp_parse_md5sig_option(th);
// Case 1: a full socket is available, so look the key up on it directly.
719 if (sk && sk_fullsock(sk)) {
720 const union tcp_md5_addr *addr;
723 /* sdif set, means packet ingressed via a device
724 * in an L3 domain and inet_iif is set to it.
726 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
727 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
728 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
// Case 2: no full socket but the segment carries an MD5 option, so a
// listener is looked up and its key table consulted.
729 } else if (hash_location) {
730 const union tcp_md5_addr *addr;
731 int sdif = tcp_v4_sdif(skb);
732 int dif = inet_iif(skb);
736 * active side is lost. Try to find listening socket through
737 * source port, and then find md5 key through listening socket.
738 * we are not loose security here:
739 * Incoming packet is checked with md5 hash with finding key,
740 * no RST generated if md5 hash doesn't match.
742 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
744 th->source, ip_hdr(skb)->daddr,
745 ntohs(th->source), dif, sdif);
746 /* don't send rst if it can't find key */
750 /* sdif set, means packet ingressed via a device
751 * in an L3 domain and dif is set to it.
753 l3index = sdif ? dif : 0;
754 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
755 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
// Recompute the hash of the incoming segment with the key that was found
// and compare it with the 16 bytes carried in the segment.
760 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
761 if (genhash || memcmp(hash_location, newhash, 16) != 0)
// Emit the MD5 option header word; the signature itself is written into
// opt[1] onward by tcp_v4_md5_hash_hdr() below.
767 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
769 (TCPOPT_MD5SIG << 8) |
771 /* Update length and the length the header thinks exists */
772 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
773 rep.th.doff = arg.iov[0].iov_len / 4;
775 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
776 key, ip_hdr(skb)->saddr,
777 ip_hdr(skb)->daddr, &rep.th);
780 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
781 if (rep.opt[0] == 0) {
782 __be32 mrst = mptcp_reset_option(skb);
786 arg.iov[0].iov_len += sizeof(mrst);
787 rep.th.doff = arg.iov[0].iov_len / 4;
// Pseudo-header checksum seed for the reply; the addresses are those of the
// incoming segment, so they are reversed for the outgoing direction.
791 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
792 ip_hdr(skb)->saddr, /* XXX */
793 arg.iov[0].iov_len, IPPROTO_TCP, 0);
794 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
795 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
797 /* When socket is gone, all binding information is lost.
798 * routing might fail in this case. No choice here, if we choose to force
799 * input interface, we will misroute in case of asymmetric route.
802 arg.bound_dev_if = sk->sk_bound_dev_if;
804 trace_tcp_send_reset(sk, skb);
// Compile-time check that the bound device field sits at the same offset in
// struct sock and struct inet_timewait_sock, so the read above is valid for
// both kinds of socket.
807 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
808 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
810 arg.tos = ip_hdr(skb)->tos;
811 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
// Per-cpu control socket of this namespace, used to transmit the reply.
813 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
// Mark and priority are copied from the TIME-WAIT fields or from the
// socket fields, depending on the socket state.
815 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
816 inet_twsk(sk)->tw_mark : sk->sk_mark;
817 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
818 inet_twsk(sk)->tw_priority : sk->sk_priority;
819 transmit_time = tcp_transmit_time(sk);
821 ip_send_unicast_reply(ctl_sk,
822 skb, &TCP_SKB_CB(skb)->header.h4.opt,
823 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
824 &arg, arg.iov[0].iov_len,
828 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
829 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
832 #ifdef CONFIG_TCP_MD5SIG
838 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
839 outside socket context is ugly, certainly. What can I do?
// Build and send a bare ACK in reply to @skb without a full socket context.
// @sk:          socket supplying namespace, mark and priority
// @skb:         incoming segment; its ports are swapped for the reply
// @seq/@ack:    sequence and acknowledgment numbers of the reply
// @win:         window value, written to the header with htons()
// @tsval/@tsecr: timestamp option values; a non-zero @tsecr also shifts the
//               MD5 option to start at option word 3
// @oif:         stored in arg.bound_dev_if
// @key:         MD5 key used to sign the reply header, when MD5 is built in
// @reply_flags: stored in arg.flags
// @tos:         NOTE(review): its use is not visible in this excerpt
// The reply goes out through the per-cpu control socket and is counted in
// OUTSEGS.
842 static void tcp_v4_send_ack(const struct sock *sk,
843 struct sk_buff *skb, u32 seq, u32 ack,
844 u32 win, u32 tsval, u32 tsecr, int oif,
845 struct tcp_md5sig_key *key,
846 int reply_flags, u8 tos)
848 const struct tcphdr *th = tcp_hdr(skb);
// Option area sized for the aligned timestamp option plus, when MD5 is
// built in, the aligned MD5 option.
851 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
852 #ifdef CONFIG_TCP_MD5SIG
853 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
857 struct net *net = sock_net(sk);
858 struct ip_reply_arg arg;
862 memset(&rep.th, 0, sizeof(struct tcphdr));
863 memset(&arg, 0, sizeof(arg));
865 arg.iov[0].iov_base = (unsigned char *)&rep;
866 arg.iov[0].iov_len = sizeof(rep.th);
// Timestamp option: two NOPs for alignment, then kind, length and values.
868 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
869 (TCPOPT_TIMESTAMP << 8) |
871 rep.opt[1] = htonl(tsval);
872 rep.opt[2] = htonl(tsecr);
873 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
876 /* Swap the send and the receive. */
877 rep.th.dest = th->source;
878 rep.th.source = th->dest;
879 rep.th.doff = arg.iov[0].iov_len / 4;
880 rep.th.seq = htonl(seq);
881 rep.th.ack_seq = htonl(ack);
883 rep.th.window = htons(win);
885 #ifdef CONFIG_TCP_MD5SIG
// The MD5 option starts after the three timestamp words when a timestamp
// option was emitted, else at the start of the option area.
887 int offset = (tsecr) ? 3 : 0;
889 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
891 (TCPOPT_MD5SIG << 8) |
893 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
894 rep.th.doff = arg.iov[0].iov_len/4;
896 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
897 key, ip_hdr(skb)->saddr,
898 ip_hdr(skb)->daddr, &rep.th);
901 arg.flags = reply_flags;
// Pseudo-header checksum seed; the addresses come from the incoming segment
// and are reversed for the outgoing direction.
902 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
903 ip_hdr(skb)->saddr, /* XXX */
904 arg.iov[0].iov_len, IPPROTO_TCP, 0);
905 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
907 arg.bound_dev_if = oif;
909 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
// Per-cpu control socket; mark and priority come from the TIME-WAIT fields
// or the socket fields, depending on the socket state.
911 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
912 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
913 inet_twsk(sk)->tw_mark : sk->sk_mark;
914 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
915 inet_twsk(sk)->tw_priority : sk->sk_priority;
916 transmit_time = tcp_transmit_time(sk);
917 ip_send_unicast_reply(ctl_sk,
918 skb, &TCP_SKB_CB(skb)->header.h4.opt,
919 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
920 &arg, arg.iov[0].iov_len,
924 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
// Send an ACK on behalf of the TIME-WAIT socket @sk in reply to @skb.
// The ACK uses the saved snd_nxt and rcv_nxt, the saved receive window
// shifted right by the saved window scale, the raw TCP timestamp plus the
// saved timestamp offset, the MD5 key of the TIME-WAIT socket, and the
// no-source-check reply flag when the socket was transparent.
// NOTE(review): some arguments of the call are not visible in this excerpt.
928 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
930 struct inet_timewait_sock *tw = inet_twsk(sk);
931 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
933 tcp_v4_send_ack(sk, skb,
934 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
935 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
936 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
939 tcp_twsk_md5_key(tcptw),
940 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
// Send an ACK for the request socket @req in reply to @skb.
// @sk: the socket the request hangs off; its state selects the sequence
//      number (snt_isn + 1 for a listener; the other branch is not visible
//      in this excerpt)
// The ACK uses rcv_nxt of the request, the request window shifted right by
// its window scale, the raw TCP timestamp plus the request timestamp
// offset, the MD5 key looked up for the source address of @skb, and the
// no-source-check reply flag when the request asks for it.
947 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
948 struct request_sock *req)
950 const union tcp_md5_addr *addr;
953 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
954 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
956 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
960 * The window field (SEG.WND) of every outgoing segment, with the
961 * exception of <SYN> segments, MUST be right-shifted by
962 * Rcv.Wind.Shift bits:
// MD5 lookup key: source address of the segment, plus the incoming
// interface index when tcp_v4_sdif() is non-zero.
964 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
965 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
966 tcp_v4_send_ack(sk, skb, seq,
967 tcp_rsk(req)->rcv_nxt,
968 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
969 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
972 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
973 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
978 * Send a SYN-ACK after having received a SYN.
979 * This still operates on a request_sock only, not on a big
// Build a SYN-ACK for request @req and transmit it.
// @dst: route to use; when NULL one is obtained with inet_csk_route_req()
// @foc, @synack_type, @syn_skb: passed through to tcp_make_synack()
// The segment is checksummed with the local and remote addresses of the
// request. With sysctl_tcp_reflect_tos set, the TOS is the TOS of the SYN
// without its ECN bits, combined with the ECN bits of the listener TOS.
// ECT(0) is added when the TOS is not ECN capable and
// tcp_bpf_ca_needs_ecn() holds for the request. The result of
// ip_build_and_send_pkt() is passed through net_xmit_eval().
// NOTE(review): the failure paths and the final return are not visible here.
982 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
984 struct request_sock *req,
985 struct tcp_fastopen_cookie *foc,
986 enum tcp_synack_type synack_type,
987 struct sk_buff *syn_skb)
989 const struct inet_request_sock *ireq = inet_rsk(req);
995 /* First, grab a route. */
996 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
999 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1002 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1004 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1005 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1006 (inet_sk(sk)->tos & INET_ECN_MASK) :
1009 if (!INET_ECN_is_capable(tos) &&
1010 tcp_bpf_ca_needs_ecn((struct sock *)req))
1011 tos |= INET_ECN_ECT_0;
1014 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1016 rcu_dereference(ireq->ireq_opt),
1019 err = net_xmit_eval(err);
1026 * IPv4 request_sock destructor.
// Free the IP options attached to the request socket @req.
// The constant 1 passed to rcu_dereference_protected() tells lockdep that
// no lock needs to be checked for this access.
1028 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1030 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1033 #ifdef CONFIG_TCP_MD5SIG
1035 * RFC2385 MD5 checksumming requires a mapping of
1036 * IP address->MD5 Key.
1037 * We need to maintain these in the sk structure.
// Static branch key, initially false, exported for use outside this file.
// NOTE(review): presumably switched on once the first MD5 key is
// configured so that sockets without keys skip the lookup - confirm.
1040 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1041 EXPORT_SYMBOL(tcp_md5_needed);
// Decide whether candidate key @new is a better match than the current best
// @old. A key with a non-zero l3index takes precedence over one without;
// between keys of the same kind, the longer prefix wins.
// NOTE(review): the return statements of the l3index branches, and any
// handling of a NULL @old, are not visible in this excerpt - confirm.
1043 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1048 /* l3index always overrides non-l3index */
1049 if (old->l3index && new->l3index == 0)
1051 if (old->l3index == 0 && new->l3index)
1054 return old->prefixlen < new->prefixlen;
1057 /* Find the Key structure for an address. */
// Find the best matching MD5 key on @sk for peer address @addr.
// @l3index: L3 domain index; keys flagged TCP_MD5SIG_FLAG_IFINDEX must
//           carry the same l3index to be considered
// @addr:    peer address, read as IPv4 or IPv6 according to the family
// Walks the key list under RCU or the socket lock, skips keys of another
// family, compares addresses under the prefix length of each key, and keeps
// the candidate preferred by better_md5_match().
// NOTE(review): the final return is not visible; presumably best_match.
1058 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1059 const union tcp_md5_addr *addr,
1062 const struct tcp_sock *tp = tcp_sk(sk);
1063 struct tcp_md5sig_key *key;
1064 const struct tcp_md5sig_info *md5sig;
1066 struct tcp_md5sig_key *best_match = NULL;
1069 /* caller either holds rcu_read_lock() or socket lock */
1070 md5sig = rcu_dereference_check(tp->md5sig_info,
1071 lockdep_sock_is_held(sk));
1075 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1076 lockdep_sock_is_held(sk)) {
1077 if (key->family != family)
1079 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
// IPv4: compare both addresses under the netmask built from the prefix
// length of the key.
1081 if (family == AF_INET) {
1082 mask = inet_make_mask(key->prefixlen);
1083 match = (key->addr.a4.s_addr & mask) ==
1084 (addr->a4.s_addr & mask);
1085 #if IS_ENABLED(CONFIG_IPV6)
1086 } else if (family == AF_INET6) {
1087 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1094 if (match && better_md5_match(best_match, key))
1099 EXPORT_SYMBOL(__tcp_md5_do_lookup);
// Find the MD5 key on @sk whose family, IFINDEX flag, l3index, address bytes
// and prefix length all equal the given values.
// Unlike __tcp_md5_do_lookup() this is an exact comparison: the address is
// compared with memcmp() over the size of the address family and the prefix
// length must be equal, not merely covering.
// NOTE(review): the return statements are not visible in this excerpt.
1101 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1102 const union tcp_md5_addr *addr,
1103 int family, u8 prefixlen,
1104 int l3index, u8 flags)
1106 const struct tcp_sock *tp = tcp_sk(sk);
1107 struct tcp_md5sig_key *key;
// Number of address bytes to compare; IPv4 size unless overridden below.
1108 unsigned int size = sizeof(struct in_addr);
1109 const struct tcp_md5sig_info *md5sig;
1111 /* caller either holds rcu_read_lock() or socket lock */
1112 md5sig = rcu_dereference_check(tp->md5sig_info,
1113 lockdep_sock_is_held(sk));
1116 #if IS_ENABLED(CONFIG_IPV6)
1117 if (family == AF_INET6)
1118 size = sizeof(struct in6_addr);
1120 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1121 lockdep_sock_is_held(sk)) {
1122 if (key->family != family)
// Both keys must agree on whether they are bound to an interface index.
1124 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1126 if (key->l3index != l3index)
1128 if (!memcmp(&key->addr, addr, size) &&
1129 key->prefixlen == prefixlen)
// Look up the MD5 key on @sk for the peer described by @addr_sk.
// The L3 domain index is derived from the bound device of @addr_sk and the
// peer address is its sk_daddr; the lookup itself is tcp_md5_do_lookup()
// for AF_INET.
1135 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1136 const struct sock *addr_sk)
1138 const union tcp_md5_addr *addr;
1141 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1142 addr_sk->sk_bound_dev_if);
1143 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1144 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1146 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1148 /* This can be called on a newly created socket, from other files */
// Add an MD5 key for peer @addr on @sk, or update the existing exact match.
// @family, @prefixlen, @l3index, @flags: identify the key; the same values
//     are handed to tcp_md5_do_lookup_exact()
// @newkey, @newkeylen: key bytes and their length
// @gfp: allocation flags for the md5sig_info and key allocations
// When an exact match exists, only its key bytes and keylen are rewritten
// in place. Otherwise md5sig_info is allocated and published on the visible
// path, a zeroed key is allocated with sock_kmalloc(), the MD5 pool is
// allocated, the key fields are filled in and the key is linked at the head
// of the list with hlist_add_head_rcu().
// NOTE(review): the return statements and the conditions guarding the
// allocations are not visible in this excerpt.
1149 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1150 int family, u8 prefixlen, int l3index, u8 flags,
1151 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1153 /* Add Key to the list */
1154 struct tcp_md5sig_key *key;
1155 struct tcp_sock *tp = tcp_sk(sk);
1156 struct tcp_md5sig_info *md5sig;
1158 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1160 /* Pre-existing entry - just update that one.
1161 * Note that the key might be used concurrently.
1162 * data_race() is telling kcsan that we do not care of
1163 * key mismatches, since changing MD5 key on live flows
1164 * can lead to packet drops.
1166 data_race(memcpy(key->key, newkey, newkeylen));
1168 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1169 * Also note that a reader could catch new key->keylen value
1170 * but old key->key[], this is the reason we use __GFP_ZERO
1171 * at sock_kmalloc() time below these lines.
1173 WRITE_ONCE(key->keylen, newkeylen);
1178 md5sig = rcu_dereference_protected(tp->md5sig_info,
1179 lockdep_sock_is_held(sk));
1181 md5sig = kmalloc(sizeof(*md5sig), gfp);
1186 INIT_HLIST_HEAD(&md5sig->head);
1187 rcu_assign_pointer(tp->md5sig_info, md5sig);
1190 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
// If the MD5 pool cannot be allocated, the key that was just allocated is
// given back to the socket memory accounting.
1193 if (!tcp_alloc_md5sig_pool()) {
1194 sock_kfree_s(sk, key, sizeof(*key));
1198 memcpy(key->key, newkey, newkeylen);
1199 key->keylen = newkeylen;
1200 key->family = family;
1201 key->prefixlen = prefixlen;
1202 key->l3index = l3index;
// Copy as many address bytes as the address family uses.
1204 memcpy(&key->addr, addr,
1205 (family == AF_INET6) ? sizeof(struct in6_addr) :
1206 sizeof(struct in_addr));
1207 hlist_add_head_rcu(&key->node, &md5sig->head);
1210 EXPORT_SYMBOL(tcp_md5_do_add);
// Remove the MD5 key on @sk that exactly matches the given address, family,
// prefix length, l3index and flags.
// The key is unlinked with hlist_del_rcu(), its size is subtracted from
// sk_omem_alloc, and the memory is released through kfree_rcu().
// NOTE(review): the handling of a failed lookup and the return values are
// not visible in this excerpt.
1212 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1213 u8 prefixlen, int l3index, u8 flags)
1215 struct tcp_md5sig_key *key;
1217 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1220 hlist_del_rcu(&key->node);
1221 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1222 kfree_rcu(key, rcu);
1225 EXPORT_SYMBOL(tcp_md5_do_del);
// Remove every MD5 key from @sk.
// Each key is unlinked, its size is subtracted from sk_omem_alloc and its
// memory is released through kfree_rcu(). The safe list iterator is used
// because entries are removed during the walk.
1227 static void tcp_clear_md5_list(struct sock *sk)
1229 struct tcp_sock *tp = tcp_sk(sk);
1230 struct tcp_md5sig_key *key;
1231 struct hlist_node *n;
1232 struct tcp_md5sig_info *md5sig;
// The constant 1 tells lockdep that no lock needs to be checked here.
1234 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1236 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1237 hlist_del_rcu(&key->node);
1238 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1239 kfree_rcu(key, rcu);
// Parse the TCP_MD5SIG or TCP_MD5SIG_EXT option payload in @optval and apply
// it to @sk.
// @optname: option being set; the extended fields are only honoured for
//           TCP_MD5SIG_EXT
// @optval:  source of a struct tcp_md5sig
// @optlen:  must be at least sizeof(struct tcp_md5sig)
// After copying the structure and testing that the address family is
// AF_INET, the flags, prefix length and l3index are derived. For a given
// ifindex the device is looked up and its index is used only when it is an
// L3 master. A zero key length deletes the key; otherwise the key is added.
// Key lengths above TCP_MD5SIG_MAXKEYLEN are singled out by the test below.
// NOTE(review): the error returns are not visible in this excerpt.
1243 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1244 sockptr_t optval, int optlen)
1246 struct tcp_md5sig cmd;
1247 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1248 const union tcp_md5_addr *addr;
1253 if (optlen < sizeof(cmd))
1256 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1259 if (sin->sin_family != AF_INET)
// Only the IFINDEX bit of the user flags is kept.
1262 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1264 if (optname == TCP_MD5SIG_EXT &&
1265 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1266 prefixlen = cmd.tcpm_prefixlen;
1271 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1272 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1273 struct net_device *dev;
1276 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1277 if (dev && netif_is_l3_master(dev))
1278 l3index = dev->ifindex;
1282 /* ok to reference set/not set outside of rcu;
1283 * right now device MUST be an L3 master
1285 if (!dev || !l3index)
1289 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1291 if (!cmd.tcpm_keylen)
1292 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1294 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1297 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1298 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
/* Feed the IPv4 pseudo-header and the fixed TCP header into the MD5
 * request owned by @hp.
 *
 * The pseudo-header (struct tcp4_pseudohdr) gets IPPROTO_TCP as protocol
 * and @nbytes as length; a copy of the sizeof(*th) bytes of @th is placed
 * directly behind it, so TCP options are not part of this update.
 * Both pieces are hashed with a single crypto_ahash_update() call whose
 * return value is passed back to the caller.
 */
1301 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1302 __be32 daddr, __be32 saddr,
1303 const struct tcphdr *th, int nbytes)
1305 struct tcp4_pseudohdr *bp;
1306 struct scatterlist sg;
1313 bp->protocol = IPPROTO_TCP;
1314 bp->len = cpu_to_be16(nbytes);
1316 _th = (struct tcphdr *)(bp + 1);
1317 memcpy(_th, th, sizeof(*th));
1320 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1321 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1322 sizeof(*bp) + sizeof(*th));
1323 return crypto_ahash_update(hp->md5_req);
/* Compute the MD5 signature for a bare TCP header (no payload) into
 * @md5_hash using @key.
 *
 * Sequence: take the MD5 pool with tcp_get_md5sig_pool(), initialise the
 * hash, add pseudo-header + TCP header (length th->doff << 2) through
 * tcp_v4_md5_hash_headers(), add the key, then finalise into @md5_hash.
 * The pool is released with tcp_put_md5sig_pool() on both the success and
 * the failure path; on failure the 16-byte output buffer is zeroed.
 */
1326 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1327 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1329 struct tcp_md5sig_pool *hp;
1330 struct ahash_request *req;
1332 hp = tcp_get_md5sig_pool();
1334 goto clear_hash_noput;
1337 if (crypto_ahash_init(req))
1339 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1341 if (tcp_md5_hash_key(hp, key))
1343 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1344 if (crypto_ahash_final(req))
1347 tcp_put_md5sig_pool();
1351 tcp_put_md5sig_pool();
1353 memset(md5_hash, 0, 16);
/* Compute the MD5 signature of a whole segment (@skb) into @md5_hash
 * using @key.
 *
 * When @sk is given (established or request socket) the addresses are
 * sk->sk_rcv_saddr and sk->sk_daddr; otherwise the IP header of @skb is
 * consulted.  The hash covers, in order: pseudo-header + TCP header with
 * skb->len as length, the skb data starting after the TCP header
 * (th->doff << 2), and the key.
 * The MD5 pool is released on every path that acquired it; on failure the
 * 16-byte output buffer is zeroed.
 */
1357 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1358 const struct sock *sk,
1359 const struct sk_buff *skb)
1361 struct tcp_md5sig_pool *hp;
1362 struct ahash_request *req;
1363 const struct tcphdr *th = tcp_hdr(skb);
1364 __be32 saddr, daddr;
1366 if (sk) { /* valid for establish/request sockets */
1367 saddr = sk->sk_rcv_saddr;
1368 daddr = sk->sk_daddr;
1370 const struct iphdr *iph = ip_hdr(skb);
1375 hp = tcp_get_md5sig_pool();
1377 goto clear_hash_noput;
1380 if (crypto_ahash_init(req))
1383 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1385 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1387 if (tcp_md5_hash_key(hp, key))
1389 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1390 if (crypto_ahash_final(req))
1393 tcp_put_md5sig_pool();
1397 tcp_put_md5sig_pool();
1399 memset(md5_hash, 0, 16);
1402 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
/* Check the TCP MD5 signature option of an incoming segment against the
 * key configured on @sk for the sender (iph->saddr).
 *
 * The key lookup is scoped by l3index, which is @dif when @sdif is set
 * and 0 otherwise.  Outcomes that bump a counter:
 *  - key configured but no option in the segment: LINUX_MIB_TCPMD5NOTFOUND
 *  - option present but no key configured:        LINUX_MIB_TCPMD5UNEXPECTED
 *  - hash computation failed or the 16-byte digest differs:
 *    LINUX_MIB_TCPMD5FAILURE plus a rate-limited log message.
 * Callers in tcp_v4_rcv() discard the segment when this returns true.
 */
1406 /* Called with rcu_read_lock() */
1407 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1408 const struct sk_buff *skb,
1411 #ifdef CONFIG_TCP_MD5SIG
1413 * This gets called for each TCP segment that arrives
1414 * so we want to be efficient.
1415 * We have 3 drop cases:
1416 * o No MD5 hash and one expected.
1417 * o MD5 hash and we're not expecting one.
1418 * o MD5 hash and its wrong.
1420 const __u8 *hash_location = NULL;
1421 struct tcp_md5sig_key *hash_expected;
1422 const struct iphdr *iph = ip_hdr(skb);
1423 const struct tcphdr *th = tcp_hdr(skb);
1424 const union tcp_md5_addr *addr;
1425 unsigned char newhash[16];
1426 int genhash, l3index;
1428 /* sdif set, means packet ingressed via a device
1429 * in an L3 domain and dif is set to the l3mdev
1431 l3index = sdif ? dif : 0;
1433 addr = (union tcp_md5_addr *)&iph->saddr;
1434 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1435 hash_location = tcp_parse_md5sig_option(th);
1437 /* We've parsed the options - do we have a hash? */
1438 if (!hash_expected && !hash_location)
1441 if (hash_expected && !hash_location) {
1442 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1446 if (!hash_expected && hash_location) {
1447 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1451 /* Okay, so this is hash_expected and hash_location -
1452 * so we need to calculate the checksum.
1454 genhash = tcp_v4_md5_hash_skb(newhash,
1458 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1459 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1460 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1461 &iph->saddr, ntohs(th->source),
1462 &iph->daddr, ntohs(th->dest),
1463 genhash ? " tcp_v4_calc_md5_hash failed"
/* Fill the IPv4-specific fields of a fresh request sock from the
 * incoming segment @skb: the local address is the IP destination, the
 * remote address is the IP source, and the IP options returned by
 * tcp_v4_save_options() are stored in ireq->ireq_opt.
 */
1472 static void tcp_v4_init_req(struct request_sock *req,
1473 const struct sock *sk_listener,
1474 struct sk_buff *skb)
1476 struct inet_request_sock *ireq = inet_rsk(req);
1477 struct net *net = sock_net(sk_listener);
1479 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1480 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1481 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
/* route_req callback for IPv4: initialise @req with tcp_v4_init_req(),
 * consult the security_inet_conn_request() hook, and return the route
 * computed by inet_csk_route_req() for the IPv4 flow in @fl.
 */
1484 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1485 struct sk_buff *skb,
1487 struct request_sock *req)
1489 tcp_v4_init_req(req, sk, skb);
1491 if (security_inet_conn_request(sk, skb, req))
1494 return inet_csk_route_req(sk, &fl->u.ip4, req);
/* Address-family level request_sock operations for IPv4 TCP: object size
 * (struct tcp_request_sock) plus the callbacks used to retransmit a
 * SYN-ACK, send an ACK or a reset for a request, destroy the request and
 * handle the SYN-ACK timeout.  Passed to tcp_conn_request() by
 * tcp_v4_conn_request().
 */
1497 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1499 .obj_size = sizeof(struct tcp_request_sock),
1500 .rtx_syn_ack = tcp_rtx_synack,
1501 .send_ack = tcp_v4_reqsk_send_ack,
1502 .destructor = tcp_v4_reqsk_destructor,
1503 .send_reset = tcp_v4_send_reset,
1504 .syn_ack_timeout = tcp_syn_ack_timeout,
/* TCP-level request sock operations for IPv4: default MSS clamp, MD5
 * lookup/hash helpers (CONFIG_TCP_MD5SIG), syncookie sequence generation
 * (CONFIG_SYN_COOKIES), and the IPv4 implementations of request routing,
 * initial sequence number, timestamp offset and SYN-ACK transmission.
 */
1507 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1508 .mss_clamp = TCP_MSS_DEFAULT,
1509 #ifdef CONFIG_TCP_MD5SIG
1510 .req_md5_lookup = tcp_v4_md5_lookup,
1511 .calc_md5_hash = tcp_v4_md5_hash_skb,
1513 #ifdef CONFIG_SYN_COOKIES
1514 .cookie_init_seq = cookie_v4_init_sequence,
1516 .route_req = tcp_v4_route_req,
1517 .init_seq = tcp_v4_init_seq,
1518 .init_ts_off = tcp_v4_init_ts_off,
1519 .send_synack = tcp_v4_send_synack,
/* Entry point for an incoming IPv4 connection request on listener @sk.
 * Segments whose route is flagged broadcast or multicast are filtered out
 * first; everything else is handed to the generic tcp_conn_request() with
 * the IPv4 request sock operation tables.
 */
1522 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1524 /* Never answer to SYNs sent to broadcast or multicast */
1525 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1528 return tcp_conn_request(&tcp_request_sock_ops,
1529 &tcp_request_sock_ipv4_ops, sk, skb);
1535 EXPORT_SYMBOL(tcp_v4_conn_request);
1539 * The three way handshake has completed - we got a valid synack -
1540 * now create the new socket.
/* Build the child socket for request @req on listener @sk.
 *
 * Visible steps:
 *  - the listener's accept queue is checked with sk_acceptq_is_full();
 *  - the child is created with tcp_create_openreq_child() and receives
 *    SKB_GSO_TCPV4, the rx dst of @skb, the addresses and bound device
 *    from the request, the saved IP options, and multicast index/TTL and
 *    received TOS taken from @skb;
 *  - with sysctl_tcp_reflect_tos the child's TOS follows the SYN's TOS
 *    with the ECN bits masked out;
 *  - a route may be obtained with inet_csk_route_child_sock(); it is
 *    used for sk_setup_caps(), congestion control setup and MSS
 *    computation;
 *  - with CONFIG_TCP_MD5SIG the listener's key for the peer is looked up
 *    and copied to the child as a /32 key (GFP_ATOMIC), and GSO is
 *    disabled on the child;
 *  - the child inherits the listener's port and is inserted with
 *    inet_ehash_nolisten(); *own_req tells whether this caller owns the
 *    new socket.  On ownership the saved SYN is moved to the child and
 *    ireq->ireq_opt is cleared, since the options now belong to the
 *    child.
 * Failure paths count LINUX_MIB_LISTENOVERFLOWS and/or force-close the
 * half-built child with inet_csk_prepare_forced_close().
 */
1542 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1543 struct request_sock *req,
1544 struct dst_entry *dst,
1545 struct request_sock *req_unhash,
1548 struct inet_request_sock *ireq;
1549 bool found_dup_sk = false;
1550 struct inet_sock *newinet;
1551 struct tcp_sock *newtp;
1553 #ifdef CONFIG_TCP_MD5SIG
1554 const union tcp_md5_addr *addr;
1555 struct tcp_md5sig_key *key;
1558 struct ip_options_rcu *inet_opt;
1560 if (sk_acceptq_is_full(sk))
1563 newsk = tcp_create_openreq_child(sk, req, skb);
1567 newsk->sk_gso_type = SKB_GSO_TCPV4;
1568 inet_sk_rx_dst_set(newsk, skb);
1570 newtp = tcp_sk(newsk);
1571 newinet = inet_sk(newsk);
1572 ireq = inet_rsk(req);
1573 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1574 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1575 newsk->sk_bound_dev_if = ireq->ir_iif;
1576 newinet->inet_saddr = ireq->ir_loc_addr;
1577 inet_opt = rcu_dereference(ireq->ireq_opt);
1578 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1579 newinet->mc_index = inet_iif(skb);
1580 newinet->mc_ttl = ip_hdr(skb)->ttl;
1581 newinet->rcv_tos = ip_hdr(skb)->tos;
1582 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1584 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1585 newinet->inet_id = prandom_u32();
1587 /* Set ToS of the new socket based upon the value of incoming SYN.
1588 * ECT bits are set later in tcp_init_transfer().
1590 if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1591 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1594 dst = inet_csk_route_child_sock(sk, newsk, req);
1598 /* syncookie case : see end of cookie_v4_check() */
1600 sk_setup_caps(newsk, dst);
1602 tcp_ca_openreq_child(newsk, dst);
1604 tcp_sync_mss(newsk, dst_mtu(dst));
1605 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1607 tcp_initialize_rcv_mss(newsk);
1609 #ifdef CONFIG_TCP_MD5SIG
1610 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1611 /* Copy over the MD5 key from the original socket */
1612 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1613 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1616 * We're using one, so create a matching key
1617 * on the newsk structure. If we fail to get
1618 * memory, then we end up not copying the key
1621 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1622 key->key, key->keylen, GFP_ATOMIC);
1623 sk_gso_disable(newsk);
1627 if (__inet_inherit_port(sk, newsk) < 0)
1629 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1631 if (likely(*own_req)) {
1632 tcp_move_syn(newtp, req);
1633 ireq->ireq_opt = NULL;
1635 newinet->inet_opt = NULL;
1637 if (!req_unhash && found_dup_sk) {
1638 /* This code path should only be executed in the
1639 * syncookie case only
1641 bh_unlock_sock(newsk);
1649 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1656 newinet->inet_opt = NULL;
1657 inet_csk_prepare_forced_close(newsk);
1661 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
/* Syncookie hook for a listener: with CONFIG_SYN_COOKIES the segment is
 * passed to cookie_v4_check() and its result replaces @sk.  Used by
 * tcp_v4_do_rcv() for sockets in TCP_LISTEN.
 */
1663 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1665 #ifdef CONFIG_SYN_COOKIES
1666 const struct tcphdr *th = tcp_hdr(skb);
1669 sk = cookie_v4_check(sk, skb);
/* Produce a syncookie for the SYN described by @iph/@th on listener @sk.
 * With CONFIG_SYN_COOKIES the MSS comes from tcp_get_syncookie_mss(), the
 * cookie from __cookie_v4_init_sequence() is stored in *@cookie, and
 * tcp_synq_overflow() is called on @sk.
 */
1674 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1675 struct tcphdr *th, u32 *cookie)
1678 #ifdef CONFIG_SYN_COOKIES
1679 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1680 &tcp_request_sock_ipv4_ops, sk, th);
1682 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1683 tcp_synq_overflow(sk);
1689 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1691 /* The socket must have its spinlock held when we get
1692 * here, unless it is a TCP_LISTEN socket.
1694 * We have a potential double-lock case here, so even when
1695 * doing backlog processing we use the BH locking scheme.
1696 * This is because we cannot sleep with the original spinlock
/* Deliver @skb to @sk according to the socket state.
 *
 * TCP_ESTABLISHED (fast path): the cached input route sk->sk_rx_dst is
 * checked against skb->skb_iif and dst->ops->check and cleared when it no
 * longer applies; the segment then goes to tcp_rcv_established().
 * Other states: the checksum is completed first.  A TCP_LISTEN socket
 * runs the syncookie check and may hand the segment to a child through
 * tcp_child_process(); everything else goes through
 * tcp_rcv_state_process().  A failure in either leads to
 * tcp_v4_send_reset().
 * Checksum failures are traced and counted in TCP_MIB_CSUMERRORS and
 * TCP_MIB_INERRS.
 */
1699 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1703 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1704 struct dst_entry *dst;
1706 dst = rcu_dereference_protected(sk->sk_rx_dst,
1707 lockdep_sock_is_held(sk));
1709 sock_rps_save_rxhash(sk, skb);
1710 sk_mark_napi_id(sk, skb);
1712 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1713 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1715 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1719 tcp_rcv_established(sk, skb);
1723 if (tcp_checksum_complete(skb))
1726 if (sk->sk_state == TCP_LISTEN) {
1727 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1732 if (tcp_child_process(sk, nsk, skb)) {
1739 sock_rps_save_rxhash(sk, skb);
1741 if (tcp_rcv_state_process(sk, skb)) {
1748 tcp_v4_send_reset(rsk, skb);
1751 /* Be careful here. If this function gets more complicated and
1752 * gcc suffers from register pressure on the x86, sk (in %ebx)
1753 * might be destroyed here. This current version compiles correctly,
1754 * but you have been warned.
1759 trace_tcp_bad_csum(skb);
1760 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1761 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1764 EXPORT_SYMBOL(tcp_v4_do_rcv);
/* Early demux for IPv4 TCP: try to find the established socket for @skb
 * before routing.
 *
 * Only PACKET_HOST packets with a pullable TCP header and a data offset
 * of at least sizeof(struct tcphdr) / 4 words are considered.  The socket
 * is looked up with __inet_lookup_established(); for a hit, sock_edemux
 * becomes the skb destructor and, for a full socket, the cached
 * sk->sk_rx_dst is revalidated with dst_check() and attached to the skb
 * without taking a reference (skb_dst_set_noref()) when
 * sk_rx_dst_ifindex matches skb->skb_iif.
 */
1766 int tcp_v4_early_demux(struct sk_buff *skb)
1768 const struct iphdr *iph;
1769 const struct tcphdr *th;
1772 if (skb->pkt_type != PACKET_HOST)
1775 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1781 if (th->doff < sizeof(struct tcphdr) / 4)
1784 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1785 iph->saddr, th->source,
1786 iph->daddr, ntohs(th->dest),
1787 skb->skb_iif, inet_sdif(skb));
1790 skb->destructor = sock_edemux;
1791 if (sk_fullsock(sk)) {
1792 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1795 dst = dst_check(dst, 0);
1797 sk->sk_rx_dst_ifindex == skb->skb_iif)
1798 skb_dst_set_noref(skb, dst);
/* Put @skb on the backlog of @sk; tcp_v4_rcv() calls this when the socket
 * is owned by user context and discards the skb when true is returned.
 *
 * Steps visible here:
 *  1. Complete the checksum; failures are traced and counted
 *     (TCP_MIB_CSUMERRORS, TCP_MIB_INERRS).
 *  2. Try to coalesce with the backlog tail.  This requires contiguous
 *     sequence space, equal DS field, no SYN/RST/URG on either segment,
 *     ACK on both, identical ECE/CWR bits, equal "decrypted" state
 *     (CONFIG_TLS_DEVICE), equal data offset and byte-identical TCP
 *     options.  On success the tail takes over end_seq, the newer
 *     ack_seq and window, FIN and the OR of the flags, the rx timestamps,
 *     and updated gso_size/gso_segs (segs capped at 0xFFFF); the backlog
 *     length grows by the coalescing delta and
 *     LINUX_MIB_TCPBACKLOGCOALESCE is incremented.
 *  3. Otherwise the header is pushed back and the skb is queued with
 *     sk_add_backlog() under a limit of sk_rcvbuf + sk_sndbuf + 64 KiB;
 *     a rejection counts LINUX_MIB_TCPBACKLOGDROP.
 */
1804 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1806 u32 limit, tail_gso_size, tail_gso_segs;
1807 struct skb_shared_info *shinfo;
1808 const struct tcphdr *th;
1809 struct tcphdr *thtail;
1810 struct sk_buff *tail;
1811 unsigned int hdrlen;
1817 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1818 * we can fix skb->truesize to its real value to avoid future drops.
1819 * This is valid because skb is not yet charged to the socket.
1820 * It has been noticed pure SACK packets were sometimes dropped
1821 * (if cooked by drivers without copybreak feature).
1827 if (unlikely(tcp_checksum_complete(skb))) {
1829 trace_tcp_bad_csum(skb);
1830 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1831 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1835 /* Attempt coalescing to last skb in backlog, even if we are
1837 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1839 th = (const struct tcphdr *)skb->data;
1840 hdrlen = th->doff * 4;
1842 tail = sk->sk_backlog.tail;
1845 thtail = (struct tcphdr *)tail->data;
1847 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1848 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1849 ((TCP_SKB_CB(tail)->tcp_flags |
1850 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1851 !((TCP_SKB_CB(tail)->tcp_flags &
1852 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1853 ((TCP_SKB_CB(tail)->tcp_flags ^
1854 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1855 #ifdef CONFIG_TLS_DEVICE
1856 tail->decrypted != skb->decrypted ||
1858 thtail->doff != th->doff ||
1859 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1862 __skb_pull(skb, hdrlen);
1864 shinfo = skb_shinfo(skb);
1865 gso_size = shinfo->gso_size ?: skb->len;
1866 gso_segs = shinfo->gso_segs ?: 1;
1868 shinfo = skb_shinfo(tail);
1869 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1870 tail_gso_segs = shinfo->gso_segs ?: 1;
1872 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1873 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1875 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1876 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1877 thtail->window = th->window;
1880 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1881 * thtail->fin, so that the fast path in tcp_rcv_established()
1882 * is not entered if we append a packet with a FIN.
1883 * SYN, RST, URG are not present.
1884 * ACK is set on both packets.
1885 * PSH : we do not really care in TCP stack,
1886 * at least for 'GRO' packets.
1888 thtail->fin |= th->fin;
1889 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1891 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1892 TCP_SKB_CB(tail)->has_rxtstamp = true;
1893 tail->tstamp = skb->tstamp;
1894 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1897 /* Not as strict as GRO. We only need to carry mss max value */
1898 shinfo->gso_size = max(gso_size, tail_gso_size);
1899 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1901 sk->sk_backlog.len += delta;
1902 __NET_INC_STATS(sock_net(sk),
1903 LINUX_MIB_TCPBACKLOGCOALESCE);
1904 kfree_skb_partial(skb, fragstolen);
1907 __skb_push(skb, hdrlen);
1910 /* Only socket owner can try to collapse/prune rx queues
1911 * to reduce memory overhead, so add a little headroom here.
1912 * Few sockets backlog are possibly concurrently non empty.
1914 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1916 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1918 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1923 EXPORT_SYMBOL(tcp_add_backlog);
/* Run the socket filter attached to @sk on @skb through
 * sk_filter_trim_cap(), passing the TCP header length (th->doff * 4) as
 * the trim cap.  The filter's result is returned unchanged; callers in
 * tcp_v4_rcv() treat a non-zero result as "drop".
 */
1925 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1927 struct tcphdr *th = (struct tcphdr *)skb->data;
1929 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1931 EXPORT_SYMBOL(tcp_filter);
/* Undo the control-block move done by tcp_v4_fill_cb(): copy the saved
 * struct inet_skb_parm from TCP_SKB_CB(skb)->header.h4 back to IPCB(skb).
 * memmove() is used, presumably because both views live in the same
 * skb control buffer and may overlap.
 */
1933 static void tcp_v4_restore_cb(struct sk_buff *skb)
1935 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1936 sizeof(struct inet_skb_parm));
/* Prepare TCP_SKB_CB(skb) for TCP processing.
 *
 * The IP control block is first moved into TCP_SKB_CB(skb)->header.h4,
 * then the TCP fields are derived from @th and @iph:
 *  - seq / ack_seq in host byte order;
 *  - end_seq = seq + SYN + FIN + payload length (skb->len - doff * 4);
 *  - tcp_flags from the header flag byte, ip_dsfield from the IP header;
 *  - tcp_tw_isn and sacked reset to 0;
 *  - has_rxtstamp set when either a software or hardware rx timestamp
 *    is present.
 */
1939 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1940 const struct tcphdr *th)
1942 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1943 * barrier() makes sure compiler wont play fool^Waliasing games.
1945 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1946 sizeof(struct inet_skb_parm));
1949 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1950 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1951 skb->len - th->doff * 4);
1952 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1953 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1954 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1955 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1956 TCP_SKB_CB(skb)->sacked = 0;
1957 TCP_SKB_CB(skb)->has_rxtstamp =
1958 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
/* Main IPv4 TCP receive routine for one incoming segment.
 *
 * Header validation: packets not addressed to this host are filtered on
 * pkt_type; TCP_MIB_INSEGS is counted even for bad segments; the TCP
 * header (then the full doff * 4 bytes) must be pullable, a too small
 * data offset yields SKB_DROP_REASON_PKT_TOO_SMALL, and the checksum is
 * initialised with the IPv4 pseudo-header.
 *
 * Socket lookup uses __inet_lookup_skb() and dispatches on state:
 *  - TCP_TIME_WAIT: policy check, checksum, then
 *    tcp_timewait_state_process(), which can lead to a listener lookup
 *    for a new connection, a timewait ACK, or a reset;
 *  - TCP_NEW_SYN_RECV (request sock): MD5 check against the listener,
 *    checksum, optional reuseport migration when the listener left
 *    TCP_LISTEN, socket filter, tcp_check_req() and finally
 *    tcp_child_process(), sending a reset if that fails;
 *  - other sockets: min_ttl check (LINUX_MIB_TCPMINTTLDROP), xfrm
 *    policy, MD5 check, socket filter
 *    (SKB_DROP_REASON_SOCKET_FILTER), then tcp_v4_fill_cb().  Listeners
 *    are processed directly by tcp_v4_do_rcv(); otherwise the socket is
 *    locked with bh_lock_sock_nested() and the segment is processed
 *    immediately, or queued with tcp_add_backlog() when user context owns
 *    the socket.
 *
 * Without a socket the drop reason is SKB_DROP_REASON_NO_SOCKET; bad
 * checksums are counted (SKB_DROP_REASON_TCP_CSUM), other segments are
 * answered with tcp_v4_send_reset(NULL, skb).  Discarded frames are freed
 * with kfree_skb_reason().
 */
1965 int tcp_v4_rcv(struct sk_buff *skb)
1967 struct net *net = dev_net(skb->dev);
1968 int sdif = inet_sdif(skb);
1969 int dif = inet_iif(skb);
1970 const struct iphdr *iph;
1971 const struct tcphdr *th;
1977 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1978 if (skb->pkt_type != PACKET_HOST)
1981 /* Count it even if it's bad */
1982 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1984 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1987 th = (const struct tcphdr *)skb->data;
1989 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1990 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1993 if (!pskb_may_pull(skb, th->doff * 4))
1996 /* An explanation is required here, I think.
1997 * Packet length and doff are validated by header prediction,
1998 * provided case of th->doff==0 is eliminated.
1999 * So, we defer the checks. */
2001 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2004 th = (const struct tcphdr *)skb->data;
2007 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
2008 th->dest, sdif, &refcounted);
2013 if (sk->sk_state == TCP_TIME_WAIT)
2016 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2017 struct request_sock *req = inet_reqsk(sk);
2018 bool req_stolen = false;
2021 sk = req->rsk_listener;
2022 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2023 sk_drops_add(sk, skb);
2027 if (tcp_checksum_complete(skb)) {
2031 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2032 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2034 inet_csk_reqsk_queue_drop_and_put(sk, req);
2038 /* reuseport_migrate_sock() has already held one sk_refcnt
2042 /* We own a reference on the listener, increase it again
2043 * as we might lose it too soon.
2049 if (!tcp_filter(sk, skb)) {
2050 th = (const struct tcphdr *)skb->data;
2052 tcp_v4_fill_cb(skb, iph, th);
2053 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2058 /* Another cpu got exclusive access to req
2059 * and created a full blown socket.
2060 * Try to feed this packet to this socket
2061 * instead of discarding it.
2063 tcp_v4_restore_cb(skb);
2067 goto discard_and_relse;
2071 tcp_v4_restore_cb(skb);
2072 } else if (tcp_child_process(sk, nsk, skb)) {
2073 tcp_v4_send_reset(nsk, skb);
2074 goto discard_and_relse;
2081 if (static_branch_unlikely(&ip4_min_ttl)) {
2082 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2083 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2084 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2085 goto discard_and_relse;
2089 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2090 goto discard_and_relse;
2092 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2093 goto discard_and_relse;
2097 if (tcp_filter(sk, skb)) {
2098 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2099 goto discard_and_relse;
2101 th = (const struct tcphdr *)skb->data;
2103 tcp_v4_fill_cb(skb, iph, th);
2107 if (sk->sk_state == TCP_LISTEN) {
2108 ret = tcp_v4_do_rcv(sk, skb);
2109 goto put_and_return;
2112 sk_incoming_cpu_update(sk);
2114 sk_defer_free_flush(sk);
2115 bh_lock_sock_nested(sk);
2116 tcp_segs_in(tcp_sk(sk), skb);
2118 if (!sock_owned_by_user(sk)) {
2119 ret = tcp_v4_do_rcv(sk, skb);
2121 if (tcp_add_backlog(sk, skb))
2122 goto discard_and_relse;
2133 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2134 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2137 tcp_v4_fill_cb(skb, iph, th);
2139 if (tcp_checksum_complete(skb)) {
2141 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2142 trace_tcp_bad_csum(skb);
2143 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2145 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2147 tcp_v4_send_reset(NULL, skb);
2151 /* Discard frame. */
2152 kfree_skb_reason(skb, drop_reason);
2156 sk_drops_add(sk, skb);
2162 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2163 inet_twsk_put(inet_twsk(sk));
2167 tcp_v4_fill_cb(skb, iph, th);
2169 if (tcp_checksum_complete(skb)) {
2170 inet_twsk_put(inet_twsk(sk));
2173 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2175 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2178 iph->saddr, th->source,
2179 iph->daddr, th->dest,
2183 inet_twsk_deschedule_put(inet_twsk(sk));
2185 tcp_v4_restore_cb(skb);
2193 tcp_v4_timewait_ack(sk, skb);
2196 tcp_v4_send_reset(sk, skb);
2197 inet_twsk_deschedule_put(inet_twsk(sk));
2199 case TCP_TW_SUCCESS:;
/* Timewait socket operations for TCP: object size of
 * struct tcp_timewait_sock plus the TCP-specific uniqueness check and
 * destructor callbacks.
 */
2204 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2205 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2206 .twsk_unique = tcp_twsk_unique,
2207 .twsk_destructor= tcp_twsk_destructor,
/* Cache the input route of @skb on @sk.  When the skb has a dst and a
 * reference can be taken with dst_hold_safe(), the dst is published in
 * sk->sk_rx_dst with rcu_assign_pointer() and the ingress interface
 * (skb->skb_iif) is remembered in sk->sk_rx_dst_ifindex.
 */
2210 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2212 struct dst_entry *dst = skb_dst(skb);
2214 if (dst && dst_hold_safe(dst)) {
2215 rcu_assign_pointer(sk->sk_rx_dst, dst);
2216 sk->sk_rx_dst_ifindex = skb->skb_iif;
2219 EXPORT_SYMBOL(inet_sk_rx_dst_set);
/* IPv4 address-family operations for connection sockets.  Installed as
 * icsk->icsk_af_ops by tcp_v4_init_sock(); it binds the generic TCP code
 * to the IPv4 transmit, checksum, header rebuild, rx dst caching,
 * connection request, child creation, socket option, address conversion
 * and MTU reduction routines, and records the IPv4 header and sockaddr
 * sizes.
 */
2221 const struct inet_connection_sock_af_ops ipv4_specific = {
2222 .queue_xmit = ip_queue_xmit,
2223 .send_check = tcp_v4_send_check,
2224 .rebuild_header = inet_sk_rebuild_header,
2225 .sk_rx_dst_set = inet_sk_rx_dst_set,
2226 .conn_request = tcp_v4_conn_request,
2227 .syn_recv_sock = tcp_v4_syn_recv_sock,
2228 .net_header_len = sizeof(struct iphdr),
2229 .setsockopt = ip_setsockopt,
2230 .getsockopt = ip_getsockopt,
2231 .addr2sockaddr = inet_csk_addr2sockaddr,
2232 .sockaddr_len = sizeof(struct sockaddr_in),
2233 .mtu_reduced = tcp_v4_mtu_reduced,
2235 EXPORT_SYMBOL(ipv4_specific);
2237 #ifdef CONFIG_TCP_MD5SIG
/* IPv4 MD5 signature operations (key lookup, segment hashing and socket
 * option parsing); installed as tcp_sk(sk)->af_specific by
 * tcp_v4_init_sock() when CONFIG_TCP_MD5SIG is enabled.
 */
2238 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2239 .md5_lookup = tcp_v4_md5_lookup,
2240 .calc_md5_hash = tcp_v4_md5_hash_skb,
2241 .md5_parse = tcp_v4_parse_md5_keys,
2245 /* NOTE: A lot of things set to zero explicitly by call to
2246 * sk_alloc() so need not be done here.
/* IPv4-specific socket initialisation: point the connection socket at
 * the ipv4_specific operations and, with CONFIG_TCP_MD5SIG, at the IPv4
 * MD5 operations.
 */
2248 static int tcp_v4_init_sock(struct sock *sk)
2250 struct inet_connection_sock *icsk = inet_csk(sk);
2254 icsk->icsk_af_ops = &ipv4_specific;
2256 #ifdef CONFIG_TCP_MD5SIG
2257 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
/* Release the TCP-level resources of @sk when the socket is destroyed.
 *
 * In order: emit the destroy tracepoint, stop the transmit timers, tear
 * down congestion control and the ULP, purge the write queue, run the
 * active Fast Open out-of-order check, purge the out-of-order queue,
 * free the MD5 key list and its container (CONFIG_TCP_MD5SIG), handle a
 * referenced bind bucket, assert that no Fast Open request sock is still
 * attached, free any pending Fast Open request, cipher and saved SYN, and
 * finally decrement the allocated-sockets accounting.
 */
2263 void tcp_v4_destroy_sock(struct sock *sk)
2265 struct tcp_sock *tp = tcp_sk(sk);
2267 trace_tcp_destroy_sock(sk);
2269 tcp_clear_xmit_timers(sk);
2271 tcp_cleanup_congestion_control(sk);
2273 tcp_cleanup_ulp(sk);
2275 /* Clean up the write buffer. */
2276 tcp_write_queue_purge(sk);
2278 /* Check if we want to disable active TFO */
2279 tcp_fastopen_active_disable_ofo_check(sk);
2281 /* Cleans up our, hopefully empty, out_of_order_queue. */
2282 skb_rbtree_purge(&tp->out_of_order_queue);
2284 #ifdef CONFIG_TCP_MD5SIG
2285 /* Clean up the MD5 key list, if any */
2286 if (tp->md5sig_info) {
2287 tcp_clear_md5_list(sk);
2288 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2289 tp->md5sig_info = NULL;
2293 /* Clean up a referenced TCP bind bucket. */
2294 if (inet_csk(sk)->icsk_bind_hash)
2297 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2299 /* If socket is aborted during connect operation */
2300 tcp_free_fastopen_req(tp);
2301 tcp_fastopen_destroy_cipher(sk);
2302 tcp_saved_syn_free(tp);
2304 sk_sockets_allocated_dec(sk);
2306 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2308 #ifdef CONFIG_PROC_FS
2309 /* Proc filesystem TCP sock list dumping. */
2311 static unsigned short seq_file_family(const struct seq_file *seq);
/* Tell whether @sk should be shown by the iterator behind @seq: the
 * socket family must equal the family of the seq_file (AF_UNSPEC matches
 * every family) and the socket must belong to the seq_file's network
 * namespace.
 */
2313 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2315 unsigned short family = seq_file_family(seq);
2317 /* AF_UNSPEC is used as a match all */
2318 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2319 net_eq(sock_net(sk), seq_file_net(seq)));
2322 /* Find a non empty bucket (starting from st->bucket)
2323 * and return the first sk from it.
/* Scan the listening hash (lhash2) from st->bucket upwards for the first
 * socket accepted by seq_sk_match().
 *
 * Empty buckets are skipped without locking.  A non-empty bucket is
 * searched under its spinlock; the lock is dropped again before moving
 * on when nothing matched.  The lock appears to stay held while a
 * matching socket is handed back: listening_get_next() and
 * tcp_seq_stop() are the places that release it.
 */
2325 static void *listening_get_first(struct seq_file *seq)
2327 struct tcp_iter_state *st = seq->private;
2330 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2331 struct inet_listen_hashbucket *ilb2;
2332 struct inet_connection_sock *icsk;
2335 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2336 if (hlist_empty(&ilb2->head))
2339 spin_lock(&ilb2->lock);
2340 inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2341 sk = (struct sock *)icsk;
2342 if (seq_sk_match(seq, sk))
2345 spin_unlock(&ilb2->lock);
2351 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2352 * If "cur" is the last one in the st->bucket,
2353 * call listening_get_first() to return the first sk of the next
/* Advance from listening socket @cur to the next socket accepted by
 * seq_sk_match().  The rest of the current bucket is searched first;
 * when it is exhausted the bucket lock is released and the search
 * continues through listening_get_first().
 */
2356 static void *listening_get_next(struct seq_file *seq, void *cur)
2358 struct tcp_iter_state *st = seq->private;
2359 struct inet_listen_hashbucket *ilb2;
2360 struct inet_connection_sock *icsk;
2361 struct sock *sk = cur;
2366 icsk = inet_csk(sk);
2367 inet_lhash2_for_each_icsk_continue(icsk) {
2368 sk = (struct sock *)icsk;
2369 if (seq_sk_match(seq, sk))
2373 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2374 spin_unlock(&ilb2->lock);
2376 return listening_get_first(seq);
/* Position the iterator on a listening socket by index: start with
 * listening_get_first() and keep stepping with listening_get_next()
 * while a socket is available and *@pos is still non-zero.
 */
2379 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2381 struct tcp_iter_state *st = seq->private;
2386 rc = listening_get_first(seq);
2388 while (rc && *pos) {
2389 rc = listening_get_next(seq, rc);
/* True when the established-hash chain of bucket st->bucket is empty. */
2395 static inline bool empty_bucket(const struct tcp_iter_state *st)
2397 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2401 * Get first established socket starting from bucket given in st->bucket.
2402 * If st->bucket is zero, the very first socket in the hash is returned.
/* Scan the established hash (ehash) from st->bucket upwards for the
 * first socket accepted by seq_sk_match().  Empty buckets are skipped
 * through the lockless empty_bucket() test; a bucket in which nothing
 * matched is unlocked with spin_unlock_bh() before the scan moves on.
 */
2404 static void *established_get_first(struct seq_file *seq)
2406 struct tcp_iter_state *st = seq->private;
2409 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2411 struct hlist_nulls_node *node;
2412 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2414 /* Lockless fast path for the common case of empty buckets */
2415 if (empty_bucket(st))
2419 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2420 if (seq_sk_match(seq, sk))
2423 spin_unlock_bh(lock);
/* Advance from established socket @cur to the next socket accepted by
 * seq_sk_match().  The remainder of the current chain is searched first;
 * when it is exhausted the bucket lock is released and the search
 * continues through established_get_first().
 */
2429 static void *established_get_next(struct seq_file *seq, void *cur)
2431 struct sock *sk = cur;
2432 struct hlist_nulls_node *node;
2433 struct tcp_iter_state *st = seq->private;
2438 sk = sk_nulls_next(sk);
2440 sk_nulls_for_each_from(sk, node) {
2441 if (seq_sk_match(seq, sk))
2445 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2447 return established_get_first(seq);
/* Position the iterator on an established socket by index, using
 * established_get_first() for the start and established_get_next() to
 * step forward.
 */
2450 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2452 struct tcp_iter_state *st = seq->private;
2456 rc = established_get_first(seq);
2459 rc = established_get_next(seq, rc);
/* Find the socket at position @pos of the combined dump: the listening
 * table is tried first (state TCP_SEQ_STATE_LISTENING), then the
 * established table (state TCP_SEQ_STATE_ESTABLISHED).
 * listening_get_idx() receives &pos, so the established lookup works
 * with whatever value it leaves there.
 */
2465 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2468 struct tcp_iter_state *st = seq->private;
2470 st->state = TCP_SEQ_STATE_LISTENING;
2471 rc = listening_get_idx(seq, &pos);
2474 st->state = TCP_SEQ_STATE_ESTABLISHED;
2475 rc = established_get_idx(seq, pos);
/* Resume iteration at the position saved in the iterator state
 * (state, bucket, offset) instead of rescanning from the start.
 *
 * For the remembered table, the first socket of the saved bucket is
 * fetched and then up to "offset" further steps are taken, stopping
 * early when the walk leaves the original bucket or runs out of sockets.
 * A listening bucket index beyond lhash2_mask, or an established bucket
 * index beyond ehash_mask, is checked for before the respective walk.
 */
2481 static void *tcp_seek_last_pos(struct seq_file *seq)
2483 struct tcp_iter_state *st = seq->private;
2484 int bucket = st->bucket;
2485 int offset = st->offset;
2486 int orig_num = st->num;
2489 switch (st->state) {
2490 case TCP_SEQ_STATE_LISTENING:
2491 if (st->bucket > tcp_hashinfo.lhash2_mask)
2493 st->state = TCP_SEQ_STATE_LISTENING;
2494 rc = listening_get_first(seq);
2495 while (offset-- && rc && bucket == st->bucket)
2496 rc = listening_get_next(seq, rc);
2500 st->state = TCP_SEQ_STATE_ESTABLISHED;
2502 case TCP_SEQ_STATE_ESTABLISHED:
2503 if (st->bucket > tcp_hashinfo.ehash_mask)
2505 rc = established_get_first(seq);
2506 while (offset-- && rc && bucket == st->bucket)
2507 rc = established_get_next(seq, rc);
/* seq_file ->start for the TCP socket dump.
 *
 * A non-zero *@pos equal to st->last_pos is resumed through
 * tcp_seek_last_pos().  Otherwise the state is reset to
 * TCP_SEQ_STATE_LISTENING and the entry is found by index;
 * position 0 yields SEQ_START_TOKEN (the header line).
 * st->last_pos records the position served.
 */
2515 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2517 struct tcp_iter_state *st = seq->private;
2520 if (*pos && *pos == st->last_pos) {
2521 rc = tcp_seek_last_pos(seq);
2526 st->state = TCP_SEQ_STATE_LISTENING;
2530 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2533 st->last_pos = *pos;
2536 EXPORT_SYMBOL(tcp_seq_start);
/* seq_file ->next for the TCP socket dump.
 *
 * After the header token the first real entry is fetched with
 * tcp_get_idx(seq, 0).  Otherwise the walk continues in the current
 * table; the listening branch also contains the switch to
 * TCP_SEQ_STATE_ESTABLISHED followed by established_get_first().
 * st->last_pos is updated to *@pos.
 */
2538 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2540 struct tcp_iter_state *st = seq->private;
2543 if (v == SEQ_START_TOKEN) {
2544 rc = tcp_get_idx(seq, 0);
2548 switch (st->state) {
2549 case TCP_SEQ_STATE_LISTENING:
2550 rc = listening_get_next(seq, v);
2552 st->state = TCP_SEQ_STATE_ESTABLISHED;
2555 rc = established_get_first(seq);
2558 case TCP_SEQ_STATE_ESTABLISHED:
2559 rc = established_get_next(seq, v);
2564 st->last_pos = *pos;
2567 EXPORT_SYMBOL(tcp_seq_next);
/* seq_file ->stop for the TCP socket dump: release the bucket lock that
 * the iterator still holds for the current table.  In the listening
 * state the lhash2 bucket lock is dropped unless @v is the header token;
 * in the established state the ehash bucket lock is dropped with
 * spin_unlock_bh().
 */
2569 void tcp_seq_stop(struct seq_file *seq, void *v)
2571 struct tcp_iter_state *st = seq->private;
2573 switch (st->state) {
2574 case TCP_SEQ_STATE_LISTENING:
2575 if (v != SEQ_START_TOKEN)
2576 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2578 case TCP_SEQ_STATE_ESTABLISHED:
2580 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2584 EXPORT_SYMBOL(tcp_seq_stop);
/* Print the dump line for a request sock (connection still in the
 * handshake).  The timer column reports the remaining time of
 * req->rsk_timer converted with jiffies_delta_to_clock_t(), with the
 * "timers active" field fixed to 1; the uid is that of the listener; the
 * queue sizes, the non-standard timer field and the inode field are
 * printed as 0.
 */
2586 static void get_openreq4(const struct request_sock *req,
2587 struct seq_file *f, int i)
2589 const struct inet_request_sock *ireq = inet_rsk(req);
2590 long delta = req->rsk_timer.expires - jiffies;
2592 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2593 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2598 ntohs(ireq->ir_rmt_port),
2600 0, 0, /* could print option size, but that is af dependent. */
2601 1, /* timers active (only the expire timer) */
2602 jiffies_delta_to_clock_t(delta),
2604 from_kuid_munged(seq_user_ns(f),
2605 sock_i_uid(req->rsk_listener)),
2606 0, /* non standard timer */
2607 0, /* open_requests have no inode */
/* Print the dump line for a full TCP socket.
 *
 * Timer expiry selection: icsk_timeout for a pending retransmit, reorder
 * timeout or loss probe timer, icsk_timeout again for a pending zero
 * window probe, sk_timer.expires when sk_timer is pending, and the
 * current jiffies otherwise.
 * rx_queue is sk_ack_backlog for a listener; for other states it is
 * rcv_nxt - copied_seq clamped at 0, because the socket is not locked and
 * a transient negative value is possible.
 * tx_queue is write_seq - snd_una.  The final column is the Fast Open
 * queue limit for a listener, otherwise -1 while in initial slow start
 * and snd_ssthresh afterwards.
 */
2612 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2615 unsigned long timer_expires;
2616 const struct tcp_sock *tp = tcp_sk(sk);
2617 const struct inet_connection_sock *icsk = inet_csk(sk);
2618 const struct inet_sock *inet = inet_sk(sk);
2619 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2620 __be32 dest = inet->inet_daddr;
2621 __be32 src = inet->inet_rcv_saddr;
2622 __u16 destp = ntohs(inet->inet_dport);
2623 __u16 srcp = ntohs(inet->inet_sport);
2627 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2628 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2629 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2631 timer_expires = icsk->icsk_timeout;
2632 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2634 timer_expires = icsk->icsk_timeout;
2635 } else if (timer_pending(&sk->sk_timer)) {
2637 timer_expires = sk->sk_timer.expires;
2640 timer_expires = jiffies;
2643 state = inet_sk_state_load(sk);
2644 if (state == TCP_LISTEN)
2645 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2647 /* Because we don't lock the socket,
2648 * we might find a transient negative value.
2650 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2651 READ_ONCE(tp->copied_seq), 0);
2653 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2654 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2655 i, src, srcp, dest, destp, state,
2656 READ_ONCE(tp->write_seq) - tp->snd_una,
2659 jiffies_delta_to_clock_t(timer_expires - jiffies),
2660 icsk->icsk_retransmits,
2661 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2662 icsk->icsk_probes_out,
2664 refcount_read(&sk->sk_refcnt), sk,
2665 jiffies_to_clock_t(icsk->icsk_rto),
2666 jiffies_to_clock_t(icsk->icsk_ack.ato),
2667 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2669 state == TCP_LISTEN ?
2670 fastopenq->max_qlen :
2671 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
/* Print the dump line for a timewait socket.  Addresses and ports come
 * from the timewait sock, the state column shows tw_substate, the timer
 * type is fixed to 3 with the remaining time of tw_timer, and the
 * queue, retransmit, uid, timeout and inode columns are printed as 0.
 */
2674 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2675 struct seq_file *f, int i)
2677 long delta = tw->tw_timer.expires - jiffies;
2681 dest = tw->tw_daddr;
2682 src = tw->tw_rcv_saddr;
2683 destp = ntohs(tw->tw_dport);
2684 srcp = ntohs(tw->tw_sport);
2686 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2687 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2688 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2689 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2690 refcount_read(&tw->tw_refcnt), tw);
/* seq_file ->show() callback installed in tcp4_seq_ops.
 *
 * @seq: seq_file being filled.
 * @v:   SEQ_START_TOKEN, or the socket to print.
 *
 * The record width is first set to TMPSZ - 1 with seq_setwidth().  For
 * SEQ_START_TOKEN the column-header text is emitted with seq_puts().
 * For a socket, the row formatter is chosen from sk->sk_state:
 * TCP_TIME_WAIT uses get_timewait4_sock(), TCP_NEW_SYN_RECV uses
 * get_openreq4(), and the remaining call goes to get_tcp4_sock().  Each
 * formatter is handed st->num as the entry number.
 */
2695 static int tcp4_seq_show(struct seq_file *seq, void *v)
2697 struct tcp_iter_state *st;
2698 struct sock *sk = v;
2700 seq_setwidth(seq, TMPSZ - 1);
2701 if (v == SEQ_START_TOKEN) {
2702 seq_puts(seq, " sl local_address rem_address st tx_queue "
2703 "rx_queue tr tm->when retrnsmt uid timeout "
2709 if (sk->sk_state == TCP_TIME_WAIT)
2710 get_timewait4_sock(v, seq, st->num);
2711 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2712 get_openreq4(v, seq, st->num);
2714 get_tcp4_sock(v, seq, st->num);
2720 #ifdef CONFIG_BPF_SYSCALL
/* Private seq_file state of the BPF TCP iterator.
 *
 * Wraps the regular tcp_iter_state and adds an array ("batch") of held
 * socket pointers that is filled by the *_batch() helpers and consumed
 * one entry at a time by the seq_next()/seq_show() callbacks.
 */
2721 struct bpf_tcp_iter_state {
2722 struct tcp_iter_state state; /* regular TCP seq iterator state */
2723 unsigned int cur_sk; /* index of the batch entry currently being shown */
2724 unsigned int end_sk; /* index one past the last filled batch entry */
2725 unsigned int max_sk; /* number of slots allocated in batch */
2726 struct sock **batch; /* kvmalloc()ed array of referenced sockets */
2727 bool st_bucket_done; /* set when the batch held every expected socket of the bucket */
/* Context object handed to the BPF program by tcp_prog_seq_show().
 *
 * The pointer members are declared through __bpf_md_ptr(); uid is forced
 * to 8-byte alignment.  tcp_reg_info registers the sk_common member as
 * PTR_TO_BTF_ID_OR_NULL.
 */
2730 struct bpf_iter__tcp {
2731 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2732 __bpf_md_ptr(struct sock_common *, sk_common);
2733 uid_t uid __aligned(8);
/* Run the iterator's BPF program once for a single socket.
 *
 * @prog:      BPF program to run.
 * @meta:      iterator metadata; its seq_num is decremented here.
 * @sk_common: socket to expose to the program.
 * @uid:       uid value for the context (presumably copied into ctx.uid).
 *
 * Builds a struct bpf_iter__tcp on the stack and returns whatever
 * bpf_iter_run_prog() returns.
 */
2736 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2737 struct sock_common *sk_common, uid_t uid)
2739 struct bpf_iter__tcp ctx;
2741 meta->seq_num--; /* skip SEQ_START_TOKEN */
2743 ctx.sk_common = sk_common;
2745 return bpf_iter_run_prog(prog, &ctx);
2748 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2750 while (iter->cur_sk < iter->end_sk)
2751 sock_put(iter->batch[iter->cur_sk++]);
2754 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2755 unsigned int new_batch_sz)
2757 struct sock **new_batch;
2759 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2760 GFP_USER | __GFP_NOWARN);
2764 bpf_iter_tcp_put_batch(iter);
2765 kvfree(iter->batch);
2766 iter->batch = new_batch;
2767 iter->max_sk = new_batch_sz;
/* Collect sockets of the current listening-hash bucket into iter->batch.
 *
 * @seq:      iterator seq_file; seq->private is the bpf_tcp_iter_state.
 * @start_sk: first socket of the batch.
 *
 * A reference is taken on @start_sk and it is stored at iter->end_sk.
 * The lhash2 chain is then walked onward from @start_sk; sockets accepted
 * by seq_sk_match() are stored while iter->end_sk < iter->max_sk.  Before
 * returning, the lock of tcp_hashinfo.lhash2[st->bucket] is dropped with
 * spin_unlock(), so the function must be entered with that lock held.
 *
 * NOTE(review): "expected" starts at 1 and the caller compares the return
 * value with iter->end_sk; it looks like the count of all matching
 * sockets in the bucket, including those that did not fit - confirm.
 */
2772 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2773 struct sock *start_sk)
2775 struct bpf_tcp_iter_state *iter = seq->private;
2776 struct tcp_iter_state *st = &iter->state;
2777 struct inet_connection_sock *icsk;
2778 unsigned int expected = 1;
2781 sock_hold(start_sk);
2782 iter->batch[iter->end_sk++] = start_sk;
2784 icsk = inet_csk(start_sk);
2785 inet_lhash2_for_each_icsk_continue(icsk) {
2786 sk = (struct sock *)icsk;
2787 if (seq_sk_match(seq, sk)) {
2788 if (iter->end_sk < iter->max_sk) {
2790 iter->batch[iter->end_sk++] = sk;
2795 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
/* Collect sockets of the current established-hash bucket into iter->batch.
 *
 * @seq:      iterator seq_file; seq->private is the bpf_tcp_iter_state.
 * @start_sk: first socket of the batch.
 *
 * Mirrors bpf_iter_tcp_listening_batch() for the ehash table: a reference
 * is taken on @start_sk, it is stored at iter->end_sk, and the nulls list
 * is walked from sk_nulls_next(@start_sk), storing sockets accepted by
 * seq_sk_match() while iter->end_sk < iter->max_sk.  The ehash bucket
 * lock returned by inet_ehash_lockp() is released with spin_unlock_bh()
 * before returning, so the function must be entered with it held.
 *
 * NOTE(review): "expected" starts at 1 and the caller compares the return
 * value with iter->end_sk; presumably it counts every matching socket in
 * the bucket - confirm.
 */
2800 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2801 struct sock *start_sk)
2803 struct bpf_tcp_iter_state *iter = seq->private;
2804 struct tcp_iter_state *st = &iter->state;
2805 struct hlist_nulls_node *node;
2806 unsigned int expected = 1;
2809 sock_hold(start_sk);
2810 iter->batch[iter->end_sk++] = start_sk;
2812 sk = sk_nulls_next(start_sk);
2813 sk_nulls_for_each_from(sk, node) {
2814 if (seq_sk_match(seq, sk)) {
2815 if (iter->end_sk < iter->max_sk) {
2817 iter->batch[iter->end_sk++] = sk;
2822 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
/* Fetch the next batch of sockets for the BPF TCP iterator.
 *
 * @seq: iterator seq_file; seq->private is the bpf_tcp_iter_state.
 *
 * When the previous bucket was fully consumed (iter->st_bucket_done) the
 * iterator moves on; once the listening table is exhausted
 * (st->bucket > tcp_hashinfo.lhash2_mask) the state switches to
 * TCP_SEQ_STATE_ESTABLISHED.  The starting socket is located with
 * tcp_seek_last_pos() and the batch is filled by the listening or
 * established helper according to st->state.
 *
 * If the batch holds as many sockets as the helper reported
 * (iter->end_sk == expected), st_bucket_done is set.  Otherwise, provided
 * no resize has happened yet, the batch is reallocated to
 * expected * 3 / 2 slots (presumably followed by a retry - confirm).
 *
 * Return: NULL when tcp_seek_last_pos() finds nothing more to iterate;
 * otherwise a socket (presumably the first entry of the new batch).
 */
2827 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2829 struct bpf_tcp_iter_state *iter = seq->private;
2830 struct tcp_iter_state *st = &iter->state;
2831 unsigned int expected;
2832 bool resized = false;
2835 /* The st->bucket is done. Directly advance to the next
2836 * bucket instead of having the tcp_seek_last_pos() to skip
2837 * one by one in the current bucket and eventually find out
2838 * it has to advance to the next bucket.
2840 if (iter->st_bucket_done) {
2843 if (st->state == TCP_SEQ_STATE_LISTENING &&
2844 st->bucket > tcp_hashinfo.lhash2_mask) {
2845 st->state = TCP_SEQ_STATE_ESTABLISHED;
2851 /* Get a new batch */
2854 iter->st_bucket_done = false;
2856 sk = tcp_seek_last_pos(seq);
2858 return NULL; /* Done */
2860 if (st->state == TCP_SEQ_STATE_LISTENING)
2861 expected = bpf_iter_tcp_listening_batch(seq, sk);
2863 expected = bpf_iter_tcp_established_batch(seq, sk);
2865 if (iter->end_sk == expected) {
2866 iter->st_bucket_done = true;
2870 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
/* seq_file ->start() callback of the BPF TCP iterator.
 *
 * Returns either the result of bpf_iter_tcp_batch() (resuming an earlier
 * iteration) or SEQ_START_TOKEN.  NOTE(review): the selecting condition
 * presumably tests *pos - confirm.
 */
2878 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2880 /* bpf iter does not support lseek, so it always
2881 * continue from where it was stop()-ped.
2884 return bpf_iter_tcp_batch(seq);
2886 return SEQ_START_TOKEN;
/* seq_file ->next() callback of the BPF TCP iterator.
 *
 * If the batch still has an unconsumed entry at iter->cur_sk, its
 * reference is dropped with sock_put() and cur_sk advances.  The next
 * socket is then taken from the batch when one remains, or from a fresh
 * bpf_iter_tcp_batch() otherwise.  st->last_pos is kept equal to *pos.
 */
2889 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2891 struct bpf_tcp_iter_state *iter = seq->private;
2892 struct tcp_iter_state *st = &iter->state;
2895 /* Whenever seq_next() is called, the iter->cur_sk is
2896 * done with seq_show(), so advance to the next sk in
2899 if (iter->cur_sk < iter->end_sk) {
2900 /* Keeping st->num consistent in tcp_iter_state.
2901 * bpf_iter_tcp does not use st->num.
2902 * meta.seq_num is used instead.
2905 /* Move st->offset to the next sk in the bucket such that
2906 * the future start() will resume at st->offset in
2907 * st->bucket. See tcp_seek_last_pos().
2910 sock_put(iter->batch[iter->cur_sk++]);
2913 if (iter->cur_sk < iter->end_sk)
2914 sk = iter->batch[iter->cur_sk];
2916 sk = bpf_iter_tcp_batch(seq);
2919 /* Keeping st->last_pos consistent in tcp_iter_state.
2920 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
2922 st->last_pos = *pos;
/* seq_file ->show() callback of the BPF TCP iterator.
 *
 * @seq: iterator seq_file.
 * @v:   SEQ_START_TOKEN or the socket to hand to the BPF program.
 *
 * Full sockets are locked with lock_sock_fast() around the program run
 * and unlocked with unlock_sock_fast() afterwards.  A socket that has
 * been unhashed in the meantime gets special handling (not visible here;
 * presumably it is skipped - confirm).
 *
 * The uid passed to the program depends on the socket state: for
 * TCP_NEW_SYN_RECV it is derived from the request's listener socket,
 * for other non-TIME_WAIT sockets from the socket itself; both are mapped
 * through from_kuid_munged() into the seq_file's user namespace.
 *
 * The program is looked up with bpf_iter_get_info(&meta, false) and run
 * through tcp_prog_seq_show().
 */
2926 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2928 struct bpf_iter_meta meta;
2929 struct bpf_prog *prog;
2930 struct sock *sk = v;
2935 if (v == SEQ_START_TOKEN)
2938 if (sk_fullsock(sk))
2939 slow = lock_sock_fast(sk);
2941 if (unlikely(sk_unhashed(sk))) {
2946 if (sk->sk_state == TCP_TIME_WAIT) {
2948 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2949 const struct request_sock *req = v;
2951 uid = from_kuid_munged(seq_user_ns(seq),
2952 sock_i_uid(req->rsk_listener));
2954 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2958 prog = bpf_iter_get_info(&meta, false);
2959 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2962 if (sk_fullsock(sk))
2963 unlock_sock_fast(sk, slow);
/* seq_file ->stop() callback of the BPF TCP iterator.
 *
 * Looks up the program with bpf_iter_get_info(&meta, true) and runs it
 * once more through tcp_prog_seq_show() with uid 0, discarding the
 * result (the guarding condition is not visible here).
 *
 * Any batch entries that were not consumed are then released, and
 * st_bucket_done is cleared so that the bucket is not treated as finished
 * on the next start().
 */
2968 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2970 struct bpf_tcp_iter_state *iter = seq->private;
2971 struct bpf_iter_meta meta;
2972 struct bpf_prog *prog;
2976 prog = bpf_iter_get_info(&meta, true);
2978 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2981 if (iter->cur_sk < iter->end_sk) {
2982 bpf_iter_tcp_put_batch(iter);
2983 iter->st_bucket_done = false;
2987 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2988 .show = bpf_iter_tcp_seq_show,
2989 .start = bpf_iter_tcp_seq_start,
2990 .next = bpf_iter_tcp_seq_next,
2991 .stop = bpf_iter_tcp_seq_stop,
/* Return the address family that a TCP seq_file iteration is limited to.
 *
 * When built with CONFIG_BPF_SYSCALL and the seq_file is driven by
 * bpf_iter_tcp_seq_ops, an early return is taken so that the BPF program
 * does the filtering itself (the returned value is not visible here).
 * Otherwise the family comes from the tcp_seq_afinfo stored as the proc
 * entry's data.
 */
2994 static unsigned short seq_file_family(const struct seq_file *seq)
2996 const struct tcp_seq_afinfo *afinfo;
2998 #ifdef CONFIG_BPF_SYSCALL
2999 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */
3000 if (seq->op == &bpf_iter_tcp_seq_ops)
3004 /* Iterated from proc fs */
3005 afinfo = pde_data(file_inode(seq->file));
3006 return afinfo->family;
3009 static const struct seq_operations tcp4_seq_ops = {
3010 .show = tcp4_seq_show,
3011 .start = tcp_seq_start,
3012 .next = tcp_seq_next,
3013 .stop = tcp_seq_stop,
/* Per-family data attached to the "tcp" proc entry by
 * tcp4_proc_init_net(); seq_file_family() reads its ->family member.
 */
3016 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
/* Per-netns init: create the read-only (0444) "tcp" entry under
 * net->proc_net.
 *
 * The entry uses tcp4_seq_ops, a per-open private area of
 * sizeof(struct tcp_iter_state) bytes, and tcp4_seq_afinfo as its data.
 * NOTE(review): the return values are not visible here; presumably an
 * error code when creation fails and 0 otherwise - confirm.
 */
3020 static int __net_init tcp4_proc_init_net(struct net *net)
3022 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3023 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3028 static void __net_exit tcp4_proc_exit_net(struct net *net)
3030 remove_proc_entry("tcp", net->proc_net);
3033 static struct pernet_operations tcp4_net_ops = {
3034 .init = tcp4_proc_init_net,
3035 .exit = tcp4_proc_exit_net,
3038 int __init tcp4_proc_init(void)
3040 return register_pernet_subsys(&tcp4_net_ops);
3043 void tcp4_proc_exit(void)
3045 unregister_pernet_subsys(&tcp4_net_ops);
3047 #endif /* CONFIG_PROC_FS */
3049 /* @wake is one when sk_stream_write_space() calls us.
3050 * This sends EPOLLOUT only if notsent_bytes is half the limit.
3051 * This mimics the strategy used in sock_def_write_space().
3053 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3055 const struct tcp_sock *tp = tcp_sk(sk);
3056 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3057 READ_ONCE(tp->snd_nxt);
3059 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3061 EXPORT_SYMBOL(tcp_stream_memory_free);
/* Protocol operations table (struct proto) for TCP over IPv4.
 *
 * Connection setup, socket init/destroy and backlog receive use the
 * tcp_v4_*() handlers; the remaining hooks point at the address-family
 * independent tcp_*() and inet_*() implementations.  Sockets are
 * sizeof(struct tcp_sock) bytes and come from a SLAB_TYPESAFE_BY_RCU
 * cache.  The table is exported for use by modules.
 */
3063 struct proto tcp_prot = {
3065 .owner = THIS_MODULE,
3067 .pre_connect = tcp_v4_pre_connect,
3068 .connect = tcp_v4_connect,
3069 .disconnect = tcp_disconnect,
3070 .accept = inet_csk_accept,
3072 .init = tcp_v4_init_sock,
3073 .destroy = tcp_v4_destroy_sock,
3074 .shutdown = tcp_shutdown,
3075 .setsockopt = tcp_setsockopt,
3076 .getsockopt = tcp_getsockopt,
3077 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3078 .keepalive = tcp_set_keepalive,
3079 .recvmsg = tcp_recvmsg,
3080 .sendmsg = tcp_sendmsg,
3081 .sendpage = tcp_sendpage,
3082 .backlog_rcv = tcp_v4_do_rcv,
3083 .release_cb = tcp_release_cb,
3085 .unhash = inet_unhash,
3086 .get_port = inet_csk_get_port,
3087 .put_port = inet_put_port,
3088 #ifdef CONFIG_BPF_SYSCALL
3089 .psock_update_sk_prot = tcp_bpf_update_proto,
3091 .enter_memory_pressure = tcp_enter_memory_pressure,
3092 .leave_memory_pressure = tcp_leave_memory_pressure,
3093 .stream_memory_free = tcp_stream_memory_free,
3094 .sockets_allocated = &tcp_sockets_allocated,
3095 .orphan_count = &tcp_orphan_count,
3096 .memory_allocated = &tcp_memory_allocated,
3097 .memory_pressure = &tcp_memory_pressure,
3098 .sysctl_mem = sysctl_tcp_mem,
3099 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3100 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3101 .max_header = MAX_TCP_HEADER,
3102 .obj_size = sizeof(struct tcp_sock),
3103 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3104 .twsk_prot = &tcp_timewait_sock_ops,
3105 .rsk_prot = &tcp_request_sock_ops,
3106 .h.hashinfo = &tcp_hashinfo,
3107 .no_autobind = true,
3108 .diag_destroy = tcp_abort,
3110 EXPORT_SYMBOL(tcp_prot);
/* Per-netns exit: release the TCP resources owned by @net.
 *
 * Drops the module reference held on the namespace's congestion control
 * (if one is set), destroys the control socket of every possible CPU and
 * frees the per-CPU pointer array itself.
 */
3112 static void __net_exit tcp_sk_exit(struct net *net)
3116 if (net->ipv4.tcp_congestion_control)
3117 bpf_module_put(net->ipv4.tcp_congestion_control,
3118 net->ipv4.tcp_congestion_control->owner);
3120 for_each_possible_cpu(cpu)
3121 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
3122 free_percpu(net->ipv4.tcp_sk);
/* Per-netns init: set up the TCP control sockets and sysctl defaults of @net.
 *
 * Steps visible here:
 *  - allocate a per-CPU array of socket pointers and create one raw
 *    PF_INET control socket per possible CPU, flagged
 *    SOCK_USE_WRITE_QUEUE and with pmtudisc set to IP_PMTUDISC_DO;
 *  - load the default value of every per-namespace TCP sysctl;
 *  - size max_tw_buckets and max_syn_backlog from the ehash table size
 *    (ehash_mask + 1);
 *  - for namespaces other than init_net, copy tcp_rmem/tcp_wmem from
 *    init_net;
 *  - for namespaces other than init_net, inherit init_net's congestion
 *    control when a module reference can be taken; tcp_reno is the
 *    other assignment (presumably the fallback branch - confirm).
 *
 * NOTE(review): the error paths and return statements are not visible
 * here; tcp_v4_init() treats a failure as fatal.
 */
3125 static int __net_init tcp_sk_init(struct net *net)
3129 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
3130 if (!net->ipv4.tcp_sk)
3133 for_each_possible_cpu(cpu) {
3136 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3140 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3142 /* Please enforce IP_DF and IPID==0 for RST and
3143 * ACK sent in SYN-RECV and TIME-WAIT state.
3145 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3147 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
3150 net->ipv4.sysctl_tcp_ecn = 2;
3151 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3153 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3154 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3155 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3156 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3157 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3159 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3160 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3161 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3163 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3164 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3165 net->ipv4.sysctl_tcp_syncookies = 1;
3166 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3167 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3168 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3169 net->ipv4.sysctl_tcp_orphan_retries = 0;
3170 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3171 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3172 net->ipv4.sysctl_tcp_tw_reuse = 2;
3173 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3175 cnt = tcp_hashinfo.ehash_mask + 1;
3176 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3177 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
3179 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3180 net->ipv4.sysctl_tcp_sack = 1;
3181 net->ipv4.sysctl_tcp_window_scaling = 1;
3182 net->ipv4.sysctl_tcp_timestamps = 1;
3183 net->ipv4.sysctl_tcp_early_retrans = 3;
3184 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3185 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3186 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3187 net->ipv4.sysctl_tcp_max_reordering = 300;
3188 net->ipv4.sysctl_tcp_dsack = 1;
3189 net->ipv4.sysctl_tcp_app_win = 31;
3190 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3191 net->ipv4.sysctl_tcp_frto = 2;
3192 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3193 /* This limits the percentage of the congestion window which we
3194 * will allow a single TSO frame to consume. Building TSO frames
3195 * which are too large can cause TCP streams to be bursty.
3197 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3198 /* Default TSQ limit of 16 TSO segments */
3199 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3200 /* rfc5961 challenge ack rate limiting */
3201 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3202 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3203 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3204 net->ipv4.sysctl_tcp_autocorking = 1;
3205 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3206 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3207 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3208 if (net != &init_net) {
3209 memcpy(net->ipv4.sysctl_tcp_rmem,
3210 init_net.ipv4.sysctl_tcp_rmem,
3211 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3212 memcpy(net->ipv4.sysctl_tcp_wmem,
3213 init_net.ipv4.sysctl_tcp_wmem,
3214 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3216 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3217 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3218 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3219 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3220 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3221 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3223 /* Reno is always built in */
3224 if (!net_eq(net, &init_net) &&
3225 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3226 init_net.ipv4.tcp_congestion_control->owner))
3227 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3229 net->ipv4.tcp_congestion_control = &tcp_reno;
/* Batched per-netns exit for a list of dying namespaces.
 *
 * @net_exit_list: namespaces being torn down, linked through exit_list.
 *
 * Purges AF_INET timewait sockets from tcp_hashinfo once for the whole
 * batch, then destroys the TCP Fast Open context of every namespace on
 * the list.
 */
3238 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3242 inet_twsk_purge(&tcp_hashinfo, AF_INET);
3244 list_for_each_entry(net, net_exit_list, exit_list)
3245 tcp_fastopen_ctx_destroy(net);
3248 static struct pernet_operations __net_initdata tcp_sk_ops = {
3249 .init = tcp_sk_init,
3250 .exit = tcp_sk_exit,
3251 .exit_batch = tcp_sk_exit_batch,
3254 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declare the "tcp" bpf_iter entry point; its arguments mirror the
 * members of struct bpf_iter__tcp.
 */
3255 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3256 struct sock_common *sk_common, uid_t uid)
/* Number of socket slots in the batch allocated by bpf_iter_init_tcp(). */
3258 #define INIT_BATCH_SZ 16
/* Initialise the private state of a new BPF TCP iterator instance.
 *
 * @priv_data: the instance's struct bpf_tcp_iter_state.
 * @aux:       auxiliary info forwarded to bpf_iter_init_seq_net().
 *
 * Sets up the netns part with bpf_iter_init_seq_net() and then allocates
 * the initial batch of INIT_BATCH_SZ slots.  bpf_iter_fini_seq_net() is
 * called to undo the first step (presumably only when the batch
 * allocation fails - confirm).
 */
3260 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3262 struct bpf_tcp_iter_state *iter = priv_data;
3265 err = bpf_iter_init_seq_net(priv_data, aux);
3269 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3271 bpf_iter_fini_seq_net(priv_data);
3278 static void bpf_iter_fini_tcp(void *priv_data)
3280 struct bpf_tcp_iter_state *iter = priv_data;
3282 bpf_iter_fini_seq_net(priv_data);
3283 kvfree(iter->batch);
3286 static const struct bpf_iter_seq_info tcp_seq_info = {
3287 .seq_ops = &bpf_iter_tcp_seq_ops,
3288 .init_seq_private = bpf_iter_init_tcp,
3289 .fini_seq_private = bpf_iter_fini_tcp,
3290 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
/* Map a helper id to the prototype offered to TCP iterator programs.
 *
 * @func_id: helper being requested.
 * @prog:    program requesting it.
 *
 * BPF_FUNC_setsockopt and BPF_FUNC_getsockopt resolve to the bpf_sk_*
 * prototypes; the handling of other ids is not visible here.
 */
3293 static const struct bpf_func_proto *
3294 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3295 const struct bpf_prog *prog)
3298 case BPF_FUNC_setsockopt:
3299 return &bpf_sk_setsockopt_proto;
3300 case BPF_FUNC_getsockopt:
3301 return &bpf_sk_getsockopt_proto;
/* Registration record of the TCP bpf_iter target.
 *
 * Describes a single context argument - the sk_common member of
 * struct bpf_iter__tcp, typed PTR_TO_BTF_ID_OR_NULL - whose btf_id is
 * filled in by bpf_iter_register() before the record is registered.
 */
3307 static struct bpf_iter_reg tcp_reg_info = {
3309 .ctx_arg_info_size = 1,
3311 { offsetof(struct bpf_iter__tcp, sk_common),
3312 PTR_TO_BTF_ID_OR_NULL },
3314 .get_func_proto = bpf_iter_tcp_get_func_proto,
3315 .seq_info = &tcp_seq_info,
3318 static void __init bpf_iter_register(void)
3320 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3321 if (bpf_iter_reg_target(&tcp_reg_info))
3322 pr_warn("Warning: could not register bpf iterator tcp\n");
3327 void __init tcp_v4_init(void)
3329 if (register_pernet_subsys(&tcp_sk_ops))
3330 panic("Failed to create the TCP control socket.\n");
3332 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3333 bpf_iter_register();