// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *	David S. Miller	:	New socket lookup architecture.
 *				This code is dedicated to John Dyson.
 *	David S. Miller :	Change semantics of established hash,
 *				half is devoted to TIME_WAIT sockets
 *				and the rest go in the other half.
 *	Andi Kleen :		Add support for syncookies and fixed
 *				some bugs: ip options weren't passed to
 *				the TCP layer, missed a check for an
 *				ACK bit.
 *	Andi Kleen :		Implemented fast path mtu discovery.
 *				Fixed many serious bugs in the
 *				request_sock handling and moved
 *				most of it into the af independent code.
 *				Added tail drop and some other bugfixes.
 *				Added new listen semantics.
 *	Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:	ip_dynaddr bits
 *	Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov :	Transparent proxy revived after year
 *				coma.
 *	Andi Kleen	:	Fix new listen.
 *	Andi Kleen	:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option,
 *	Alexey Kuznetsov		which allows both IPv4 and IPv6 sockets
 *					to bind a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt
#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>
#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;

		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only the timestamp cache
	   is held not per host, but per port pair, and the TW bucket is used
	   as state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
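/*
 * Editor's illustrative sketch (not part of this file): why the reused
 * connection above picks write_seq = tw_snd_nxt + 65535 + 2.  The +65535
 * jumps past the largest window the old peer could still accept, and the
 * +2 covers the sequence numbers consumed by FIN and SYN, so stray old
 * segments cannot alias new data.  The helper name is hypothetical.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static uint32_t pick_reuse_isn(uint32_t tw_snd_nxt)
{
	uint32_t seq = tw_snd_nxt + 65535 + 2;	/* wraps mod 2^32 */

	return seq ? seq : 1;	/* 0 means "unset", as in tcp_twsk_unique() */
}

int main(void)
{
	/* A value near the wrap point yields 0, which is bumped to 1. */
	printf("%u\n", pick_reuse_isn(4294901759u));	/* prints 1 */
	return 0;
}
#endif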
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent the BPF program called below from accessing bytes that are
	 * outside of the bound specified by the user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However, we set the state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the hash
	 * tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
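/*
 * Editor's illustrative sketch (userspace, not kernel code): the syscall
 * sequence that reaches tcp_v4_connect() through the socket layer.  The
 * address and port are examples only.
 */
#if 0
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in dst;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(80);				/* example port */
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);		/* example addr */

	/* An addr_len smaller than sizeof(struct sockaddr_in) makes the
	 * kernel return -EINVAL before any route lookup, as checked above.
	 */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("connect");
	close(fd);
	return 0;
}
#endif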
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember the soft error
	 * in case this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
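/*
 * Editor's illustrative sketch: the arithmetic behind shrinking the MSS
 * when the path MTU drops.  A minimal simplification that only subtracts
 * the fixed IPv4 + TCP header cost; the real tcp_sync_mss() also accounts
 * for options.  The helper name is hypothetical.
 */
#if 0
#include <stdio.h>

static unsigned int mss_for_mtu(unsigned int mtu)
{
	return mtu - 20 /* struct iphdr */ - 20 /* struct tcphdr */;
}

int main(void)
{
	printf("%u\n", mss_for_mtu(1500));	/* 1460 */
	printf("%u\n", mss_for_mtu(1280));	/* 1240 */
	return 0;
}
#endif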
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);
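/*
 * Editor's illustrative sketch of the RFC 6069 revert above: each timeout
 * doubles the RTO (backoff++); an ICMP unreachable that shows the path is
 * merely congested, not dead, undoes one doubling (backoff--).  Values are
 * hypothetical.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned int rto = 200, backoff = 3;	/* milliseconds, example */

	printf("backed-off RTO: %u ms\n", rto << backoff);	/* 1600 */
	backoff--;						/* revert one step */
	printf("reverted RTO:   %u ms\n", rto << backoff);	/* 800 */
	return 0;
}
#endif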
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576 bytes, so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
	 * treated as hard errors (well, FRAG_FAILED too, but it is obsoleted
	 * by PMTU discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable and
	 * broken firewalls sit in every dark corner sending random errors
	 * ordered by their masters, even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
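/*
 * Editor's illustrative sketch: the "err > 0" encoding described in the
 * comment above tcp_v4_err() -- icmp type << 8 | icmp code -- and how a
 * consumer splits it back out.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned int type = 3, code = 4;	/* dest-unreach / frag-needed */
	unsigned int err = (type << 8) | code;

	printf("err=%u type=%u code=%u\n", err, err >> 8, err & 0xff);
	return 0;
}
#endif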
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
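/*
 * Editor's illustrative sketch: the ones'-complement sum over the IPv4
 * pseudo-header (saddr, daddr, protocol, TCP length) that the real
 * tcp_v4_check()/csum_tcpudp_nofold() helpers compute.  This is a
 * host-order simplification for clarity, not the kernel's optimized code.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static uint16_t csum_fold32(uint32_t sum)
{
	while (sum >> 16)	/* fold carries back into the low 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

static uint16_t pseudo_hdr_csum(uint32_t saddr, uint32_t daddr, uint16_t len)
{
	uint32_t sum = 0;

	sum += saddr >> 16;
	sum += saddr & 0xffff;
	sum += daddr >> 16;
	sum += daddr & 0xffff;
	sum += 6;	/* IPPROTO_TCP */
	sum += len;	/* TCP header + payload length */
	return csum_fold32(sum);
}

int main(void)
{
	/* 192.0.2.1 -> 192.0.2.2, 20-byte header-only segment */
	printf("0x%04x\n", pseudo_hdr_csum(0xc0000201, 0xc0000202, 20));
	return 0;
}
#endif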
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So we build the reply based only on parameters
 *		arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket. We do not loosen security here:
		 * the incoming packet is checked against the found md5 key;
		 * no RST is generated if the md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When the socket is gone, all binding information is lost.
	 * Routing might fail in this case. No choice here: if we choose to
	 * force the input interface, we will misroute in case of an
	 * asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
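/*
 * Editor's illustrative sketch of the RFC 793 reset rules applied above:
 * if the offending segment carried an ACK, the RST reuses that ACK as its
 * sequence number; otherwise the RST ACKs everything the segment occupied
 * (SYN and FIN each consume one sequence number).
 */
#if 0
#include <stdint.h>
#include <stdio.h>

struct seg { uint32_t seq, ack_seq; int syn, fin, ack; uint32_t payload; };

int main(void)
{
	struct seg in = { .seq = 1000, .syn = 1 };	/* a bare SYN */
	uint32_t rst_seq = in.ack ? in.ack_seq : 0;
	uint32_t rst_ack = in.seq + in.syn + in.fin + in.payload;

	printf("RST seq=%u ack=%u\n", rst_seq, rst_ack);	/* seq=0 ack=1001 */
	return 0;
}
#endif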
/* The code below, sending ACKs in SYN-RECV and TIME-WAIT states outside
   socket context, is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
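/*
 * Editor's illustrative sketch of the RFC 7323 rule quoted above: the
 * advertised 16-bit window is the real receive window right-shifted by
 * the negotiated scale, and the peer reconstructs it by shifting back.
 * Values are examples.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t rcv_wnd = 262144;	/* 256 KB receive window */
	uint8_t wscale = 7;		/* negotiated Rcv.Wind.Shift */
	uint16_t seg_wnd = rcv_wnd >> wscale;

	printf("SEG.WND=%u, peer reconstructs %u\n",
	       seg_wnd, (uint32_t)seg_wnd << wscale);	/* 2048, 262144 */
	return 0;
}
#endif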
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt));
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->l3index && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && (!best_match ||
			      key->prefixlen > best_match->prefixlen))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);
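/*
 * Editor's illustrative sketch (userspace): how a prefix of length /p
 * selects the network part of an IPv4 address, mirroring the
 * inet_make_mask() comparison in the lookup above.  Helper name and
 * addresses are hypothetical.
 */
#if 0
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

static int prefix_match(uint32_t a_be, uint32_t b_be, unsigned int prefixlen)
{
	/* prefixlen 0 matches everything; avoid the undefined 32-bit shift */
	uint32_t mask = prefixlen ? htonl(~0u << (32 - prefixlen)) : 0;

	return (a_be & mask) == (b_be & mask);
}

int main(void)
{
	uint32_t a, b;

	inet_pton(AF_INET, "192.0.2.17", &a);
	inet_pton(AF_INET, "192.0.2.200", &b);
	printf("/24 match: %d\n", prefix_match(a, b, 24));	/* 1 */
	printf("/28 match: %d\n", prefix_match(a, b, 28));	/* 0 */
	return 0;
}
#endif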
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->l3index && key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);
static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 char __user *optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;
		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
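/*
 * Editor's illustrative sketch (userspace): installing an RFC 2385 key
 * via the TCP_MD5SIG socket option that the parser above services.  The
 * peer address and key are examples only.
 */
#if 0
#include <arpa/inet.h>
#include <linux/tcp.h>		/* struct tcp_md5sig, TCP_MD5SIG */
#include <string.h>
#include <sys/socket.h>

int install_md5_key(int fd)
{
	struct tcp_md5sig md5;
	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;

	memset(&md5, 0, sizeof(md5));
	sin->sin_family = AF_INET;
	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);	/* peer */
	md5.tcpm_keylen = 6;
	memcpy(md5.tcpm_key, "secret", 6);

	/* tcpm_keylen == 0 would delete the key instead (see above) */
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif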
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);

		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif
/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb,
				    int dif, int sdif)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	const union tcp_md5_addr *addr;
	unsigned char newhash[16];
	int genhash, l3index;

	/* sdif set, means packet ingressed via a device
	 * in an L3 domain and dif is set to the l3mdev
	 */
	l3index = sdif ? dif : 0;

	addr = (union tcp_md5_addr *)&iph->saddr;
	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "", l3index);
		return true;
	}
#endif
	return false;
}
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer SYNs sent to broadcast or multicast addresses */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
			       key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;
	shinfo = skb_shinfo(skb);

	if (!shinfo->gso_size)
		shinfo->gso_size = skb->len - hdrlen;

	if (!shinfo->gso_segs)
		shinfo->gso_segs = 1;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);
	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		thtail->window = th->window;

		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
						 skb_shinfo(tail)->gso_size);

		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	/* Only the socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlogs are possibly concurrently non empty.
	 */
	limit += 64*1024;

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
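/*
 * Editor's illustrative sketch: the backlog limit used above is
 * rcvbuf + sndbuf plus 64KB of headroom, so a socket with these example
 * buffer sizes can hold roughly that many bytes of not-yet-processed
 * segments before dropping.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned int rcvbuf = 131072, sndbuf = 16384;	/* example values */
	unsigned int limit = rcvbuf + sndbuf + 64 * 1024;

	printf("backlog limit: %u bytes\n", limit);	/* 213056 */
	return 0;
}
#endif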
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;

	return sk_filter_trim_cap(sk, skb, th->doff * 4);
}
EXPORT_SYMBOL(tcp_filter);
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
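/*
 * Editor's illustrative sketch: the end_seq accounting computed above.
 * SYN and FIN each occupy one sequence number in addition to the payload.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t seq = 5000, payload = 100;
	int syn = 0, fin = 1;
	uint32_t end_seq = seq + syn + fin + payload;

	printf("end_seq=%u\n", end_seq);	/* 5101 */
	return 0;
}
#endif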
/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct sk_buff *skb_to_free;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		skb_to_free = sk->sk_rx_skb_cache;
		sk->sk_rx_skb_cache = NULL;
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		if (tcp_add_backlog(sk, skb))
			goto discard_and_relse;
		skb_to_free = NULL;
	}
	bh_unlock_sock(sk);
	if (skb_to_free)
		__kfree_skb(skb_to_free);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		/* fall through */
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get next listener socket following cur.  If cur is NULL, get the first
 * socket starting from the bucket given in st->bucket; when st->bucket is
 * zero the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_nulls_head(&ilb->nulls_head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == afinfo->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != afinfo->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == afinfo->family &&
		    net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* fall through */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);
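/*
 * A minimal userspace consumer of the interface above (a sketch, not
 * part of this file; assumes only libc and a mounted /proc):
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char line[512];
 *		FILE *f = fopen("/proc/net/tcp", "r");
 *
 *		if (!f)
 *			return 1;
 *		while (fgets(line, sizeof(line), f))
 *			fputs(line, stdout);
 *		fclose(f);
 *		return 0;
 *	}
 *
 * Each underlying read() re-enters tcp_seq_start(); for a sequential
 * reader like this one, the *pos == st->last_pos fast path avoids
 * rescanning the hash tables from the start.
 */
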
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
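/*
 * A record printed by the function above follows the same column layout
 * as the header in tcp4_seq_show().  The 32-bit address fields are the
 * raw __be32 words rendered with %08X, so on a little-endian host the
 * bytes read reversed ("0100007F" is 127.0.0.1); ports are host-order
 * hex, and "st" for an open request is always TCP_SYN_RECV (03).
 */
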
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
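/*
 * A consumer-side sketch (not part of this file) of how the fixed-width
 * fields above decode; "line" is assumed to hold one record read from
 * /proc/net/tcp:
 *
 *	unsigned int sl, local_ip, local_port, rem_ip, rem_port, state;
 *
 *	if (sscanf(line, "%u: %8X:%4X %8X:%4X %2X",
 *		   &sl, &local_ip, &local_port,
 *		   &rem_ip, &rem_port, &state) == 6)
 *		printf("state %#x\n", state);
 *
 * tx_queue is write_seq - snd_una (bytes sent but not yet acked), and
 * rx_queue is clamped at zero because the unlocked snapshot above can
 * transiently go negative.
 */
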
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};
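/*
 * fs/seq_file.c drives these callbacks on every read(); schematically
 * (a sketch of the real traverse loop, which also handles buffer
 * overflow and error returns):
 *
 *	p = op->start(m, &index);
 *	while (p) {
 *		op->show(m, p);
 *		p = op->next(m, p, &index);
 *	}
 *	op->stop(m, p);
 *
 * This is why tcp_seq_stop() must tolerate any cursor value, including
 * SEQ_START_TOKEN and NULL.
 */
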
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
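/*
 * Nothing in this file registers tcp_prot: during boot, inet_init()
 * (net/ipv4/af_inet.c) calls proto_register(&tcp_prot, 1) and then
 * attaches it to the SOCK_STREAM/IPPROTO_TCP slot via
 * inet_register_protosw(), so socket(AF_INET, SOCK_STREAM, 0)
 * dispatches through the table above.
 */
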
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}
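/*
 * The per-cpu sockets destroyed here are the kernel-internal control
 * sockets that tcp_v4_send_reset() and tcp_v4_send_ack() transmit
 * from; they are never hashed and never visible to userspace.
 */
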
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
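/*
 * All of the sysctl_tcp_* defaults above are per network namespace and
 * surface under /proc/sys/net/ipv4/.  A userspace probe (a sketch, not
 * part of this file) for one of them:
 *
 *	char buf[16];
 *	FILE *f = fopen("/proc/sys/net/ipv4/tcp_syn_retries", "r");
 *
 *	if (f) {
 *		if (fgets(buf, sizeof(buf), f))
 *			printf("tcp_syn_retries = %s", buf);
 *		fclose(f);
 *	}
 *
 * In a fresh namespace this reads back TCP_SYN_RETRIES (6).
 */
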
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}
static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};
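/*
 * tcp_v4_init() runs from inet_init() during boot, before any TCP
 * socket can exist; there is nothing to unwind at that point, so a
 * registration failure is fatal.
 */
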
void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}