2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
53 #define pr_fmt(fmt) "TCP: " fmt
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
66 #include <net/net_namespace.h>
68 #include <net/inet_hashtables.h>
70 #include <net/transp_v6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
88 #include <trace/events/tcp.h>
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 __be32 daddr, __be32 saddr, const struct tcphdr *th);
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
100 return secure_tcp_seq(ip_hdr(skb)->daddr,
103 tcp_hdr(skb)->source);
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
108 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
113 const struct inet_timewait_sock *tw = inet_twsk(sktw);
114 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115 struct tcp_sock *tp = tcp_sk(sk);
116 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
119 /* Still does not detect *everything* that goes through
120 * lo, since we require a loopback src or dst address
121 * or direct binding to 'lo' interface.
123 bool loopback = false;
124 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
126 #if IS_ENABLED(CONFIG_IPV6)
127 if (tw->tw_family == AF_INET6) {
128 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129 (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
130 (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
131 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132 (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
133 (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
138 if (ipv4_is_loopback(tw->tw_daddr) ||
139 ipv4_is_loopback(tw->tw_rcv_saddr))
146 /* With PAWS, it is safe from the viewpoint
147 of data integrity. Even without PAWS it is safe provided sequence
148 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
150 Actually, the idea is close to VJ's, except that the timestamp cache is
151 held not per host but per port pair, and the TW bucket is used as state
154 If the TW bucket has already been destroyed we fall back to VJ's scheme
155 and use initial timestamp retrieved from peer table.
157 if (tcptw->tw_ts_recent_stamp &&
158 (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
159 /* In case of repair and re-using TIME-WAIT sockets we still
160 * want to be sure that it is safe as above but honor the
161 * sequence numbers and time stamps set as part of the repair
164 * Without this check re-using a TIME-WAIT socket with TCP
165 * repair would accumulate a -1 on the repair assigned
166 * sequence number. The first time it is reused the sequence
167 * is -1, the second time -2, etc. This fixes that issue
168 * without appearing to create any others.
170 if (likely(!tp->repair)) {
171 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
172 if (tp->write_seq == 0)
174 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
175 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
183 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
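/*
 * Illustrative sketch (not part of this file): tcp_twsk_unique() is what
 * lets the net.ipv4.tcp_tw_reuse sysctl take effect, allowing a new
 * outgoing connection to take over a port pair still held by a TIME-WAIT
 * socket when the timestamp checks above make that safe.  A minimal
 * userspace snippet toggling the knob (needs privilege, assumes procfs is
 * mounted at /proc):
 *
 *	int fd = open("/proc/sys/net/ipv4/tcp_tw_reuse", O_WRONLY);
 *	if (fd >= 0) {
 *		write(fd, "1", 1);	// 0 = off, 1 = on, 2 = loopback only
 *		close(fd);
 *	}
 *
 * The per-netns default of 2 (set in tcp_sk_init() below) restricts reuse
 * to the loopback case that the detection above computes.
 */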
185 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
188 /* This check is replicated from tcp_v4_connect() and intended to
189 * prevent the BPF program called below from accessing bytes that are
190 * outside the bound specified by the user in addr_len.
192 if (addr_len < sizeof(struct sockaddr_in))
195 sock_owned_by_me(sk);
197 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
200 /* This will initiate an outgoing connection. */
201 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
203 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204 struct inet_sock *inet = inet_sk(sk);
205 struct tcp_sock *tp = tcp_sk(sk);
206 __be16 orig_sport, orig_dport;
207 __be32 daddr, nexthop;
211 struct ip_options_rcu *inet_opt;
212 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
214 if (addr_len < sizeof(struct sockaddr_in))
217 if (usin->sin_family != AF_INET)
218 return -EAFNOSUPPORT;
220 nexthop = daddr = usin->sin_addr.s_addr;
221 inet_opt = rcu_dereference_protected(inet->inet_opt,
222 lockdep_sock_is_held(sk));
223 if (inet_opt && inet_opt->opt.srr) {
226 nexthop = inet_opt->opt.faddr;
229 orig_sport = inet->inet_sport;
230 orig_dport = usin->sin_port;
231 fl4 = &inet->cork.fl.u.ip4;
232 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
233 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
235 orig_sport, orig_dport, sk);
238 if (err == -ENETUNREACH)
239 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
243 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
248 if (!inet_opt || !inet_opt->opt.srr)
251 if (!inet->inet_saddr)
252 inet->inet_saddr = fl4->saddr;
253 sk_rcv_saddr_set(sk, inet->inet_saddr);
255 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
256 /* Reset inherited state */
257 tp->rx_opt.ts_recent = 0;
258 tp->rx_opt.ts_recent_stamp = 0;
259 if (likely(!tp->repair))
263 inet->inet_dport = usin->sin_port;
264 sk_daddr_set(sk, daddr);
266 inet_csk(sk)->icsk_ext_hdr_len = 0;
268 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
270 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
272 /* Socket identity is still unknown (sport may be zero).
273 * However we set state to SYN-SENT and, without releasing the socket
274 * lock, select a source port, enter ourselves into the hash tables and
275 * complete initialization after this.
277 tcp_set_state(sk, TCP_SYN_SENT);
278 err = inet_hash_connect(tcp_death_row, sk);
284 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
285 inet->inet_sport, inet->inet_dport, sk);
291 /* OK, now commit destination to socket. */
292 sk->sk_gso_type = SKB_GSO_TCPV4;
293 sk_setup_caps(sk, &rt->dst);
296 if (likely(!tp->repair)) {
298 tp->write_seq = secure_tcp_seq(inet->inet_saddr,
302 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
307 inet->inet_id = tp->write_seq ^ jiffies;
309 if (tcp_fastopen_defer_connect(sk, &err))
314 err = tcp_connect(sk);
323 * This unhashes the socket and releases the local port,
326 tcp_set_state(sk, TCP_CLOSE);
328 sk->sk_route_caps = 0;
329 inet->inet_dport = 0;
332 EXPORT_SYMBOL(tcp_v4_connect);
335 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
336 * It can be called through tcp_release_cb() if socket was owned by user
337 * at the time tcp_v4_err() was called to handle ICMP message.
339 void tcp_v4_mtu_reduced(struct sock *sk)
341 struct inet_sock *inet = inet_sk(sk);
342 struct dst_entry *dst;
345 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
347 mtu = tcp_sk(sk)->mtu_info;
348 dst = inet_csk_update_pmtu(sk, mtu);
352 /* Something is about to go wrong... Remember the soft error
353 * in case this connection is not able to recover.
355 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
356 sk->sk_err_soft = EMSGSIZE;
360 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
361 ip_sk_accept_pmtu(sk) &&
362 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
363 tcp_sync_mss(sk, mtu);
365 /* Resend the TCP packet because it's
366 * clear that the old packet has been
367 * dropped. This is the new "fast" path mtu
370 tcp_simple_retransmit(sk);
371 } /* else let the usual retransmit timer handle it */
373 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
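/*
 * Illustrative sketch (not part of this file): whether the code above is
 * allowed to shrink the MSS depends on the per-socket PMTU discovery mode
 * (inet->pmtudisc, consulted via ip_sk_accept_pmtu()).  From userspace that
 * mode is selected with the standard IP_MTU_DISCOVER socket option, e.g.:
 *
 *	int val = IP_PMTUDISC_DO;	// set DF, rely on ICMP_FRAG_NEEDED
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 *
 * With IP_PMTUDISC_DONT the condition above is false and tcp_sync_mss() is
 * never called for the incoming MTU indication.
 */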
375 static void do_redirect(struct sk_buff *skb, struct sock *sk)
377 struct dst_entry *dst = __sk_dst_check(sk, 0);
380 dst->ops->redirect(dst, sk, skb);
384 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
385 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
387 struct request_sock *req = inet_reqsk(sk);
388 struct net *net = sock_net(sk);
390 /* ICMPs are not backlogged, hence we cannot get
391 * an established socket here.
393 if (seq != tcp_rsk(req)->snt_isn) {
394 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
397 * Still in SYN_RECV, just remove it silently.
398 * There is no good way to pass the error to the newly
399 * created socket, and POSIX does not want network
400 * errors returned from accept().
402 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
403 tcp_listendrop(req->rsk_listener);
407 EXPORT_SYMBOL(tcp_req_err);
410 * This routine is called by the ICMP module when it gets some
411 * sort of error condition. If err < 0 then the socket should
412 * be closed and the error returned to the user. If err > 0
413 * it's just the icmp type << 8 | icmp code. After adjustment
414 * header points to the first 8 bytes of the tcp header. We need
415 * to find the appropriate port.
417 * The locking strategy used here is very "optimistic". When
418 * someone else accesses the socket the ICMP is just dropped
419 * and for some paths there is no check at all.
420 * A more general error queue to queue errors for later handling
421 * is probably better.
425 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
427 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
428 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
429 struct inet_connection_sock *icsk;
431 struct inet_sock *inet;
432 const int type = icmp_hdr(icmp_skb)->type;
433 const int code = icmp_hdr(icmp_skb)->code;
436 struct request_sock *fastopen;
441 struct net *net = dev_net(icmp_skb->dev);
443 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
444 th->dest, iph->saddr, ntohs(th->source),
445 inet_iif(icmp_skb), 0);
447 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
450 if (sk->sk_state == TCP_TIME_WAIT) {
451 inet_twsk_put(inet_twsk(sk));
454 seq = ntohl(th->seq);
455 if (sk->sk_state == TCP_NEW_SYN_RECV)
456 return tcp_req_err(sk, seq,
457 type == ICMP_PARAMETERPROB ||
458 type == ICMP_TIME_EXCEEDED ||
459 (type == ICMP_DEST_UNREACH &&
460 (code == ICMP_NET_UNREACH ||
461 code == ICMP_HOST_UNREACH)));
464 /* If too many ICMPs get dropped on busy
465 * servers this needs to be solved differently.
466 * We do take care of PMTU discovery (RFC1191) special case :
467 * we can receive locally generated ICMP messages while socket is held.
469 if (sock_owned_by_user(sk)) {
470 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
471 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
473 if (sk->sk_state == TCP_CLOSE)
476 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
477 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
483 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
484 fastopen = tp->fastopen_rsk;
485 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
486 if (sk->sk_state != TCP_LISTEN &&
487 !between(seq, snd_una, tp->snd_nxt)) {
488 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
494 if (!sock_owned_by_user(sk))
495 do_redirect(icmp_skb, sk);
497 case ICMP_SOURCE_QUENCH:
498 /* Just silently ignore these. */
500 case ICMP_PARAMETERPROB:
503 case ICMP_DEST_UNREACH:
504 if (code > NR_ICMP_UNREACH)
507 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
508 /* We are not interested in TCP_LISTEN and open_requests
509 * (SYN-ACKs sent out by Linux are always < 576 bytes so
510 * they should go through unfragmented).
512 if (sk->sk_state == TCP_LISTEN)
516 if (!sock_owned_by_user(sk)) {
517 tcp_v4_mtu_reduced(sk);
519 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
525 err = icmp_err_convert[code].errno;
526 /* check if icmp_skb allows revert of backoff
527 * (see draft-zimmermann-tcp-lcd) */
528 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
530 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
531 !icsk->icsk_backoff || fastopen)
534 if (sock_owned_by_user(sk))
537 icsk->icsk_backoff--;
538 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
540 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
542 skb = tcp_rtx_queue_head(sk);
545 tcp_mstamp_refresh(tp);
546 delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
547 remaining = icsk->icsk_rto -
548 usecs_to_jiffies(delta_us);
551 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
552 remaining, TCP_RTO_MAX);
554 /* RTO revert clocked out retransmission.
555 * Will retransmit now */
556 tcp_retransmit_timer(sk);
560 case ICMP_TIME_EXCEEDED:
567 switch (sk->sk_state) {
570 /* Only in fast or simultaneous open. If a fast open socket is
571 * already accepted it is treated as a connected one below.
573 if (fastopen && !fastopen->sk)
576 if (!sock_owned_by_user(sk)) {
579 sk->sk_error_report(sk);
583 sk->sk_err_soft = err;
588 /* If we've already connected we will keep trying
589 * until we time out, or the user gives up.
591 * rfc1122 4.2.3.9 allows us to consider as hard errors
592 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
593 * but it is obsoleted by pmtu discovery).
595 * Note that in the modern internet, where routing is unreliable
596 * and broken firewalls sit in every dark corner, sending random
597 * errors as ordered by their masters, even these two messages finally lose
598 * their original sense (even Linux sends invalid PORT_UNREACHs)
600 * Now we are in compliance with RFCs.
605 if (!sock_owned_by_user(sk) && inet->recverr) {
607 sk->sk_error_report(sk);
608 } else { /* Only an error on timeout */
609 sk->sk_err_soft = err;
617 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
619 struct tcphdr *th = tcp_hdr(skb);
621 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
622 skb->csum_start = skb_transport_header(skb) - skb->head;
623 skb->csum_offset = offsetof(struct tcphdr, check);
626 /* This routine computes an IPv4 TCP checksum. */
627 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
629 const struct inet_sock *inet = inet_sk(sk);
631 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
633 EXPORT_SYMBOL(tcp_v4_send_check);
636 * This routine will send an RST to the other tcp.
638 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
640 * Answer: if a packet caused an RST, it is not for a socket
641 * existing in our system; if it is matched to a socket,
642 * it is just a duplicate segment or a bug in the other side's TCP.
643 * So we build the reply based only on the parameters that
644 * arrived with the segment.
645 * Exception: precedence violation. We do not implement it in any case.
648 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
650 const struct tcphdr *th = tcp_hdr(skb);
653 #ifdef CONFIG_TCP_MD5SIG
654 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
657 struct ip_reply_arg arg;
658 #ifdef CONFIG_TCP_MD5SIG
659 struct tcp_md5sig_key *key = NULL;
660 const __u8 *hash_location = NULL;
661 unsigned char newhash[16];
663 struct sock *sk1 = NULL;
668 /* Never send a reset in response to a reset. */
672 /* If sk is not NULL, it means we did a successful lookup and the incoming
673 * route had to be correct. prequeue might have dropped our dst.
675 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
678 /* Swap the send and the receive. */
679 memset(&rep, 0, sizeof(rep));
680 rep.th.dest = th->source;
681 rep.th.source = th->dest;
682 rep.th.doff = sizeof(struct tcphdr) / 4;
686 rep.th.seq = th->ack_seq;
689 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
690 skb->len - (th->doff << 2));
693 memset(&arg, 0, sizeof(arg));
694 arg.iov[0].iov_base = (unsigned char *)&rep;
695 arg.iov[0].iov_len = sizeof(rep.th);
697 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
698 #ifdef CONFIG_TCP_MD5SIG
700 hash_location = tcp_parse_md5sig_option(th);
701 if (sk && sk_fullsock(sk)) {
702 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
703 &ip_hdr(skb)->saddr, AF_INET);
704 } else if (hash_location) {
706 * active side is lost. Try to find the listening socket through the
707 * source port, and then find the md5 key through the listening socket.
708 * We do not lose security here:
709 * the incoming packet is checked with the md5 hash of the found key,
710 * and no RST is generated if the md5 hash doesn't match.
712 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
714 th->source, ip_hdr(skb)->daddr,
715 ntohs(th->source), inet_iif(skb),
717 /* don't send rst if it can't find key */
721 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
722 &ip_hdr(skb)->saddr, AF_INET);
727 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
728 if (genhash || memcmp(hash_location, newhash, 16) != 0)
734 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
736 (TCPOPT_MD5SIG << 8) |
738 /* Update length and the length the header thinks exists */
739 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
740 rep.th.doff = arg.iov[0].iov_len / 4;
742 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
743 key, ip_hdr(skb)->saddr,
744 ip_hdr(skb)->daddr, &rep.th);
747 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
748 ip_hdr(skb)->saddr, /* XXX */
749 arg.iov[0].iov_len, IPPROTO_TCP, 0);
750 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
751 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
753 /* When the socket is gone, all binding information is lost and
754 * routing might fail in this case. No choice here: if we choose to force the
755 * input interface, we will misroute in case of an asymmetric route.
758 arg.bound_dev_if = sk->sk_bound_dev_if;
760 trace_tcp_send_reset(sk, skb);
763 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
764 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
766 arg.tos = ip_hdr(skb)->tos;
767 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
769 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
771 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
772 inet_twsk(sk)->tw_mark : sk->sk_mark;
773 ip_send_unicast_reply(ctl_sk,
774 skb, &TCP_SKB_CB(skb)->header.h4.opt,
775 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
776 &arg, arg.iov[0].iov_len);
779 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
780 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
783 #ifdef CONFIG_TCP_MD5SIG
789 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
790 outside socket context, is certainly ugly. What can I do?
793 static void tcp_v4_send_ack(const struct sock *sk,
794 struct sk_buff *skb, u32 seq, u32 ack,
795 u32 win, u32 tsval, u32 tsecr, int oif,
796 struct tcp_md5sig_key *key,
797 int reply_flags, u8 tos)
799 const struct tcphdr *th = tcp_hdr(skb);
802 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
803 #ifdef CONFIG_TCP_MD5SIG
804 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
808 struct net *net = sock_net(sk);
809 struct ip_reply_arg arg;
812 memset(&rep.th, 0, sizeof(struct tcphdr));
813 memset(&arg, 0, sizeof(arg));
815 arg.iov[0].iov_base = (unsigned char *)&rep;
816 arg.iov[0].iov_len = sizeof(rep.th);
818 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
819 (TCPOPT_TIMESTAMP << 8) |
821 rep.opt[1] = htonl(tsval);
822 rep.opt[2] = htonl(tsecr);
823 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
826 /* Swap the send and the receive. */
827 rep.th.dest = th->source;
828 rep.th.source = th->dest;
829 rep.th.doff = arg.iov[0].iov_len / 4;
830 rep.th.seq = htonl(seq);
831 rep.th.ack_seq = htonl(ack);
833 rep.th.window = htons(win);
835 #ifdef CONFIG_TCP_MD5SIG
837 int offset = (tsecr) ? 3 : 0;
839 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
841 (TCPOPT_MD5SIG << 8) |
843 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
844 rep.th.doff = arg.iov[0].iov_len/4;
846 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
847 key, ip_hdr(skb)->saddr,
848 ip_hdr(skb)->daddr, &rep.th);
851 arg.flags = reply_flags;
852 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
853 ip_hdr(skb)->saddr, /* XXX */
854 arg.iov[0].iov_len, IPPROTO_TCP, 0);
855 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
857 arg.bound_dev_if = oif;
859 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
861 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
863 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
864 inet_twsk(sk)->tw_mark : sk->sk_mark;
865 ip_send_unicast_reply(ctl_sk,
866 skb, &TCP_SKB_CB(skb)->header.h4.opt,
867 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
868 &arg, arg.iov[0].iov_len);
871 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
875 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
877 struct inet_timewait_sock *tw = inet_twsk(sk);
878 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
880 tcp_v4_send_ack(sk, skb,
881 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
882 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
883 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
886 tcp_twsk_md5_key(tcptw),
887 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
894 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
895 struct request_sock *req)
897 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
898 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
900 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
904 * The window field (SEG.WND) of every outgoing segment, with the
905 * exception of <SYN> segments, MUST be right-shifted by
906 * Rcv.Wind.Shift bits:
908 tcp_v4_send_ack(sk, skb, seq,
909 tcp_rsk(req)->rcv_nxt,
910 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
911 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
914 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
916 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
921 * Send a SYN-ACK after having received a SYN.
922 * This still operates on a request_sock only, not on a big
925 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
927 struct request_sock *req,
928 struct tcp_fastopen_cookie *foc,
929 enum tcp_synack_type synack_type)
931 const struct inet_request_sock *ireq = inet_rsk(req);
936 /* First, grab a route. */
937 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
940 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
943 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
945 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
947 ireq_opt_deref(ireq));
948 err = net_xmit_eval(err);
955 * IPv4 request_sock destructor.
957 static void tcp_v4_reqsk_destructor(struct request_sock *req)
959 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
962 #ifdef CONFIG_TCP_MD5SIG
964 * RFC2385 MD5 checksumming requires a mapping of
965 * IP address->MD5 Key.
966 * We need to maintain these in the sk structure.
969 /* Find the Key structure for an address. */
970 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
971 const union tcp_md5_addr *addr,
974 const struct tcp_sock *tp = tcp_sk(sk);
975 struct tcp_md5sig_key *key;
976 const struct tcp_md5sig_info *md5sig;
978 struct tcp_md5sig_key *best_match = NULL;
981 /* caller either holds rcu_read_lock() or socket lock */
982 md5sig = rcu_dereference_check(tp->md5sig_info,
983 lockdep_sock_is_held(sk));
987 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
988 if (key->family != family)
991 if (family == AF_INET) {
992 mask = inet_make_mask(key->prefixlen);
993 match = (key->addr.a4.s_addr & mask) ==
994 (addr->a4.s_addr & mask);
995 #if IS_ENABLED(CONFIG_IPV6)
996 } else if (family == AF_INET6) {
997 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1004 if (match && (!best_match ||
1005 key->prefixlen > best_match->prefixlen))
1010 EXPORT_SYMBOL(tcp_md5_do_lookup);
1012 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1013 const union tcp_md5_addr *addr,
1014 int family, u8 prefixlen)
1016 const struct tcp_sock *tp = tcp_sk(sk);
1017 struct tcp_md5sig_key *key;
1018 unsigned int size = sizeof(struct in_addr);
1019 const struct tcp_md5sig_info *md5sig;
1021 /* caller either holds rcu_read_lock() or socket lock */
1022 md5sig = rcu_dereference_check(tp->md5sig_info,
1023 lockdep_sock_is_held(sk));
1026 #if IS_ENABLED(CONFIG_IPV6)
1027 if (family == AF_INET6)
1028 size = sizeof(struct in6_addr);
1030 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1031 if (key->family != family)
1033 if (!memcmp(&key->addr, addr, size) &&
1034 key->prefixlen == prefixlen)
1040 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1041 const struct sock *addr_sk)
1043 const union tcp_md5_addr *addr;
1045 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1046 return tcp_md5_do_lookup(sk, addr, AF_INET);
1048 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1050 /* This can be called on a newly created socket, from other files */
1051 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1052 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1055 /* Add Key to the list */
1056 struct tcp_md5sig_key *key;
1057 struct tcp_sock *tp = tcp_sk(sk);
1058 struct tcp_md5sig_info *md5sig;
1060 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1062 /* Pre-existing entry - just update that one. */
1063 memcpy(key->key, newkey, newkeylen);
1064 key->keylen = newkeylen;
1068 md5sig = rcu_dereference_protected(tp->md5sig_info,
1069 lockdep_sock_is_held(sk));
1071 md5sig = kmalloc(sizeof(*md5sig), gfp);
1075 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1076 INIT_HLIST_HEAD(&md5sig->head);
1077 rcu_assign_pointer(tp->md5sig_info, md5sig);
1080 key = sock_kmalloc(sk, sizeof(*key), gfp);
1083 if (!tcp_alloc_md5sig_pool()) {
1084 sock_kfree_s(sk, key, sizeof(*key));
1088 memcpy(key->key, newkey, newkeylen);
1089 key->keylen = newkeylen;
1090 key->family = family;
1091 key->prefixlen = prefixlen;
1092 memcpy(&key->addr, addr,
1093 (family == AF_INET6) ? sizeof(struct in6_addr) :
1094 sizeof(struct in_addr));
1095 hlist_add_head_rcu(&key->node, &md5sig->head);
1098 EXPORT_SYMBOL(tcp_md5_do_add);
1100 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1103 struct tcp_md5sig_key *key;
1105 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1108 hlist_del_rcu(&key->node);
1109 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1110 kfree_rcu(key, rcu);
1113 EXPORT_SYMBOL(tcp_md5_do_del);
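/*
 * Illustrative sketch (not part of this file): the add/del helpers above
 * are reached from tcp_v4_parse_md5_keys() below, i.e. from the TCP_MD5SIG
 * and TCP_MD5SIG_EXT socket options.  A minimal userspace sketch installing
 * an RFC 2385 key for a peer, assuming the usual <netinet/tcp.h> and
 * <arpa/inet.h> definitions (error handling omitted):
 *
 *	struct tcp_md5sig md5 = { 0 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	md5.tcpm_keylen = strlen("secret");
 *	memcpy(md5.tcpm_key, "secret", md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 deletes the key, which is the tcp_md5_do_del()
 * path above.
 */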
1115 static void tcp_clear_md5_list(struct sock *sk)
1117 struct tcp_sock *tp = tcp_sk(sk);
1118 struct tcp_md5sig_key *key;
1119 struct hlist_node *n;
1120 struct tcp_md5sig_info *md5sig;
1122 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1124 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1125 hlist_del_rcu(&key->node);
1126 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1127 kfree_rcu(key, rcu);
1131 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1132 char __user *optval, int optlen)
1134 struct tcp_md5sig cmd;
1135 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1138 if (optlen < sizeof(cmd))
1141 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1144 if (sin->sin_family != AF_INET)
1147 if (optname == TCP_MD5SIG_EXT &&
1148 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1149 prefixlen = cmd.tcpm_prefixlen;
1154 if (!cmd.tcpm_keylen)
1155 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1156 AF_INET, prefixlen);
1158 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1161 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1162 AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1166 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1167 __be32 daddr, __be32 saddr,
1168 const struct tcphdr *th, int nbytes)
1170 struct tcp4_pseudohdr *bp;
1171 struct scatterlist sg;
1178 bp->protocol = IPPROTO_TCP;
1179 bp->len = cpu_to_be16(nbytes);
1181 _th = (struct tcphdr *)(bp + 1);
1182 memcpy(_th, th, sizeof(*th));
1185 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1186 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1187 sizeof(*bp) + sizeof(*th));
1188 return crypto_ahash_update(hp->md5_req);
1191 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1192 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1194 struct tcp_md5sig_pool *hp;
1195 struct ahash_request *req;
1197 hp = tcp_get_md5sig_pool();
1199 goto clear_hash_noput;
1202 if (crypto_ahash_init(req))
1204 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1206 if (tcp_md5_hash_key(hp, key))
1208 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1209 if (crypto_ahash_final(req))
1212 tcp_put_md5sig_pool();
1216 tcp_put_md5sig_pool();
1218 memset(md5_hash, 0, 16);
1222 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1223 const struct sock *sk,
1224 const struct sk_buff *skb)
1226 struct tcp_md5sig_pool *hp;
1227 struct ahash_request *req;
1228 const struct tcphdr *th = tcp_hdr(skb);
1229 __be32 saddr, daddr;
1231 if (sk) { /* valid for establish/request sockets */
1232 saddr = sk->sk_rcv_saddr;
1233 daddr = sk->sk_daddr;
1235 const struct iphdr *iph = ip_hdr(skb);
1240 hp = tcp_get_md5sig_pool();
1242 goto clear_hash_noput;
1245 if (crypto_ahash_init(req))
1248 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1250 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1252 if (tcp_md5_hash_key(hp, key))
1254 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1255 if (crypto_ahash_final(req))
1258 tcp_put_md5sig_pool();
1262 tcp_put_md5sig_pool();
1264 memset(md5_hash, 0, 16);
1267 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1271 /* Called with rcu_read_lock() */
1272 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1273 const struct sk_buff *skb)
1275 #ifdef CONFIG_TCP_MD5SIG
1277 * This gets called for each TCP segment that arrives
1278 * so we want to be efficient.
1279 * We have 3 drop cases:
1280 * o No MD5 hash and one expected.
1281 * o MD5 hash and we're not expecting one.
1282 * o MD5 hash and it's wrong.
1284 const __u8 *hash_location = NULL;
1285 struct tcp_md5sig_key *hash_expected;
1286 const struct iphdr *iph = ip_hdr(skb);
1287 const struct tcphdr *th = tcp_hdr(skb);
1289 unsigned char newhash[16];
1291 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1293 hash_location = tcp_parse_md5sig_option(th);
1295 /* We've parsed the options - do we have a hash? */
1296 if (!hash_expected && !hash_location)
1299 if (hash_expected && !hash_location) {
1300 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1304 if (!hash_expected && hash_location) {
1305 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1309 /* Okay, so this is hash_expected and hash_location -
1310 * so we need to calculate the checksum.
1312 genhash = tcp_v4_md5_hash_skb(newhash,
1316 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1317 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1318 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1319 &iph->saddr, ntohs(th->source),
1320 &iph->daddr, ntohs(th->dest),
1321 genhash ? " tcp_v4_calc_md5_hash failed"
1330 static void tcp_v4_init_req(struct request_sock *req,
1331 const struct sock *sk_listener,
1332 struct sk_buff *skb)
1334 struct inet_request_sock *ireq = inet_rsk(req);
1335 struct net *net = sock_net(sk_listener);
1337 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1338 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1339 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1342 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1344 const struct request_sock *req)
1346 return inet_csk_route_req(sk, &fl->u.ip4, req);
1349 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1351 .obj_size = sizeof(struct tcp_request_sock),
1352 .rtx_syn_ack = tcp_rtx_synack,
1353 .send_ack = tcp_v4_reqsk_send_ack,
1354 .destructor = tcp_v4_reqsk_destructor,
1355 .send_reset = tcp_v4_send_reset,
1356 .syn_ack_timeout = tcp_syn_ack_timeout,
1359 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1360 .mss_clamp = TCP_MSS_DEFAULT,
1361 #ifdef CONFIG_TCP_MD5SIG
1362 .req_md5_lookup = tcp_v4_md5_lookup,
1363 .calc_md5_hash = tcp_v4_md5_hash_skb,
1365 .init_req = tcp_v4_init_req,
1366 #ifdef CONFIG_SYN_COOKIES
1367 .cookie_init_seq = cookie_v4_init_sequence,
1369 .route_req = tcp_v4_route_req,
1370 .init_seq = tcp_v4_init_seq,
1371 .init_ts_off = tcp_v4_init_ts_off,
1372 .send_synack = tcp_v4_send_synack,
1375 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1377 /* Never answer SYNs sent to broadcast or multicast */
1378 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1381 return tcp_conn_request(&tcp_request_sock_ops,
1382 &tcp_request_sock_ipv4_ops, sk, skb);
1388 EXPORT_SYMBOL(tcp_v4_conn_request);
1392 * The three way handshake has completed - we got a valid ACK -
1393 * now create the new socket.
1395 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1396 struct request_sock *req,
1397 struct dst_entry *dst,
1398 struct request_sock *req_unhash,
1401 struct inet_request_sock *ireq;
1402 struct inet_sock *newinet;
1403 struct tcp_sock *newtp;
1405 #ifdef CONFIG_TCP_MD5SIG
1406 struct tcp_md5sig_key *key;
1408 struct ip_options_rcu *inet_opt;
1410 if (sk_acceptq_is_full(sk))
1413 newsk = tcp_create_openreq_child(sk, req, skb);
1417 newsk->sk_gso_type = SKB_GSO_TCPV4;
1418 inet_sk_rx_dst_set(newsk, skb);
1420 newtp = tcp_sk(newsk);
1421 newinet = inet_sk(newsk);
1422 ireq = inet_rsk(req);
1423 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1424 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1425 newsk->sk_bound_dev_if = ireq->ir_iif;
1426 newinet->inet_saddr = ireq->ir_loc_addr;
1427 inet_opt = rcu_dereference(ireq->ireq_opt);
1428 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1429 newinet->mc_index = inet_iif(skb);
1430 newinet->mc_ttl = ip_hdr(skb)->ttl;
1431 newinet->rcv_tos = ip_hdr(skb)->tos;
1432 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1434 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1435 newinet->inet_id = newtp->write_seq ^ jiffies;
1438 dst = inet_csk_route_child_sock(sk, newsk, req);
1442 /* syncookie case : see end of cookie_v4_check() */
1444 sk_setup_caps(newsk, dst);
1446 tcp_ca_openreq_child(newsk, dst);
1448 tcp_sync_mss(newsk, dst_mtu(dst));
1449 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1451 tcp_initialize_rcv_mss(newsk);
1453 #ifdef CONFIG_TCP_MD5SIG
1454 /* Copy over the MD5 key from the original socket */
1455 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1459 * We're using one, so create a matching key
1460 * on the newsk structure. If we fail to get
1461 * memory, then we end up not copying the key
1464 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1465 AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1466 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1470 if (__inet_inherit_port(sk, newsk) < 0)
1472 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1473 if (likely(*own_req)) {
1474 tcp_move_syn(newtp, req);
1475 ireq->ireq_opt = NULL;
1477 newinet->inet_opt = NULL;
1482 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1489 newinet->inet_opt = NULL;
1490 inet_csk_prepare_forced_close(newsk);
1494 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1496 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1498 #ifdef CONFIG_SYN_COOKIES
1499 const struct tcphdr *th = tcp_hdr(skb);
1502 sk = cookie_v4_check(sk, skb);
1507 /* The socket must have its spinlock held when we get
1508 * here, unless it is a TCP_LISTEN socket.
1510 * We have a potential double-lock case here, so even when
1511 * doing backlog processing we use the BH locking scheme.
1512 * This is because we cannot sleep with the original spinlock
1515 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1519 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1520 struct dst_entry *dst = sk->sk_rx_dst;
1522 sock_rps_save_rxhash(sk, skb);
1523 sk_mark_napi_id(sk, skb);
1525 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1526 !dst->ops->check(dst, 0)) {
1528 sk->sk_rx_dst = NULL;
1531 tcp_rcv_established(sk, skb);
1535 if (tcp_checksum_complete(skb))
1538 if (sk->sk_state == TCP_LISTEN) {
1539 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1544 if (tcp_child_process(sk, nsk, skb)) {
1551 sock_rps_save_rxhash(sk, skb);
1553 if (tcp_rcv_state_process(sk, skb)) {
1560 tcp_v4_send_reset(rsk, skb);
1563 /* Be careful here. If this function gets more complicated and
1564 * gcc suffers from register pressure on the x86, sk (in %ebx)
1565 * might be destroyed here. This current version compiles correctly,
1566 * but you have been warned.
1571 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1572 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1575 EXPORT_SYMBOL(tcp_v4_do_rcv);
1577 int tcp_v4_early_demux(struct sk_buff *skb)
1579 const struct iphdr *iph;
1580 const struct tcphdr *th;
1583 if (skb->pkt_type != PACKET_HOST)
1586 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1592 if (th->doff < sizeof(struct tcphdr) / 4)
1595 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1596 iph->saddr, th->source,
1597 iph->daddr, ntohs(th->dest),
1598 skb->skb_iif, inet_sdif(skb));
1601 skb->destructor = sock_edemux;
1602 if (sk_fullsock(sk)) {
1603 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1606 dst = dst_check(dst, 0);
1608 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1609 skb_dst_set_noref(skb, dst);
1615 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1617 u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1619 /* Only the socket owner can try to collapse/prune rx queues
1620 * to reduce memory overhead, so add a little headroom here.
1621 * Few socket backlogs are likely to be non-empty concurrently.
1625 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1626 * we can fix skb->truesize to its real value to avoid future drops.
1627 * This is valid because skb is not yet charged to the socket.
1628 * It has been noticed pure SACK packets were sometimes dropped
1629 * (if cooked by drivers without copybreak feature).
1633 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1635 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1640 EXPORT_SYMBOL(tcp_add_backlog);
1642 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1644 struct tcphdr *th = (struct tcphdr *)skb->data;
1645 unsigned int eaten = skb->len;
1648 err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1651 TCP_SKB_CB(skb)->end_seq -= eaten;
1655 EXPORT_SYMBOL(tcp_filter);
1657 static void tcp_v4_restore_cb(struct sk_buff *skb)
1659 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1660 sizeof(struct inet_skb_parm));
1663 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1664 const struct tcphdr *th)
1666 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1667 * barrier() makes sure the compiler won't play fool^Waliasing games.
1669 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1670 sizeof(struct inet_skb_parm));
1673 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1674 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1675 skb->len - th->doff * 4);
1676 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1677 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1678 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1679 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1680 TCP_SKB_CB(skb)->sacked = 0;
1681 TCP_SKB_CB(skb)->has_rxtstamp =
1682 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1689 int tcp_v4_rcv(struct sk_buff *skb)
1691 struct net *net = dev_net(skb->dev);
1692 int sdif = inet_sdif(skb);
1693 const struct iphdr *iph;
1694 const struct tcphdr *th;
1699 if (skb->pkt_type != PACKET_HOST)
1702 /* Count it even if it's bad */
1703 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1705 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1708 th = (const struct tcphdr *)skb->data;
1710 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1712 if (!pskb_may_pull(skb, th->doff * 4))
1715 /* An explanation is required here, I think.
1716 * Packet length and doff are validated by header prediction,
1717 * provided the case of th->doff==0 is eliminated.
1718 * So, we defer the checks. */
1720 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1723 th = (const struct tcphdr *)skb->data;
1726 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1727 th->dest, sdif, &refcounted);
1732 if (sk->sk_state == TCP_TIME_WAIT)
1735 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1736 struct request_sock *req = inet_reqsk(sk);
1737 bool req_stolen = false;
1740 sk = req->rsk_listener;
1741 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1742 sk_drops_add(sk, skb);
1746 if (tcp_checksum_complete(skb)) {
1750 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1751 inet_csk_reqsk_queue_drop_and_put(sk, req);
1754 /* We own a reference on the listener, increase it again
1755 * as we might lose it too soon.
1760 if (!tcp_filter(sk, skb)) {
1761 th = (const struct tcphdr *)skb->data;
1763 tcp_v4_fill_cb(skb, iph, th);
1764 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1769 /* Another cpu got exclusive access to req
1770 * and created a full blown socket.
1771 * Try to feed this packet to this socket
1772 * instead of discarding it.
1774 tcp_v4_restore_cb(skb);
1778 goto discard_and_relse;
1782 tcp_v4_restore_cb(skb);
1783 } else if (tcp_child_process(sk, nsk, skb)) {
1784 tcp_v4_send_reset(nsk, skb);
1785 goto discard_and_relse;
1791 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1792 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1793 goto discard_and_relse;
1796 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1797 goto discard_and_relse;
1799 if (tcp_v4_inbound_md5_hash(sk, skb))
1800 goto discard_and_relse;
1804 if (tcp_filter(sk, skb))
1805 goto discard_and_relse;
1806 th = (const struct tcphdr *)skb->data;
1808 tcp_v4_fill_cb(skb, iph, th);
1812 if (sk->sk_state == TCP_LISTEN) {
1813 ret = tcp_v4_do_rcv(sk, skb);
1814 goto put_and_return;
1817 sk_incoming_cpu_update(sk);
1819 bh_lock_sock_nested(sk);
1820 tcp_segs_in(tcp_sk(sk), skb);
1822 if (!sock_owned_by_user(sk)) {
1823 ret = tcp_v4_do_rcv(sk, skb);
1824 } else if (tcp_add_backlog(sk, skb)) {
1825 goto discard_and_relse;
1836 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1839 tcp_v4_fill_cb(skb, iph, th);
1841 if (tcp_checksum_complete(skb)) {
1843 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1845 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1847 tcp_v4_send_reset(NULL, skb);
1851 /* Discard frame. */
1856 sk_drops_add(sk, skb);
1862 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1863 inet_twsk_put(inet_twsk(sk));
1867 tcp_v4_fill_cb(skb, iph, th);
1869 if (tcp_checksum_complete(skb)) {
1870 inet_twsk_put(inet_twsk(sk));
1873 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1875 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1878 iph->saddr, th->source,
1879 iph->daddr, th->dest,
1883 inet_twsk_deschedule_put(inet_twsk(sk));
1885 tcp_v4_restore_cb(skb);
1893 tcp_v4_timewait_ack(sk, skb);
1896 tcp_v4_send_reset(sk, skb);
1897 inet_twsk_deschedule_put(inet_twsk(sk));
1899 case TCP_TW_SUCCESS:;
1904 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1905 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1906 .twsk_unique = tcp_twsk_unique,
1907 .twsk_destructor= tcp_twsk_destructor,
1910 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1912 struct dst_entry *dst = skb_dst(skb);
1914 if (dst && dst_hold_safe(dst)) {
1915 sk->sk_rx_dst = dst;
1916 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1919 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1921 const struct inet_connection_sock_af_ops ipv4_specific = {
1922 .queue_xmit = ip_queue_xmit,
1923 .send_check = tcp_v4_send_check,
1924 .rebuild_header = inet_sk_rebuild_header,
1925 .sk_rx_dst_set = inet_sk_rx_dst_set,
1926 .conn_request = tcp_v4_conn_request,
1927 .syn_recv_sock = tcp_v4_syn_recv_sock,
1928 .net_header_len = sizeof(struct iphdr),
1929 .setsockopt = ip_setsockopt,
1930 .getsockopt = ip_getsockopt,
1931 .addr2sockaddr = inet_csk_addr2sockaddr,
1932 .sockaddr_len = sizeof(struct sockaddr_in),
1933 #ifdef CONFIG_COMPAT
1934 .compat_setsockopt = compat_ip_setsockopt,
1935 .compat_getsockopt = compat_ip_getsockopt,
1937 .mtu_reduced = tcp_v4_mtu_reduced,
1939 EXPORT_SYMBOL(ipv4_specific);
1941 #ifdef CONFIG_TCP_MD5SIG
1942 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1943 .md5_lookup = tcp_v4_md5_lookup,
1944 .calc_md5_hash = tcp_v4_md5_hash_skb,
1945 .md5_parse = tcp_v4_parse_md5_keys,
1949 /* NOTE: A lot of things are set to zero explicitly by the call to
1950 * sk_alloc(), so they need not be done here.
1952 static int tcp_v4_init_sock(struct sock *sk)
1954 struct inet_connection_sock *icsk = inet_csk(sk);
1958 icsk->icsk_af_ops = &ipv4_specific;
1960 #ifdef CONFIG_TCP_MD5SIG
1961 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1967 void tcp_v4_destroy_sock(struct sock *sk)
1969 struct tcp_sock *tp = tcp_sk(sk);
1971 trace_tcp_destroy_sock(sk);
1973 tcp_clear_xmit_timers(sk);
1975 tcp_cleanup_congestion_control(sk);
1977 tcp_cleanup_ulp(sk);
1979 /* Clean up the write buffer. */
1980 tcp_write_queue_purge(sk);
1982 /* Check if we want to disable active TFO */
1983 tcp_fastopen_active_disable_ofo_check(sk);
1985 /* Cleans up our, hopefully empty, out_of_order_queue. */
1986 skb_rbtree_purge(&tp->out_of_order_queue);
1988 #ifdef CONFIG_TCP_MD5SIG
1989 /* Clean up the MD5 key list, if any */
1990 if (tp->md5sig_info) {
1991 tcp_clear_md5_list(sk);
1992 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1993 tp->md5sig_info = NULL;
1997 /* Clean up a referenced TCP bind bucket. */
1998 if (inet_csk(sk)->icsk_bind_hash)
2001 BUG_ON(tp->fastopen_rsk);
2003 /* If socket is aborted during connect operation */
2004 tcp_free_fastopen_req(tp);
2005 tcp_fastopen_destroy_cipher(sk);
2006 tcp_saved_syn_free(tp);
2008 sk_sockets_allocated_dec(sk);
2010 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2012 #ifdef CONFIG_PROC_FS
2013 /* Proc filesystem TCP sock list dumping. */
2016 * Get the next listener socket following cur. If cur is NULL, get the first socket
2017 * starting from bucket given in st->bucket; when st->bucket is zero the
2018 * very first socket in the hash table is returned.
2020 static void *listening_get_next(struct seq_file *seq, void *cur)
2022 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2023 struct tcp_iter_state *st = seq->private;
2024 struct net *net = seq_file_net(seq);
2025 struct inet_listen_hashbucket *ilb;
2026 struct sock *sk = cur;
2030 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2031 spin_lock(&ilb->lock);
2032 sk = sk_head(&ilb->head);
2036 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2042 sk_for_each_from(sk) {
2043 if (!net_eq(sock_net(sk), net))
2045 if (sk->sk_family == afinfo->family)
2048 spin_unlock(&ilb->lock);
2050 if (++st->bucket < INET_LHTABLE_SIZE)
2055 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2057 struct tcp_iter_state *st = seq->private;
2062 rc = listening_get_next(seq, NULL);
2064 while (rc && *pos) {
2065 rc = listening_get_next(seq, rc);
2071 static inline bool empty_bucket(const struct tcp_iter_state *st)
2073 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2077 * Get first established socket starting from bucket given in st->bucket.
2078 * If st->bucket is zero, the very first socket in the hash is returned.
2080 static void *established_get_first(struct seq_file *seq)
2082 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2083 struct tcp_iter_state *st = seq->private;
2084 struct net *net = seq_file_net(seq);
2088 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2090 struct hlist_nulls_node *node;
2091 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2093 /* Lockless fast path for the common case of empty buckets */
2094 if (empty_bucket(st))
2098 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2099 if (sk->sk_family != afinfo->family ||
2100 !net_eq(sock_net(sk), net)) {
2106 spin_unlock_bh(lock);
2112 static void *established_get_next(struct seq_file *seq, void *cur)
2114 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2115 struct sock *sk = cur;
2116 struct hlist_nulls_node *node;
2117 struct tcp_iter_state *st = seq->private;
2118 struct net *net = seq_file_net(seq);
2123 sk = sk_nulls_next(sk);
2125 sk_nulls_for_each_from(sk, node) {
2126 if (sk->sk_family == afinfo->family &&
2127 net_eq(sock_net(sk), net))
2131 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2133 return established_get_first(seq);
2136 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2138 struct tcp_iter_state *st = seq->private;
2142 rc = established_get_first(seq);
2145 rc = established_get_next(seq, rc);
2151 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2154 struct tcp_iter_state *st = seq->private;
2156 st->state = TCP_SEQ_STATE_LISTENING;
2157 rc = listening_get_idx(seq, &pos);
2160 st->state = TCP_SEQ_STATE_ESTABLISHED;
2161 rc = established_get_idx(seq, pos);
2167 static void *tcp_seek_last_pos(struct seq_file *seq)
2169 struct tcp_iter_state *st = seq->private;
2170 int offset = st->offset;
2171 int orig_num = st->num;
2174 switch (st->state) {
2175 case TCP_SEQ_STATE_LISTENING:
2176 if (st->bucket >= INET_LHTABLE_SIZE)
2178 st->state = TCP_SEQ_STATE_LISTENING;
2179 rc = listening_get_next(seq, NULL);
2180 while (offset-- && rc)
2181 rc = listening_get_next(seq, rc);
2185 st->state = TCP_SEQ_STATE_ESTABLISHED;
2187 case TCP_SEQ_STATE_ESTABLISHED:
2188 if (st->bucket > tcp_hashinfo.ehash_mask)
2190 rc = established_get_first(seq);
2191 while (offset-- && rc)
2192 rc = established_get_next(seq, rc);
2200 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2202 struct tcp_iter_state *st = seq->private;
2205 if (*pos && *pos == st->last_pos) {
2206 rc = tcp_seek_last_pos(seq);
2211 st->state = TCP_SEQ_STATE_LISTENING;
2215 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2218 st->last_pos = *pos;
2221 EXPORT_SYMBOL(tcp_seq_start);
2223 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2225 struct tcp_iter_state *st = seq->private;
2228 if (v == SEQ_START_TOKEN) {
2229 rc = tcp_get_idx(seq, 0);
2233 switch (st->state) {
2234 case TCP_SEQ_STATE_LISTENING:
2235 rc = listening_get_next(seq, v);
2237 st->state = TCP_SEQ_STATE_ESTABLISHED;
2240 rc = established_get_first(seq);
2243 case TCP_SEQ_STATE_ESTABLISHED:
2244 rc = established_get_next(seq, v);
2249 st->last_pos = *pos;
2252 EXPORT_SYMBOL(tcp_seq_next);
2254 void tcp_seq_stop(struct seq_file *seq, void *v)
2256 struct tcp_iter_state *st = seq->private;
2258 switch (st->state) {
2259 case TCP_SEQ_STATE_LISTENING:
2260 if (v != SEQ_START_TOKEN)
2261 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2263 case TCP_SEQ_STATE_ESTABLISHED:
2265 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2269 EXPORT_SYMBOL(tcp_seq_stop);
2271 static void get_openreq4(const struct request_sock *req,
2272 struct seq_file *f, int i)
2274 const struct inet_request_sock *ireq = inet_rsk(req);
2275 long delta = req->rsk_timer.expires - jiffies;
2277 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2278 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2283 ntohs(ireq->ir_rmt_port),
2285 0, 0, /* could print option size, but that is af dependent. */
2286 1, /* timers active (only the expire timer) */
2287 jiffies_delta_to_clock_t(delta),
2289 from_kuid_munged(seq_user_ns(f),
2290 sock_i_uid(req->rsk_listener)),
2291 0, /* non standard timer */
2292 0, /* open_requests have no inode */
2297 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2300 unsigned long timer_expires;
2301 const struct tcp_sock *tp = tcp_sk(sk);
2302 const struct inet_connection_sock *icsk = inet_csk(sk);
2303 const struct inet_sock *inet = inet_sk(sk);
2304 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2305 __be32 dest = inet->inet_daddr;
2306 __be32 src = inet->inet_rcv_saddr;
2307 __u16 destp = ntohs(inet->inet_dport);
2308 __u16 srcp = ntohs(inet->inet_sport);
2312 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2313 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2314 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2316 timer_expires = icsk->icsk_timeout;
2317 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2319 timer_expires = icsk->icsk_timeout;
2320 } else if (timer_pending(&sk->sk_timer)) {
2322 timer_expires = sk->sk_timer.expires;
2325 timer_expires = jiffies;
2328 state = inet_sk_state_load(sk);
2329 if (state == TCP_LISTEN)
2330 rx_queue = sk->sk_ack_backlog;
2332 /* Because we don't lock the socket,
2333 * we might find a transient negative value.
2335 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2337 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2338 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2339 i, src, srcp, dest, destp, state,
2340 tp->write_seq - tp->snd_una,
2343 jiffies_delta_to_clock_t(timer_expires - jiffies),
2344 icsk->icsk_retransmits,
2345 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2346 icsk->icsk_probes_out,
2348 refcount_read(&sk->sk_refcnt), sk,
2349 jiffies_to_clock_t(icsk->icsk_rto),
2350 jiffies_to_clock_t(icsk->icsk_ack.ato),
2351 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2353 state == TCP_LISTEN ?
2354 fastopenq->max_qlen :
2355 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2358 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2359 struct seq_file *f, int i)
2361 long delta = tw->tw_timer.expires - jiffies;
2365 dest = tw->tw_daddr;
2366 src = tw->tw_rcv_saddr;
2367 destp = ntohs(tw->tw_dport);
2368 srcp = ntohs(tw->tw_sport);
2370 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2371 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2372 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2373 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2374 refcount_read(&tw->tw_refcnt), tw);
2379 static int tcp4_seq_show(struct seq_file *seq, void *v)
2381 struct tcp_iter_state *st;
2382 struct sock *sk = v;
2384 seq_setwidth(seq, TMPSZ - 1);
2385 if (v == SEQ_START_TOKEN) {
2386 seq_puts(seq, " sl local_address rem_address st tx_queue "
2387 "rx_queue tr tm->when retrnsmt uid timeout "
2393 if (sk->sk_state == TCP_TIME_WAIT)
2394 get_timewait4_sock(v, seq, st->num);
2395 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2396 get_openreq4(v, seq, st->num);
2398 get_tcp4_sock(v, seq, st->num);
2404 static const struct seq_operations tcp4_seq_ops = {
2405 .show = tcp4_seq_show,
2406 .start = tcp_seq_start,
2407 .next = tcp_seq_next,
2408 .stop = tcp_seq_stop,
2411 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2415 static int __net_init tcp4_proc_init_net(struct net *net)
2417 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2418 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2423 static void __net_exit tcp4_proc_exit_net(struct net *net)
2425 remove_proc_entry("tcp", net->proc_net);
2428 static struct pernet_operations tcp4_net_ops = {
2429 .init = tcp4_proc_init_net,
2430 .exit = tcp4_proc_exit_net,
2433 int __init tcp4_proc_init(void)
2435 return register_pernet_subsys(&tcp4_net_ops);
2438 void tcp4_proc_exit(void)
2440 unregister_pernet_subsys(&tcp4_net_ops);
2442 #endif /* CONFIG_PROC_FS */
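/*
 * Illustrative sketch (not part of this file): the seq_file code above is
 * what produces /proc/net/tcp.  A minimal userspace reader that dumps the
 * table, starting with the header line emitted by tcp4_seq_show() above:
 *
 *	FILE *f = fopen("/proc/net/tcp", "r");
 *	char line[512];
 *
 *	if (f) {
 *		while (fgets(line, sizeof(line), f))
 *			fputs(line, stdout);	// "sl local_address rem_address st ..."
 *		fclose(f);
 *	}
 *
 * Addresses and ports are printed in hex, per the format strings in
 * get_tcp4_sock() above; tools such as ss(8) obtain the same information
 * via netlink instead of this file.
 */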
2444 struct proto tcp_prot = {
2446 .owner = THIS_MODULE,
2448 .pre_connect = tcp_v4_pre_connect,
2449 .connect = tcp_v4_connect,
2450 .disconnect = tcp_disconnect,
2451 .accept = inet_csk_accept,
2453 .init = tcp_v4_init_sock,
2454 .destroy = tcp_v4_destroy_sock,
2455 .shutdown = tcp_shutdown,
2456 .setsockopt = tcp_setsockopt,
2457 .getsockopt = tcp_getsockopt,
2458 .keepalive = tcp_set_keepalive,
2459 .recvmsg = tcp_recvmsg,
2460 .sendmsg = tcp_sendmsg,
2461 .sendpage = tcp_sendpage,
2462 .backlog_rcv = tcp_v4_do_rcv,
2463 .release_cb = tcp_release_cb,
2465 .unhash = inet_unhash,
2466 .get_port = inet_csk_get_port,
2467 .enter_memory_pressure = tcp_enter_memory_pressure,
2468 .leave_memory_pressure = tcp_leave_memory_pressure,
2469 .stream_memory_free = tcp_stream_memory_free,
2470 .sockets_allocated = &tcp_sockets_allocated,
2471 .orphan_count = &tcp_orphan_count,
2472 .memory_allocated = &tcp_memory_allocated,
2473 .memory_pressure = &tcp_memory_pressure,
2474 .sysctl_mem = sysctl_tcp_mem,
2475 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2476 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2477 .max_header = MAX_TCP_HEADER,
2478 .obj_size = sizeof(struct tcp_sock),
2479 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2480 .twsk_prot = &tcp_timewait_sock_ops,
2481 .rsk_prot = &tcp_request_sock_ops,
2482 .h.hashinfo = &tcp_hashinfo,
2483 .no_autobind = true,
2484 #ifdef CONFIG_COMPAT
2485 .compat_setsockopt = compat_tcp_setsockopt,
2486 .compat_getsockopt = compat_tcp_getsockopt,
2488 .diag_destroy = tcp_abort,
2490 EXPORT_SYMBOL(tcp_prot);
2492 static void __net_exit tcp_sk_exit(struct net *net)
2496 module_put(net->ipv4.tcp_congestion_control->owner);
2498 for_each_possible_cpu(cpu)
2499 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2500 free_percpu(net->ipv4.tcp_sk);
2503 static int __net_init tcp_sk_init(struct net *net)
2507 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2508 if (!net->ipv4.tcp_sk)
2511 for_each_possible_cpu(cpu) {
2514 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2518 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2519 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2522 net->ipv4.sysctl_tcp_ecn = 2;
2523 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2525 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2526 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2527 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2529 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2530 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2531 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2533 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2534 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2535 net->ipv4.sysctl_tcp_syncookies = 1;
2536 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2537 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2538 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2539 net->ipv4.sysctl_tcp_orphan_retries = 0;
2540 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2541 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2542 net->ipv4.sysctl_tcp_tw_reuse = 2;
2544 cnt = tcp_hashinfo.ehash_mask + 1;
2545 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2546 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2548 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2549 net->ipv4.sysctl_tcp_sack = 1;
2550 net->ipv4.sysctl_tcp_window_scaling = 1;
2551 net->ipv4.sysctl_tcp_timestamps = 1;
2552 net->ipv4.sysctl_tcp_early_retrans = 3;
2553 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2554 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2555 net->ipv4.sysctl_tcp_retrans_collapse = 1;
2556 net->ipv4.sysctl_tcp_max_reordering = 300;
2557 net->ipv4.sysctl_tcp_dsack = 1;
2558 net->ipv4.sysctl_tcp_app_win = 31;
2559 net->ipv4.sysctl_tcp_adv_win_scale = 1;
2560 net->ipv4.sysctl_tcp_frto = 2;
2561 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2562 /* This limits the percentage of the congestion window which we
2563 * will allow a single TSO frame to consume. Building TSO frames
2564 * which are too large can cause TCP streams to be bursty.
2566 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2567 /* Default TSQ limit of four TSO segments */
2568 net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2569 /* rfc5961 challenge ack rate limiting */
2570 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2571 net->ipv4.sysctl_tcp_min_tso_segs = 2;
2572 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2573 net->ipv4.sysctl_tcp_autocorking = 1;
2574 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2575 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2576 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2577 if (net != &init_net) {
2578 memcpy(net->ipv4.sysctl_tcp_rmem,
2579 init_net.ipv4.sysctl_tcp_rmem,
2580 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2581 memcpy(net->ipv4.sysctl_tcp_wmem,
2582 init_net.ipv4.sysctl_tcp_wmem,
2583 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2585 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2586 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2587 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2588 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2589 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2590 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2592 /* Reno is always built in */
2593 if (!net_eq(net, &init_net) &&
2594 try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2595 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2597 net->ipv4.tcp_congestion_control = &tcp_reno;
2606 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2610 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2612 list_for_each_entry(net, net_exit_list, exit_list)
2613 tcp_fastopen_ctx_destroy(net);
2616 static struct pernet_operations __net_initdata tcp_sk_ops = {
2617 .init = tcp_sk_init,
2618 .exit = tcp_sk_exit,
2619 .exit_batch = tcp_sk_exit_batch,
2622 void __init tcp_v4_init(void)
2624 if (register_pernet_subsys(&tcp_sk_ops))
2625 panic("Failed to create the TCP control socket.\n");