1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol(TCP).
9 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
84 #include <trace/events/tcp.h>
/* File-level declarations: a forward declaration of tcp_v4_md5_hash_hdr()
 * (introduced under CONFIG_TCP_MD5SIG) and tcp_hashinfo, the exported
 * struct inet_hashinfo instance that the lookups in this file hand to
 * __inet_lookup_established() and __inet_lookup_listener().
 */
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
/* Return the u32 produced by secure_tcp_seq() for the segment in @skb.
 * The inputs are taken from the packet's own headers; the visible
 * arguments are the IP destination address (first) and the TCP source
 * port (last).
 */
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
96 return secure_tcp_seq(ip_hdr(skb)->daddr,
99 tcp_hdr(skb)->source);
/* Return secure_tcp_ts_off() for namespace @net, keyed on the IP
 * destination and source addresses of @skb (passed in that order).
 */
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
104 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
/* Examine TIME-WAIT socket @sktw on behalf of socket @sk (the in-body
 * comments describe this as re-using a TIME-WAIT socket).
 *
 * The function reads the per-netns sysctl_tcp_tw_reuse value and has a
 * loopback test based on the timewait socket's bound device and its
 * IPv4 / IPv6 (including v4-mapped) addresses.
 *
 * The main branch is taken when the timewait socket has a non-zero
 * tw_ts_recent_stamp and either @twp is NULL, or reuse is non-zero and
 * ktime_get_seconds() is after that stamp.  In that branch, unless @sk
 * is in TCP repair mode, write_seq is seeded from
 * tw_snd_nxt + 65535 + 2 and ts_recent / ts_recent_stamp are copied
 * from the timewait socket into @sk's rx_opt.
 *
 * NOTE(review): the return statements are not part of the lines shown
 * for this function; confirm the return convention against callers.
 */
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 struct tcp_sock *tp = tcp_sk(sk);
112 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
115 /* Still does not detect *everything* that goes through
116 * lo, since we require a loopback src or dst address
117 * or direct binding to 'lo' interface.
119 bool loopback = false;
120 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
122 #if IS_ENABLED(CONFIG_IPV6)
123 if (tw->tw_family == AF_INET6) {
124 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
132 if (ipv4_is_loopback(tw->tw_daddr) ||
133 ipv4_is_loopback(tw->tw_rcv_saddr))
140 /* With PAWS, it is safe from the viewpoint
141 of data integrity. Even without PAWS it is safe provided sequence
142 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
144 Actually, the idea is close to VJ's one, only timestamp cache is
145 held not per host, but per port pair and TW bucket is used as state
148 If TW bucket has been already destroyed we fall back to VJ's scheme
149 and use initial timestamp retrieved from peer table.
151 if (tcptw->tw_ts_recent_stamp &&
152 (!twp || (reuse && time_after32(ktime_get_seconds(),
153 tcptw->tw_ts_recent_stamp)))) {
154 /* In case of repair and re-using TIME-WAIT sockets we still
155 * want to be sure that it is safe as above but honor the
156 * sequence numbers and time stamps set as part of the repair
159 * Without this check re-using a TIME-WAIT socket with TCP
160 * repair would accumulate a -1 on the repair assigned
161 * sequence number. The first time it is reused the sequence
162 * is -1, the second time -2, etc. This fixes that issue
163 * without appearing to create any others.
165 if (likely(!tp->repair)) {
166 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
170 WRITE_ONCE(tp->write_seq, seq);
171 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
172 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
/* Pre-connect step for an IPv4 TCP socket.
 *
 * Compares addr_len against sizeof(struct sockaddr_in) before anything
 * else (see the in-body comment: this bounds what the BPF program may
 * read), calls sock_owned_by_me(sk), and returns the result of
 * BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr).
 */
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 /* This check is replicated from tcp_v4_connect() and intended to
186 * prevent BPF program called below from accessing bytes that are out
187 * of the bound specified by user in addr_len.
189 if (addr_len < sizeof(struct sockaddr_in))
192 sock_owned_by_me(sk);
194 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
/* tcp_v4_connect() - start an outgoing IPv4 TCP connection on @sk.
 * @sk:       socket to connect
 * @uaddr:    destination, interpreted as a struct sockaddr_in
 * @addr_len: length of @uaddr supplied by the caller
 *
 * Steps visible in the body, in order:
 *  - @addr_len is compared with sizeof(struct sockaddr_in); a family
 *    other than AF_INET yields -EAFNOSUPPORT.
 *  - The next hop defaults to the destination address; when the socket
 *    has IP options with source routing (opt.srr) the option's faddr
 *    is used as next hop.
 *  - A route is requested with ip_route_connect(); -ENETUNREACH is
 *    counted in IPSTATS_MIB_OUTNOROUTES.  The route flags are tested
 *    for multicast/broadcast.
 *  - An unset inet_saddr is filled from the flow, and the receive
 *    source address is set to match.
 *  - If a timestamp stamp was inherited and the destination differs
 *    from the previous inet_daddr, ts_recent, ts_recent_stamp and
 *    (unless in repair mode) write_seq are reset to 0.
 *  - Destination port/address, icsk_ext_hdr_len and
 *    mss_clamp (TCP_MSS_DEFAULT) are set, the state moves to
 *    TCP_SYN_SENT and inet_hash_connect() picks/hashes the local port;
 *    the route is then refreshed with ip_route_newports().
 *  - GSO type and route capabilities are committed; outside repair
 *    mode write_seq can be taken from secure_tcp_seq() and tsoffset
 *    from secure_tcp_ts_off(); inet_id is randomised.
 *  - tcp_fastopen_defer_connect() is consulted, then tcp_connect()
 *    is called.
 *  - The failure path sets the state to TCP_CLOSE and clears
 *    sk_route_caps and inet_dport.
 */
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
200 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 struct inet_sock *inet = inet_sk(sk);
202 struct tcp_sock *tp = tcp_sk(sk);
203 __be16 orig_sport, orig_dport;
204 __be32 daddr, nexthop;
208 struct ip_options_rcu *inet_opt;
209 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
211 if (addr_len < sizeof(struct sockaddr_in))
214 if (usin->sin_family != AF_INET)
215 return -EAFNOSUPPORT;
217 nexthop = daddr = usin->sin_addr.s_addr;
218 inet_opt = rcu_dereference_protected(inet->inet_opt,
219 lockdep_sock_is_held(sk));
220 if (inet_opt && inet_opt->opt.srr) {
223 nexthop = inet_opt->opt.faddr;
226 orig_sport = inet->inet_sport;
227 orig_dport = usin->sin_port;
228 fl4 = &inet->cork.fl.u.ip4;
229 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
232 orig_sport, orig_dport, sk);
235 if (err == -ENETUNREACH)
236 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
240 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
245 if (!inet_opt || !inet_opt->opt.srr)
248 if (!inet->inet_saddr)
249 inet->inet_saddr = fl4->saddr;
250 sk_rcv_saddr_set(sk, inet->inet_saddr);
252 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 /* Reset inherited state */
254 tp->rx_opt.ts_recent = 0;
255 tp->rx_opt.ts_recent_stamp = 0;
256 if (likely(!tp->repair))
257 WRITE_ONCE(tp->write_seq, 0);
260 inet->inet_dport = usin->sin_port;
261 sk_daddr_set(sk, daddr);
263 inet_csk(sk)->icsk_ext_hdr_len = 0;
265 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
267 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
269 /* Socket identity is still unknown (sport may be zero).
270 * However we set state to SYN-SENT and not releasing socket
271 * lock select source port, enter ourselves into the hash tables and
272 * complete initialization after this.
274 tcp_set_state(sk, TCP_SYN_SENT);
275 err = inet_hash_connect(tcp_death_row, sk);
281 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 inet->inet_sport, inet->inet_dport, sk);
288 /* OK, now commit destination to socket. */
289 sk->sk_gso_type = SKB_GSO_TCPV4;
290 sk_setup_caps(sk, &rt->dst);
293 if (likely(!tp->repair)) {
295 WRITE_ONCE(tp->write_seq,
296 secure_tcp_seq(inet->inet_saddr,
300 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
305 inet->inet_id = prandom_u32();
307 if (tcp_fastopen_defer_connect(sk, &err))
312 err = tcp_connect(sk);
321 * This unhashes the socket and releases the local port,
324 tcp_set_state(sk, TCP_CLOSE);
326 sk->sk_route_caps = 0;
327 inet->inet_dport = 0;
330 EXPORT_SYMBOL(tcp_v4_connect);
/* tcp_v4_mtu_reduced() - apply a reduced path MTU to @sk.
 *
 * The socket state is first tested against LISTEN and CLOSE.  The new
 * MTU is read from tcp_sk(sk)->mtu_info and pushed into the route with
 * inet_csk_update_pmtu().  If the new MTU is below dst_mtu() and
 * ip_dont_fragment() holds, EMSGSIZE is stored in sk_err_soft.
 *
 * When PMTU discovery is not IP_PMTUDISC_DONT, ip_sk_accept_pmtu()
 * agrees and the cached icsk_pmtu_cookie is larger than the new MTU,
 * the MSS is re-synchronised with tcp_sync_mss() and the data is
 * resent at once through tcp_simple_retransmit(); otherwise the
 * regular retransmit timer is left to deal with it.
 */
333 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334 * It can be called through tcp_release_cb() if socket was owned by user
335 * at the time tcp_v4_err() was called to handle ICMP message.
337 void tcp_v4_mtu_reduced(struct sock *sk)
339 struct inet_sock *inet = inet_sk(sk);
340 struct dst_entry *dst;
343 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
345 mtu = tcp_sk(sk)->mtu_info;
346 dst = inet_csk_update_pmtu(sk, mtu);
350 /* Something is about to be wrong... Remember soft error
351 * for the case, if this connection will not able to recover.
353 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 sk->sk_err_soft = EMSGSIZE;
358 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359 ip_sk_accept_pmtu(sk) &&
360 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 tcp_sync_mss(sk, mtu);
363 /* Resend the TCP packet because it's
364 * clear that the old packet has been
365 * dropped. This is the new "fast" path mtu
368 tcp_simple_retransmit(sk);
369 } /* else let the usual retransmit timer handle it */
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
/* Fetch the socket's cached route via __sk_dst_check(sk, 0) and hand
 * @skb to that route's ->redirect() operation.
 */
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
375 struct dst_entry *dst = __sk_dst_check(sk, 0);
378 dst->ops->redirect(dst, sk, skb);
/* tcp_req_err() - ICMP error handling for a request socket.
 * @sk:    the request socket (converted with inet_reqsk())
 * @seq:   sequence number taken from the ICMP-quoted TCP header
 * @abort: presumably selects whether the request is dropped - confirm
 *         against the full body
 *
 * @seq is compared with the request's snt_isn; a mismatch increments
 * LINUX_MIB_OUTOFWINDOWICMPS.  The drop path removes the request from
 * its listener with inet_csk_reqsk_queue_drop() and accounts the drop
 * with tcp_listendrop().
 */
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
385 struct request_sock *req = inet_reqsk(sk);
386 struct net *net = sock_net(sk);
388 /* ICMPs are not backlogged, hence we cannot get
389 * an established socket here.
391 if (seq != tcp_rsk(req)->snt_isn) {
392 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
395 * Still in SYN_RECV, just remove it silently.
396 * There is no good way to pass the error to the newly
397 * created socket, and POSIX does not want network
398 * errors returned from accept().
400 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401 tcp_listendrop(req->rsk_listener);
405 EXPORT_SYMBOL(tcp_req_err);
/* tcp_ld_RTO_revert() - undo one step of retransmit backoff.
 * @sk:  socket the ICMP error was matched to
 * @seq: sequence number quoted by the ICMP message
 *
 * Guard conditions tested first: the socket being owned by user
 * context, @seq differing from snd_una, and icsk_retransmits being
 * zero (the guard expression continues beyond what is quoted below).
 * The head of the retransmit queue is fetched and its absence is
 * flagged with WARN_ON_ONCE().
 *
 * icsk_backoff is then decremented and icsk_rto recomputed:
 * __tcp_set_rto() when an srtt sample exists, TCP_TIMEOUT_INIT
 * otherwise, followed by inet_csk_rto_backoff() capped at TCP_RTO_MAX.
 * The time already elapsed since the head skb's timestamp is
 * subtracted to get the remaining time, which is either used to re-arm
 * ICSK_TIME_RETRANS or, when the timer has effectively run out, leads
 * to an immediate tcp_retransmit_timer() call.
 */
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
410 struct inet_connection_sock *icsk = inet_csk(sk);
411 struct tcp_sock *tp = tcp_sk(sk);
416 if (sock_owned_by_user(sk))
419 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
423 skb = tcp_rtx_queue_head(sk);
424 if (WARN_ON_ONCE(!skb))
427 icsk->icsk_backoff--;
428 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
431 tcp_mstamp_refresh(tp);
432 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
436 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 remaining, TCP_RTO_MAX);
439 /* RTO revert clocked out retransmission.
440 * Will retransmit now.
442 tcp_retransmit_timer(sk);
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
/* tcp_v4_err() - ICMP error handler for IPv4 TCP.
 * @skb:  ICMP packet; skb->data points at the quoted IP header and the
 *        quoted TCP header follows iph->ihl * 4 bytes later
 * @info: passed through to ip_icmp_error()
 *
 * Outline of the visible logic:
 *  - The owning socket is looked up in tcp_hashinfo from the quoted
 *    addresses and ports; ICMP_MIB_INERRORS is counted on the lookup
 *    failure path.
 *  - TIME_WAIT sockets are released with inet_twsk_put().
 *    TCP_NEW_SYN_RECV sockets are passed to tcp_req_err(), with abort
 *    set for PARAMETERPROB, TIME_EXCEEDED, and DEST_UNREACH with
 *    NET_UNREACH or HOST_UNREACH.
 *  - If the socket is owned by user context, anything but
 *    DEST_UNREACH/FRAG_NEEDED is counted in LINUX_MIB_LOCKDROPPEDICMPS.
 *  - A TTL below the socket's min_ttl is counted in
 *    LINUX_MIB_TCPMINTTLDROP.
 *  - For non-LISTEN sockets the quoted sequence number must lie
 *    between snd_una (or the fastopen request's snt_isn) and snd_nxt,
 *    otherwise LINUX_MIB_OUTOFWINDOWICMPS is counted.
 *  - Per ICMP type: redirects go to do_redirect() when the socket is
 *    not user-owned; SOURCE_QUENCH is ignored; DEST_UNREACH with
 *    FRAG_NEEDED triggers tcp_v4_mtu_reduced() directly or defers it
 *    through the TCP_MTU_REDUCED_DEFERRED flag; other unreachable
 *    codes are mapped through icmp_err_convert[] and NET/HOST
 *    unreachable may call tcp_ld_RTO_revert().
 *  - Depending on sk_state and inet->recverr, the error is either
 *    reported through sk_error_report() or kept in sk_err_soft.
 */
448 * This routine is called by the ICMP module when it gets some
449 * sort of error condition. If err < 0 then the socket should
450 * be closed and the error returned to the user. If err > 0
451 * it's just the icmp type << 8 | icmp code. After adjustment
452 * header points to the first 8 bytes of the tcp header. We need
453 * to find the appropriate port.
455 * The locking strategy used here is very "optimistic". When
456 * someone else accesses the socket the ICMP is just dropped
457 * and for some paths there is no check at all.
458 * A more general error queue to queue errors for later handling
459 * is probably better.
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
465 const struct iphdr *iph = (const struct iphdr *)skb->data;
466 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
468 struct inet_sock *inet;
469 const int type = icmp_hdr(skb)->type;
470 const int code = icmp_hdr(skb)->code;
472 struct request_sock *fastopen;
475 struct net *net = dev_net(skb->dev);
477 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 th->dest, iph->saddr, ntohs(th->source),
481 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
484 if (sk->sk_state == TCP_TIME_WAIT) {
485 inet_twsk_put(inet_twsk(sk));
488 seq = ntohl(th->seq);
489 if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 type == ICMP_TIME_EXCEEDED ||
492 (type == ICMP_DEST_UNREACH &&
493 (code == ICMP_NET_UNREACH ||
494 code == ICMP_HOST_UNREACH)));
499 /* If too many ICMPs get dropped on busy
500 * servers this needs to be solved differently.
501 * We do take care of PMTU discovery (RFC1191) special case :
502 * we can receive locally generated ICMP messages while socket is held.
504 if (sock_owned_by_user(sk)) {
505 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
508 if (sk->sk_state == TCP_CLOSE)
511 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
517 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
518 fastopen = rcu_dereference(tp->fastopen_rsk);
519 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520 if (sk->sk_state != TCP_LISTEN &&
521 !between(seq, snd_una, tp->snd_nxt)) {
522 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
528 if (!sock_owned_by_user(sk))
529 do_redirect(skb, sk);
531 case ICMP_SOURCE_QUENCH:
532 /* Just silently ignore these. */
534 case ICMP_PARAMETERPROB:
537 case ICMP_DEST_UNREACH:
538 if (code > NR_ICMP_UNREACH)
541 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542 /* We are not interested in TCP_LISTEN and open_requests
543 * (SYN-ACKs send out by Linux are always <576bytes so
544 * they should go through unfragmented).
546 if (sk->sk_state == TCP_LISTEN)
550 if (!sock_owned_by_user(sk)) {
551 tcp_v4_mtu_reduced(sk);
553 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
559 err = icmp_err_convert[code].errno;
560 /* check if this ICMP message allows revert of backoff.
564 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565 tcp_ld_RTO_revert(sk, seq);
567 case ICMP_TIME_EXCEEDED:
574 switch (sk->sk_state) {
577 /* Only in fast or simultaneous open. If a fast open socket is
578 * already accepted it is treated as a connected one below.
580 if (fastopen && !fastopen->sk)
583 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
585 if (!sock_owned_by_user(sk)) {
588 sk->sk_error_report(sk);
592 sk->sk_err_soft = err;
597 /* If we've already connected we will keep trying
598 * until we time out, or the user gives up.
600 * rfc1122 4.2.3.9 allows to consider as hard errors
601 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602 * but it is obsoleted by pmtu discovery).
604 * Note, that in modern internet, where routing is unreliable
605 * and in each dark corner broken firewalls sit, sending random
606 * errors ordered by their masters even this two messages finally lose
607 * their original sense (even Linux sends invalid PORT_UNREACHs)
609 * Now we are in compliance with RFCs.
614 if (!sock_owned_by_user(sk) && inet->recverr) {
616 sk->sk_error_report(sk);
617 } else { /* Only an error on timeout */
618 sk->sk_err_soft = err;
/* Set up the TCP checksum fields of @skb for the address pair
 * @saddr/@daddr: th->check receives the complement of
 * tcp_v4_check(skb->len, saddr, daddr, 0), csum_start is the offset of
 * the transport header from skb->head, and csum_offset is the offset
 * of the check field inside struct tcphdr.
 */
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
629 struct tcphdr *th = tcp_hdr(skb);
631 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632 skb->csum_start = skb_transport_header(skb) - skb->head;
633 skb->csum_offset = offsetof(struct tcphdr, check);
/* Socket-level wrapper: applies __tcp_v4_send_check() to @skb using the
 * socket's own inet_saddr and inet_daddr.  Exported for modules.
 */
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
639 const struct inet_sock *inet = inet_sk(sk);
641 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
643 EXPORT_SYMBOL(tcp_v4_send_check);
/* tcp_v4_send_reset() - answer the segment in @skb with a RST.
 * @sk:  socket the segment matched, or NULL (both cases are tested)
 * @skb: the offending segment
 *
 * Outline of the visible logic:
 *  - Without a socket, a reply is only considered when the incoming
 *    route type is RTN_LOCAL.
 *  - The reply header is built with the ports swapped and
 *    doff = sizeof(struct tcphdr) / 4.  Its seq can be taken from the
 *    incoming ack_seq; its ack_seq can be set to the incoming seq
 *    plus the SYN and FIN flags plus the payload length.
 *  - With CONFIG_TCP_MD5SIG: the MD5 option is located in the incoming
 *    header; the key is looked up on @sk when it is a full socket, or
 *    otherwise on a listener found by __inet_lookup_listener().  In
 *    the listener case the incoming hash is recomputed and compared
 *    (16 bytes).  When a key is used, an MD5 option is appended to
 *    the reply and signed with tcp_v4_md5_hash_hdr().
 *  - The pseudo-header checksum, checksum offset, NOSRCCHECK flag (for
 *    transparent sockets), bound device, TOS (copied from the incoming
 *    IP header) and uid are stored in the ip_reply_arg.
 *  - The per-cpu control socket takes mark and priority from @sk (or
 *    from its timewait fields in TIME_WAIT) and sends the reply with
 *    ip_send_unicast_reply(); TCP_MIB_OUTSEGS and TCP_MIB_OUTRSTS are
 *    incremented.
 */
646 * This routine will send an RST to the other tcp.
648 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
650 * Answer: if a packet caused RST, it is not for a socket
651 * existing in our system, if it is matched to a socket,
652 * it is just duplicate segment or bug in other side's TCP.
653 * So that we build reply only basing on parameters
654 * arrived with segment.
655 * Exception: precedence violation. We do not implement it in any case.
658 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
660 const struct tcphdr *th = tcp_hdr(skb);
663 #ifdef CONFIG_TCP_MD5SIG
664 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
667 struct ip_reply_arg arg;
668 #ifdef CONFIG_TCP_MD5SIG
669 struct tcp_md5sig_key *key = NULL;
670 const __u8 *hash_location = NULL;
671 unsigned char newhash[16];
673 struct sock *sk1 = NULL;
675 u64 transmit_time = 0;
679 /* Never send a reset in response to a reset. */
683 /* If sk not NULL, it means we did a successful lookup and incoming
684 * route had to be correct. prequeue might have dropped our dst.
686 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
689 /* Swap the send and the receive. */
690 memset(&rep, 0, sizeof(rep));
691 rep.th.dest = th->source;
692 rep.th.source = th->dest;
693 rep.th.doff = sizeof(struct tcphdr) / 4;
697 rep.th.seq = th->ack_seq;
700 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
701 skb->len - (th->doff << 2));
704 memset(&arg, 0, sizeof(arg));
705 arg.iov[0].iov_base = (unsigned char *)&rep;
706 arg.iov[0].iov_len = sizeof(rep.th);
708 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
709 #ifdef CONFIG_TCP_MD5SIG
711 hash_location = tcp_parse_md5sig_option(th);
712 if (sk && sk_fullsock(sk)) {
713 const union tcp_md5_addr *addr;
716 /* sdif set, means packet ingressed via a device
717 * in an L3 domain and inet_iif is set to it.
719 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
720 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
721 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
722 } else if (hash_location) {
723 const union tcp_md5_addr *addr;
724 int sdif = tcp_v4_sdif(skb);
725 int dif = inet_iif(skb);
729 * active side is lost. Try to find listening socket through
730 * source port, and then find md5 key through listening socket.
731 * we are not loose security here:
732 * Incoming packet is checked with md5 hash with finding key,
733 * no RST generated if md5 hash doesn't match.
735 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
737 th->source, ip_hdr(skb)->daddr,
738 ntohs(th->source), dif, sdif);
739 /* don't send rst if it can't find key */
743 /* sdif set, means packet ingressed via a device
744 * in an L3 domain and dif is set to it.
746 l3index = sdif ? dif : 0;
747 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
748 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
753 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
754 if (genhash || memcmp(hash_location, newhash, 16) != 0)
760 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
762 (TCPOPT_MD5SIG << 8) |
764 /* Update length and the length the header thinks exists */
765 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766 rep.th.doff = arg.iov[0].iov_len / 4;
768 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
769 key, ip_hdr(skb)->saddr,
770 ip_hdr(skb)->daddr, &rep.th);
773 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
774 ip_hdr(skb)->saddr, /* XXX */
775 arg.iov[0].iov_len, IPPROTO_TCP, 0);
776 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
777 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
779 /* When socket is gone, all binding information is lost.
780 * routing might fail in this case. No choice here, if we choose to force
781 * input interface, we will misroute in case of asymmetric route.
784 arg.bound_dev_if = sk->sk_bound_dev_if;
786 trace_tcp_send_reset(sk, skb);
789 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
790 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
792 arg.tos = ip_hdr(skb)->tos;
793 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
795 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
797 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
798 inet_twsk(sk)->tw_mark : sk->sk_mark;
799 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
800 inet_twsk(sk)->tw_priority : sk->sk_priority;
801 transmit_time = tcp_transmit_time(sk);
803 ip_send_unicast_reply(ctl_sk,
804 skb, &TCP_SKB_CB(skb)->header.h4.opt,
805 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
806 &arg, arg.iov[0].iov_len,
810 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
811 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
814 #ifdef CONFIG_TCP_MD5SIG
/* tcp_v4_send_ack() - send a bare ACK in reply to @skb without a full
 * socket context.
 * @sk:          socket supplying netns, mark, priority and uid
 * @skb:         segment being answered (ports and addresses are
 *               swapped from it)
 * @seq, @ack:   sequence and acknowledgment numbers for the reply
 * @win:         window value, stored with htons()
 * @tsval, @tsecr: timestamp option values; @tsecr also selects the
 *               option slot used for the MD5 option (index 3 when
 *               non-zero, 0 otherwise)
 * @oif:         copied to arg.bound_dev_if
 * @key:         MD5 key used to sign the reply (CONFIG_TCP_MD5SIG)
 * @reply_flags: copied to arg.flags
 * @tos:         TOS for the reply - its use falls outside the lines
 *               quoted below; confirm
 *
 * The reply is transmitted through the per-cpu control socket with
 * ip_send_unicast_reply() and counted in TCP_MIB_OUTSEGS.
 */
820 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
821 outside socket context is ugly, certainly. What can I do?
824 static void tcp_v4_send_ack(const struct sock *sk,
825 struct sk_buff *skb, u32 seq, u32 ack,
826 u32 win, u32 tsval, u32 tsecr, int oif,
827 struct tcp_md5sig_key *key,
828 int reply_flags, u8 tos)
830 const struct tcphdr *th = tcp_hdr(skb);
833 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
834 #ifdef CONFIG_TCP_MD5SIG
835 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
839 struct net *net = sock_net(sk);
840 struct ip_reply_arg arg;
844 memset(&rep.th, 0, sizeof(struct tcphdr));
845 memset(&arg, 0, sizeof(arg));
847 arg.iov[0].iov_base = (unsigned char *)&rep;
848 arg.iov[0].iov_len = sizeof(rep.th);
850 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
851 (TCPOPT_TIMESTAMP << 8) |
853 rep.opt[1] = htonl(tsval);
854 rep.opt[2] = htonl(tsecr);
855 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
858 /* Swap the send and the receive. */
859 rep.th.dest = th->source;
860 rep.th.source = th->dest;
861 rep.th.doff = arg.iov[0].iov_len / 4;
862 rep.th.seq = htonl(seq);
863 rep.th.ack_seq = htonl(ack);
865 rep.th.window = htons(win);
867 #ifdef CONFIG_TCP_MD5SIG
869 int offset = (tsecr) ? 3 : 0;
871 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
873 (TCPOPT_MD5SIG << 8) |
875 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
876 rep.th.doff = arg.iov[0].iov_len/4;
878 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
879 key, ip_hdr(skb)->saddr,
880 ip_hdr(skb)->daddr, &rep.th);
883 arg.flags = reply_flags;
884 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
885 ip_hdr(skb)->saddr, /* XXX */
886 arg.iov[0].iov_len, IPPROTO_TCP, 0);
887 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
889 arg.bound_dev_if = oif;
891 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
893 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
894 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
895 inet_twsk(sk)->tw_mark : sk->sk_mark;
896 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
897 inet_twsk(sk)->tw_priority : sk->sk_priority;
898 transmit_time = tcp_transmit_time(sk);
899 ip_send_unicast_reply(ctl_sk,
900 skb, &TCP_SKB_CB(skb)->header.h4.opt,
901 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
902 &arg, arg.iov[0].iov_len,
906 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
/* Send an ACK on behalf of TIME-WAIT socket @sk in reply to @skb.
 *
 * All values come from the timewait socket: tw_snd_nxt and tw_rcv_nxt
 * as seq/ack, the receive window scaled down by tw_rcv_wscale,
 * tcp_time_stamp_raw() + tw_ts_offset as the timestamp value, the key
 * returned by tcp_twsk_md5_key(), and IP_REPLY_ARG_NOSRCCHECK when the
 * timewait socket is transparent.
 */
910 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
912 struct inet_timewait_sock *tw = inet_twsk(sk);
913 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
915 tcp_v4_send_ack(sk, skb,
916 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
917 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
918 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
921 tcp_twsk_md5_key(tcptw),
922 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
/* Send an ACK for request socket @req in reply to @skb.
 *
 * When @sk is in TCP_LISTEN the sequence number is the request's
 * snt_isn + 1 (the Fast Open alternative is described by the in-body
 * comment).  The ack number is the request's rcv_nxt, the window is
 * rsk_rcv_wnd shifted right by rcv_wscale, and the timestamp value is
 * tcp_time_stamp_raw() + ts_off.  The MD5 key is looked up on @sk by
 * the packet's source address, with l3index = inet_iif(skb) when
 * tcp_v4_sdif(skb) is non-zero and 0 otherwise.
 * IP_REPLY_ARG_NOSRCCHECK is set when the request has no_srccheck.
 */
929 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
930 struct request_sock *req)
932 const union tcp_md5_addr *addr;
935 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
936 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
938 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
942 * The window field (SEG.WND) of every outgoing segment, with the
943 * exception of <SYN> segments, MUST be right-shifted by
944 * Rcv.Wind.Shift bits:
946 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
947 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
948 tcp_v4_send_ack(sk, skb, seq,
949 tcp_rsk(req)->rcv_nxt,
950 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
951 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
954 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
955 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
/* tcp_v4_send_synack() - build and transmit a SYN-ACK for @req.
 * @sk:          listener the request belongs to
 * @dst:         route to use; when NULL one is obtained with
 *               inet_csk_route_req()
 * @req:         request socket being answered
 * @foc, @synack_type, @syn_skb: forwarded to tcp_make_synack()
 *
 * The SYN-ACK skb comes from tcp_make_synack() and gets its checksum
 * fields from __tcp_v4_send_check() using the request's local and
 * remote addresses.  With sysctl_tcp_reflect_tos enabled the TOS is
 * derived from the TOS of the received SYN with the ECN bits masked
 * off.  If the resulting TOS is not ECN-capable and
 * tcp_bpf_ca_needs_ecn() says so, INET_ECN_ECT_0 is OR-ed in.  The
 * packet is sent with ip_build_and_send_pkt() and the result is
 * normalised by net_xmit_eval().
 */
960 * Send a SYN-ACK after having received a SYN.
961 * This still operates on a request_sock only, not on a big
964 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
966 struct request_sock *req,
967 struct tcp_fastopen_cookie *foc,
968 enum tcp_synack_type synack_type,
969 struct sk_buff *syn_skb)
971 const struct inet_request_sock *ireq = inet_rsk(req);
977 /* First, grab a route. */
978 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
981 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
984 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
986 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
987 tcp_rsk(req)->syn_tos & ~INET_ECN_MASK :
990 if (!INET_ECN_is_capable(tos) &&
991 tcp_bpf_ca_needs_ecn((struct sock *)req))
992 tos |= INET_ECN_ECT_0;
995 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
997 rcu_dereference(ireq->ireq_opt),
1000 err = net_xmit_eval(err);
/* Frees the IP options attached to @req: the ireq_opt pointer is read
 * with rcu_dereference_protected(..., 1) and passed to kfree().
 */
1007 * IPv4 request_sock destructor.
1009 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1011 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
/* Start of the CONFIG_TCP_MD5SIG section.  tcp_md5_needed is a static
 * key that is defined false and exported to modules.
 */
1014 #ifdef CONFIG_TCP_MD5SIG
1016 * RFC2385 MD5 checksumming requires a mapping of
1017 * IP address->MD5 Key.
1018 * We need to maintain these in the sk structure.
1021 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1022 EXPORT_SYMBOL(tcp_md5_needed);
/* __tcp_md5_do_lookup() - longest-prefix search of @sk's MD5 key list.
 * @sk:      socket whose tp->md5sig_info list is walked (under RCU or
 *           the socket lock, see the in-body comment)
 * @l3index: L3 domain index of the peer; keys with a non-zero l3index
 *           only match when it equals @l3index
 * @addr:    peer address to match
 *
 * Keys of a different family are skipped.  For AF_INET the key address
 * and @addr are compared under the mask built from the key's
 * prefixlen; for AF_INET6 (when IPv6 is enabled) ipv6_prefix_equal()
 * is used.  Among matching keys, the one with the largest prefixlen is
 * remembered in best_match.
 */
1024 /* Find the Key structure for an address. */
1025 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1026 const union tcp_md5_addr *addr,
1029 const struct tcp_sock *tp = tcp_sk(sk);
1030 struct tcp_md5sig_key *key;
1031 const struct tcp_md5sig_info *md5sig;
1033 struct tcp_md5sig_key *best_match = NULL;
1036 /* caller either holds rcu_read_lock() or socket lock */
1037 md5sig = rcu_dereference_check(tp->md5sig_info,
1038 lockdep_sock_is_held(sk));
1042 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1043 lockdep_sock_is_held(sk)) {
1044 if (key->family != family)
1046 if (key->l3index && key->l3index != l3index)
1048 if (family == AF_INET) {
1049 mask = inet_make_mask(key->prefixlen);
1050 match = (key->addr.a4.s_addr & mask) ==
1051 (addr->a4.s_addr & mask);
1052 #if IS_ENABLED(CONFIG_IPV6)
1053 } else if (family == AF_INET6) {
1054 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1061 if (match && (!best_match ||
1062 key->prefixlen > best_match->prefixlen))
1067 EXPORT_SYMBOL(__tcp_md5_do_lookup);
/* Exact-match variant of the MD5 key lookup on @sk.
 *
 * Walks tp->md5sig_info under RCU or the socket lock.  Keys of another
 * family, or with a non-zero l3index different from the requested one,
 * are skipped.  A key matches when its address bytes equal @addr
 * (sizeof(struct in_addr) bytes, or sizeof(struct in6_addr) for
 * AF_INET6 when IPv6 is enabled) and its prefixlen equals @prefixlen.
 */
1069 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1070 const union tcp_md5_addr *addr,
1071 int family, u8 prefixlen,
1074 const struct tcp_sock *tp = tcp_sk(sk);
1075 struct tcp_md5sig_key *key;
1076 unsigned int size = sizeof(struct in_addr);
1077 const struct tcp_md5sig_info *md5sig;
1079 /* caller either holds rcu_read_lock() or socket lock */
1080 md5sig = rcu_dereference_check(tp->md5sig_info,
1081 lockdep_sock_is_held(sk));
1084 #if IS_ENABLED(CONFIG_IPV6)
1085 if (family == AF_INET6)
1086 size = sizeof(struct in6_addr);
1088 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1089 lockdep_sock_is_held(sk)) {
1090 if (key->family != family)
1092 if (key->l3index && key->l3index != l3index)
1094 if (!memcmp(&key->addr, addr, size) &&
1095 key->prefixlen == prefixlen)
/* Look up the MD5 key on @sk that applies to the peer of @addr_sk.
 *
 * The peer address is @addr_sk's sk_daddr and the L3 index is derived
 * from @addr_sk's bound device via l3mdev_master_ifindex_by_index().
 * Returns whatever tcp_md5_do_lookup() returns for AF_INET.
 */
1101 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1102 const struct sock *addr_sk)
1104 const union tcp_md5_addr *addr;
1107 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1108 addr_sk->sk_bound_dev_if);
1109 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1110 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1112 EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* tcp_md5_do_add() - add or update an MD5 key on @sk.
 * @sk:        socket owning the key list
 * @addr:      peer address the key applies to
 * @family:    AF_INET or AF_INET6; selects how many address bytes are
 *             copied into the key
 * @prefixlen: prefix length stored with the key
 * @l3index:   L3 domain index stored with the key
 * @newkey, @newkeylen: key material and its length
 * @gfp:       allocation flags for the list head and the key
 *
 * An exact-match lookup is done first.  For an existing entry the key
 * bytes are overwritten in place and keylen is published with
 * WRITE_ONCE() (see the in-body comments on concurrent readers).
 *
 * Otherwise the per-socket md5sig_info is fetched; the allocation path
 * creates it with kmalloc(), removes GSO capabilities from the socket
 * with sk_nocaps_add(), initialises the list head and publishes it
 * with rcu_assign_pointer().  The new key is allocated zeroed with
 * sock_kmalloc(); if tcp_alloc_md5sig_pool() fails the key is released
 * again with sock_kfree_s().  On success the fields are filled in and
 * the key is inserted at the head of the list with
 * hlist_add_head_rcu().
 */
1114 /* This can be called on a newly created socket, from other files */
1115 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1116 int family, u8 prefixlen, int l3index,
1117 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1119 /* Add Key to the list */
1120 struct tcp_md5sig_key *key;
1121 struct tcp_sock *tp = tcp_sk(sk);
1122 struct tcp_md5sig_info *md5sig;
1124 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1126 /* Pre-existing entry - just update that one.
1127 * Note that the key might be used concurrently.
1128 * data_race() is telling kcsan that we do not care of
1129 * key mismatches, since changing MD5 key on live flows
1130 * can lead to packet drops.
1132 data_race(memcpy(key->key, newkey, newkeylen));
1134 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1135 * Also note that a reader could catch new key->keylen value
1136 * but old key->key[], this is the reason we use __GFP_ZERO
1137 * at sock_kmalloc() time below these lines.
1139 WRITE_ONCE(key->keylen, newkeylen);
1144 md5sig = rcu_dereference_protected(tp->md5sig_info,
1145 lockdep_sock_is_held(sk));
1147 md5sig = kmalloc(sizeof(*md5sig), gfp);
1151 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1152 INIT_HLIST_HEAD(&md5sig->head);
1153 rcu_assign_pointer(tp->md5sig_info, md5sig);
1156 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1159 if (!tcp_alloc_md5sig_pool()) {
1160 sock_kfree_s(sk, key, sizeof(*key));
1164 memcpy(key->key, newkey, newkeylen);
1165 key->keylen = newkeylen;
1166 key->family = family;
1167 key->prefixlen = prefixlen;
1168 key->l3index = l3index;
1169 memcpy(&key->addr, addr,
1170 (family == AF_INET6) ? sizeof(struct in6_addr) :
1171 sizeof(struct in_addr));
1172 hlist_add_head_rcu(&key->node, &md5sig->head);
1175 EXPORT_SYMBOL(tcp_md5_do_add);
/* Remove the MD5 key on @sk that exactly matches @addr, @family,
 * @prefixlen and @l3index.
 *
 * The key is unlinked with hlist_del_rcu(), its size is subtracted
 * from sk_omem_alloc, and the memory is released after a grace period
 * with kfree_rcu().
 */
1177 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1178 u8 prefixlen, int l3index)
1180 struct tcp_md5sig_key *key;
1182 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1185 hlist_del_rcu(&key->node);
1186 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1187 kfree_rcu(key, rcu);
1190 EXPORT_SYMBOL(tcp_md5_do_del);
/* Drop every MD5 key attached to @sk.
 *
 * Iterates the md5sig_info list with the _safe iterator so entries can
 * be removed while walking; each key is unlinked, un-accounted from
 * sk_omem_alloc and freed with kfree_rcu().
 */
1192 static void tcp_clear_md5_list(struct sock *sk)
1194 struct tcp_sock *tp = tcp_sk(sk);
1195 struct tcp_md5sig_key *key;
1196 struct hlist_node *n;
1197 struct tcp_md5sig_info *md5sig;
1199 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1201 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1202 hlist_del_rcu(&key->node);
1203 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1204 kfree_rcu(key, rcu);
/* tcp_v4_parse_md5_keys() - setsockopt backend for TCP MD5 keys.
 * @sk:      socket being configured
 * @optname: option name; TCP_MD5SIG_EXT enables the flag-controlled
 *           prefix and ifindex fields
 * @optval, @optlen: user buffer holding a struct tcp_md5sig
 *
 * @optlen is compared with sizeof(struct tcp_md5sig), the structure is
 * copied in with copy_from_sockptr(), and the embedded address is
 * tested for AF_INET.
 *
 * With TCP_MD5SIG_EXT and TCP_MD5SIG_FLAG_PREFIX the prefix length
 * comes from tcpm_prefixlen.  With TCP_MD5SIG_EXT and
 * TCP_MD5SIG_FLAG_IFINDEX the device tcpm_ifindex is looked up and,
 * when it is an L3 master device, its ifindex becomes l3index; a
 * missing device or zero l3index is tested afterwards.
 *
 * A zero tcpm_keylen deletes the key via tcp_md5_do_del(); a length
 * above TCP_MD5SIG_MAXKEYLEN is tested for; otherwise the key is added
 * with tcp_md5_do_add() using GFP_KERNEL.
 */
1208 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1209 sockptr_t optval, int optlen)
1211 struct tcp_md5sig cmd;
1212 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1213 const union tcp_md5_addr *addr;
1217 if (optlen < sizeof(cmd))
1220 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1223 if (sin->sin_family != AF_INET)
1226 if (optname == TCP_MD5SIG_EXT &&
1227 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1228 prefixlen = cmd.tcpm_prefixlen;
1233 if (optname == TCP_MD5SIG_EXT &&
1234 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1235 struct net_device *dev;
1238 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1239 if (dev && netif_is_l3_master(dev))
1240 l3index = dev->ifindex;
1244 /* ok to reference set/not set outside of rcu;
1245 * right now device MUST be an L3 master
1247 if (!dev || !l3index)
1251 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1253 if (!cmd.tcpm_keylen)
1254 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1256 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1259 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1260 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
/* Feed the IPv4 pseudo-header and the TCP header into the MD5 request
 * of pool entry @hp.
 *
 * The pseudo-header carries IPPROTO_TCP and @nbytes (big-endian) as
 * length; a copy of @th is placed directly after it.  Both are hashed
 * as one scatterlist entry of sizeof(pseudo-header) +
 * sizeof(struct tcphdr) bytes.  Returns the result of
 * crypto_ahash_update().
 */
1263 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1264 __be32 daddr, __be32 saddr,
1265 const struct tcphdr *th, int nbytes)
1267 struct tcp4_pseudohdr *bp;
1268 struct scatterlist sg;
1275 bp->protocol = IPPROTO_TCP;
1276 bp->len = cpu_to_be16(nbytes);
1278 _th = (struct tcphdr *)(bp + 1);
1279 memcpy(_th, th, sizeof(*th));
1282 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1283 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1284 sizeof(*bp) + sizeof(*th));
1285 return crypto_ahash_update(hp->md5_req);
/*
 * Compute the MD5 signature of a header-only segment into @md5_hash.
 *
 * The hash covers the pseudo-header/TCP header (with th->doff << 2 as the
 * segment length) followed by @key. The per-CPU pool obtained from
 * tcp_get_md5sig_pool() is released with tcp_put_md5sig_pool().
 *
 * The tail of the function zeroes the 16-byte @md5_hash; presumably this is
 * the error path reached through the clear_hash labels - TODO confirm, the
 * labels and return statements are not visible in this excerpt.
 */
1288 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1289 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1291 struct tcp_md5sig_pool *hp;
1292 struct ahash_request *req;
1294 hp = tcp_get_md5sig_pool();
1296 goto clear_hash_noput;
1299 if (crypto_ahash_init(req))
1301 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1303 if (tcp_md5_hash_key(hp, key))
1305 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1306 if (crypto_ahash_final(req))
1309 tcp_put_md5sig_pool();
1313 tcp_put_md5sig_pool();
1315 memset(md5_hash, 0, 16);
/*
 * Compute the MD5 signature of a full segment (@skb) into @md5_hash.
 *
 * When @sk is non-NULL the addresses come from the socket (sk_rcv_saddr and
 * sk_daddr); otherwise the IP header of @skb is consulted.
 *
 * The hash covers, in order: the pseudo-header/TCP header with skb->len as
 * the segment length, the skb data starting at offset th->doff << 2, and
 * @key.
 *
 * As in tcp_v4_md5_hash_hdr(), the tail of the function releases the pool
 * and zeroes the 16-byte @md5_hash; presumably the error path - TODO confirm.
 */
1319 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1320 const struct sock *sk,
1321 const struct sk_buff *skb)
1323 struct tcp_md5sig_pool *hp;
1324 struct ahash_request *req;
1325 const struct tcphdr *th = tcp_hdr(skb);
1326 __be32 saddr, daddr;
1328 if (sk) { /* valid for establish/request sockets */
1329 saddr = sk->sk_rcv_saddr;
1330 daddr = sk->sk_daddr;
1332 const struct iphdr *iph = ip_hdr(skb);
1337 hp = tcp_get_md5sig_pool();
1339 goto clear_hash_noput;
1342 if (crypto_ahash_init(req))
1345 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1347 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1349 if (tcp_md5_hash_key(hp, key))
1351 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1352 if (crypto_ahash_final(req))
1355 tcp_put_md5sig_pool();
1359 tcp_put_md5sig_pool();
1361 memset(md5_hash, 0, 16);
1364 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1368 /* Called with rcu_read_lock() */
/*
 * Validate the MD5 signature option of an incoming segment against the key
 * configured on @sk for the segment's source address.
 *
 * l3index is @dif when @sdif is set, otherwise 0. The expected key is looked
 * up with tcp_md5_do_lookup() and the option is located with
 * tcp_parse_md5sig_option(). Mismatches bump LINUX_MIB_TCPMD5NOTFOUND,
 * LINUX_MIB_TCPMD5UNEXPECTED or LINUX_MIB_TCPMD5FAILURE; a failed comparison
 * is also logged rate-limited.
 *
 * The callers in tcp_v4_rcv() drop the segment when this returns true.
 */
1369 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1370 const struct sk_buff *skb,
1373 #ifdef CONFIG_TCP_MD5SIG
1375 * This gets called for each TCP segment that arrives
1376 * so we want to be efficient.
1377 * We have 3 drop cases:
1378 * o No MD5 hash and one expected.
1379 * o MD5 hash and we're not expecting one.
1380 * o MD5 hash and its wrong.
1382 const __u8 *hash_location = NULL;
1383 struct tcp_md5sig_key *hash_expected;
1384 const struct iphdr *iph = ip_hdr(skb);
1385 const struct tcphdr *th = tcp_hdr(skb);
1386 const union tcp_md5_addr *addr;
1387 unsigned char newhash[16];
1388 int genhash, l3index;
1390 /* sdif set, means packet ingressed via a device
1391 * in an L3 domain and dif is set to the l3mdev
1393 l3index = sdif ? dif : 0;
1395 addr = (union tcp_md5_addr *)&iph->saddr;
1396 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1397 hash_location = tcp_parse_md5sig_option(th);
1399 /* We've parsed the options - do we have a hash? */
1400 if (!hash_expected && !hash_location)
1403 if (hash_expected && !hash_location) {
1404 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1408 if (!hash_expected && hash_location) {
1409 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1413 /* Okay, so this is hash_expected and hash_location -
1414 * so we need to calculate the checksum.
1416 genhash = tcp_v4_md5_hash_skb(newhash,
1420 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1421 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1422 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1423 &iph->saddr, ntohs(th->source),
1424 &iph->daddr, ntohs(th->dest),
1425 genhash ? " tcp_v4_calc_md5_hash failed"
/*
 * Initialise the IPv4 part of a new request sock from the incoming SYN.
 *
 * The request's local address is the packet's destination address and its
 * remote address is the packet's source address. The IP options returned by
 * tcp_v4_save_options() are stored in ireq->ireq_opt.
 */
1434 static void tcp_v4_init_req(struct request_sock *req,
1435 const struct sock *sk_listener,
1436 struct sk_buff *skb)
1438 struct inet_request_sock *ireq = inet_rsk(req);
1439 struct net *net = sock_net(sk_listener);
1441 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1442 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1443 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
/*
 * Route lookup for a request sock: thin wrapper that passes the IPv4 part
 * of the flow (fl->u.ip4) to inet_csk_route_req() and returns its result.
 */
1446 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1448 const struct request_sock *req)
1450 return inet_csk_route_req(sk, &fl->u.ip4, req);
/*
 * Request-sock operations used for TCP over IPv4: object size plus the
 * callbacks for SYN-ACK retransmission, sending ACK/RST, destruction and
 * SYN-ACK timeout handling.
 */
1453 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1455 .obj_size = sizeof(struct tcp_request_sock),
1456 .rtx_syn_ack = tcp_rtx_synack,
1457 .send_ack = tcp_v4_reqsk_send_ack,
1458 .destructor = tcp_v4_reqsk_destructor,
1459 .send_reset = tcp_v4_send_reset,
1460 .syn_ack_timeout = tcp_syn_ack_timeout,
/*
 * IPv4-specific TCP request-sock operations. The MD5 callbacks are present
 * only with CONFIG_TCP_MD5SIG and the cookie sequence generator only with
 * CONFIG_SYN_COOKIES. This table is passed to tcp_conn_request() and
 * tcp_get_syncookie_mss() together with tcp_request_sock_ops.
 */
1463 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1464 .mss_clamp = TCP_MSS_DEFAULT,
1465 #ifdef CONFIG_TCP_MD5SIG
1466 .req_md5_lookup = tcp_v4_md5_lookup,
1467 .calc_md5_hash = tcp_v4_md5_hash_skb,
1469 .init_req = tcp_v4_init_req,
1470 #ifdef CONFIG_SYN_COOKIES
1471 .cookie_init_seq = cookie_v4_init_sequence,
1473 .route_req = tcp_v4_route_req,
1474 .init_seq = tcp_v4_init_seq,
1475 .init_ts_off = tcp_v4_init_ts_off,
1476 .send_synack = tcp_v4_send_synack,
/*
 * Handle an incoming connection request (SYN) on listener @sk.
 *
 * SYNs whose route is flagged RTCF_BROADCAST or RTCF_MULTICAST are not
 * answered; everything else is handed to the generic tcp_conn_request()
 * with the IPv4 request-sock operation tables.
 */
1479 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1481 /* Never answer to SYNs sent to broadcast or multicast */
1482 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1485 return tcp_conn_request(&tcp_request_sock_ops,
1486 &tcp_request_sock_ipv4_ops, sk, skb);
1492 EXPORT_SYMBOL(tcp_v4_conn_request);
1496 * The three way handshake has completed - we got a valid synack -
1497 * now create the new socket.
/*
 * Create the child socket for request @req on listener @sk.
 *
 * Visible steps, in order:
 *  - check sk_acceptq_is_full() on the listener;
 *  - clone the listener with tcp_create_openreq_child();
 *  - copy addresses, bound device, IP options, multicast index/TTL and the
 *    received TOS from the request and from @skb into the child;
 *  - with sysctl_tcp_reflect_tos set, derive the child's TOS from the SYN's
 *    TOS with the ECN bits masked out;
 *  - route with inet_csk_route_child_sock(), then set capabilities,
 *    congestion control and the MSS values from the route;
 *  - with CONFIG_TCP_MD5SIG, copy the listener's key for the peer onto the
 *    child (GFP_ATOMIC; an allocation failure leaves the key uncopied) and
 *    disable GSO on the child;
 *  - inherit the local port and insert the child into the established hash;
 *    *own_req receives the inet_ehash_nolisten() result.
 *
 * LINUX_MIB_LISTENOVERFLOWS is incremented and
 * inet_csk_prepare_forced_close() is called near the end of the function;
 * presumably these are the failure exits - TODO confirm, the labels are not
 * visible in this excerpt.
 */
1499 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1500 struct request_sock *req,
1501 struct dst_entry *dst,
1502 struct request_sock *req_unhash,
1505 struct inet_request_sock *ireq;
1506 bool found_dup_sk = false;
1507 struct inet_sock *newinet;
1508 struct tcp_sock *newtp;
1510 #ifdef CONFIG_TCP_MD5SIG
1511 const union tcp_md5_addr *addr;
1512 struct tcp_md5sig_key *key;
1515 struct ip_options_rcu *inet_opt;
1517 if (sk_acceptq_is_full(sk))
1520 newsk = tcp_create_openreq_child(sk, req, skb);
1524 newsk->sk_gso_type = SKB_GSO_TCPV4;
1525 inet_sk_rx_dst_set(newsk, skb);
1527 newtp = tcp_sk(newsk);
1528 newinet = inet_sk(newsk);
1529 ireq = inet_rsk(req);
1530 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1531 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1532 newsk->sk_bound_dev_if = ireq->ir_iif;
1533 newinet->inet_saddr = ireq->ir_loc_addr;
1534 inet_opt = rcu_dereference(ireq->ireq_opt);
1535 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1536 newinet->mc_index = inet_iif(skb);
1537 newinet->mc_ttl = ip_hdr(skb)->ttl;
1538 newinet->rcv_tos = ip_hdr(skb)->tos;
1539 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1541 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1542 newinet->inet_id = prandom_u32();
1544 /* Set ToS of the new socket based upon the value of incoming SYN. */
1545 if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1546 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1549 dst = inet_csk_route_child_sock(sk, newsk, req);
1553 /* syncookie case : see end of cookie_v4_check() */
1555 sk_setup_caps(newsk, dst);
1557 tcp_ca_openreq_child(newsk, dst);
1559 tcp_sync_mss(newsk, dst_mtu(dst));
1560 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1562 tcp_initialize_rcv_mss(newsk);
1564 #ifdef CONFIG_TCP_MD5SIG
1565 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1566 /* Copy over the MD5 key from the original socket */
1567 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1568 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1571 * We're using one, so create a matching key
1572 * on the newsk structure. If we fail to get
1573 * memory, then we end up not copying the key
1576 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1577 key->key, key->keylen, GFP_ATOMIC);
1578 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1582 if (__inet_inherit_port(sk, newsk) < 0)
1584 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1586 if (likely(*own_req)) {
1587 tcp_move_syn(newtp, req);
1588 ireq->ireq_opt = NULL;
1590 if (!req_unhash && found_dup_sk) {
1591 /* This code path should only be executed in the
1592 * syncookie case only
1594 bh_unlock_sock(newsk);
1598 newinet->inet_opt = NULL;
1604 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1611 newinet->inet_opt = NULL;
1612 inet_csk_prepare_forced_close(newsk);
1616 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
/*
 * SYN-cookie hook for listeners. With CONFIG_SYN_COOKIES the socket may be
 * replaced by the result of cookie_v4_check(); the condition on the TCP
 * header that guards the call is not visible in this excerpt.
 * tcp_v4_do_rcv() uses the result as the socket to continue processing with.
 */
1618 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1620 #ifdef CONFIG_SYN_COOKIES
1621 const struct tcphdr *th = tcp_hdr(skb);
1624 sk = cookie_v4_check(sk, skb);
/*
 * Generate a SYN cookie for the given IPv4/TCP headers.
 *
 * With CONFIG_SYN_COOKIES the MSS is obtained from tcp_get_syncookie_mss(),
 * the cookie is written to *@cookie by __cookie_v4_init_sequence() and the
 * listener is marked with tcp_synq_overflow(). The returned value and any
 * condition on the MSS are not visible in this excerpt.
 */
1629 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1630 struct tcphdr *th, u32 *cookie)
1633 #ifdef CONFIG_SYN_COOKIES
1634 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1635 &tcp_request_sock_ipv4_ops, sk, th);
1637 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1638 tcp_synq_overflow(sk);
1644 /* The socket must have its spinlock held when we get
1645 * here, unless it is a TCP_LISTEN socket.
1647 * We have a potential double-lock case here, so even when
1648 * doing backlog processing we use the BH locking scheme.
1649 * This is because we cannot sleep with the original spinlock
/*
 * Process one segment for socket @sk (also installed as the backlog_rcv
 * handler in tcp_prot).
 *
 * ESTABLISHED fast path: record the RX hash and NAPI id, drop the cached
 * receive route (sk->sk_rx_dst) when the ingress interface changed or the
 * route's check() callback fails, then call tcp_rcv_established().
 *
 * Otherwise the checksum is completed first. A LISTEN socket goes through
 * tcp_v4_cookie_check() and tcp_child_process(); other states go through
 * tcp_rcv_state_process(). A reset is sent with tcp_v4_send_reset(), and
 * checksum failures bump TCP_MIB_CSUMERRORS and TCP_MIB_INERRS.
 */
1652 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1656 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1657 struct dst_entry *dst = sk->sk_rx_dst;
1659 sock_rps_save_rxhash(sk, skb);
1660 sk_mark_napi_id(sk, skb);
1662 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1663 !dst->ops->check(dst, 0)) {
1665 sk->sk_rx_dst = NULL;
1668 tcp_rcv_established(sk, skb);
1672 if (tcp_checksum_complete(skb))
1675 if (sk->sk_state == TCP_LISTEN) {
1676 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1681 if (tcp_child_process(sk, nsk, skb)) {
1688 sock_rps_save_rxhash(sk, skb);
1690 if (tcp_rcv_state_process(sk, skb)) {
1697 tcp_v4_send_reset(rsk, skb);
1700 /* Be careful here. If this function gets more complicated and
1701 * gcc suffers from register pressure on the x86, sk (in %ebx)
1702 * might be destroyed here. This current version compiles correctly,
1703 * but you have been warned.
1708 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1709 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1712 EXPORT_SYMBOL(tcp_v4_do_rcv);
/*
 * Early demultiplexing: find the established socket for @skb and, when
 * possible, attach the socket's cached receive route.
 *
 * Only PACKET_HOST packets with a pullable TCP header and a sane data offset
 * are considered. The lookup uses __inet_lookup_established(). On a hit the
 * skb destructor becomes sock_edemux; for a full socket the cached
 * sk->sk_rx_dst is validated with dst_check() and, if the ingress interface
 * matches rx_dst_ifindex, set on the skb without taking a reference.
 */
1714 int tcp_v4_early_demux(struct sk_buff *skb)
1716 const struct iphdr *iph;
1717 const struct tcphdr *th;
1720 if (skb->pkt_type != PACKET_HOST)
1723 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1729 if (th->doff < sizeof(struct tcphdr) / 4)
1732 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1733 iph->saddr, th->source,
1734 iph->daddr, ntohs(th->dest),
1735 skb->skb_iif, inet_sdif(skb));
1738 skb->destructor = sock_edemux;
1739 if (sk_fullsock(sk)) {
1740 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1743 dst = dst_check(dst, 0);
1745 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1746 skb_dst_set_noref(skb, dst);
/*
 * Queue @skb on the backlog of @sk.
 *
 * The checksum is verified first (failures bump TCP_MIB_CSUMERRORS and
 * TCP_MIB_INERRS). The segment is then merged into the current backlog tail
 * when all of the following hold: the sequence numbers are contiguous, the
 * DS field matches, neither segment carries SYN/RST/URG, both carry ACK,
 * ECE/CWR agree, the TLS decrypted state matches (CONFIG_TLS_DEVICE), and
 * the TCP headers have the same length and identical option bytes.
 *
 * On a successful merge the tail's end_seq, ack_seq/window, FIN flag,
 * tcp_flags, timestamps and GSO size/segment count are updated and
 * LINUX_MIB_TCPBACKLOGCOALESCE is incremented. Otherwise the skb is queued
 * with sk_add_backlog() against a limit derived from sk_rcvbuf + sk_sndbuf;
 * a failure there bumps LINUX_MIB_TCPBACKLOGDROP.
 *
 * tcp_v4_rcv() discards the segment when this returns true.
 */
1752 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1754 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1755 struct skb_shared_info *shinfo;
1756 const struct tcphdr *th;
1757 struct tcphdr *thtail;
1758 struct sk_buff *tail;
1759 unsigned int hdrlen;
1764 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1765 * we can fix skb->truesize to its real value to avoid future drops.
1766 * This is valid because skb is not yet charged to the socket.
1767 * It has been noticed pure SACK packets were sometimes dropped
1768 * (if cooked by drivers without copybreak feature).
1774 if (unlikely(tcp_checksum_complete(skb))) {
1776 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1777 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1781 /* Attempt coalescing to last skb in backlog, even if we are
1783 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1785 th = (const struct tcphdr *)skb->data;
1786 hdrlen = th->doff * 4;
1787 shinfo = skb_shinfo(skb);
1789 if (!shinfo->gso_size)
1790 shinfo->gso_size = skb->len - hdrlen;
1792 if (!shinfo->gso_segs)
1793 shinfo->gso_segs = 1;
1795 tail = sk->sk_backlog.tail;
1798 thtail = (struct tcphdr *)tail->data;
1800 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1801 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1802 ((TCP_SKB_CB(tail)->tcp_flags |
1803 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1804 !((TCP_SKB_CB(tail)->tcp_flags &
1805 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1806 ((TCP_SKB_CB(tail)->tcp_flags ^
1807 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1808 #ifdef CONFIG_TLS_DEVICE
1809 tail->decrypted != skb->decrypted ||
1811 thtail->doff != th->doff ||
1812 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1815 __skb_pull(skb, hdrlen);
1816 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1817 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1819 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1820 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1821 thtail->window = th->window;
1824 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1825 * thtail->fin, so that the fast path in tcp_rcv_established()
1826 * is not entered if we append a packet with a FIN.
1827 * SYN, RST, URG are not present.
1828 * ACK is set on both packets.
1829 * PSH : we do not really care in TCP stack,
1830 * at least for 'GRO' packets.
1832 thtail->fin |= th->fin;
1833 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1835 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1836 TCP_SKB_CB(tail)->has_rxtstamp = true;
1837 tail->tstamp = skb->tstamp;
1838 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1841 /* Not as strict as GRO. We only need to carry mss max value */
1842 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1843 skb_shinfo(tail)->gso_size);
1845 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1846 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1848 sk->sk_backlog.len += delta;
1849 __NET_INC_STATS(sock_net(sk),
1850 LINUX_MIB_TCPBACKLOGCOALESCE);
1851 kfree_skb_partial(skb, fragstolen);
1854 __skb_push(skb, hdrlen);
1857 /* Only socket owner can try to collapse/prune rx queues
1858 * to reduce memory overhead, so add a little headroom here.
1859 * Few sockets backlog are possibly concurrently non empty.
1863 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1865 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1870 EXPORT_SYMBOL(tcp_add_backlog);
/*
 * Run the socket filter on @skb, passing the TCP header length
 * (th->doff * 4) as the trim cap to sk_filter_trim_cap(). Returns that
 * function's result; callers in this file treat non-zero as "drop".
 */
1872 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1874 struct tcphdr *th = (struct tcphdr *)skb->data;
1876 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1878 EXPORT_SYMBOL(tcp_filter);
/*
 * Undo tcp_v4_fill_cb(): move the saved struct inet_skb_parm from
 * TCP_SKB_CB(skb)->header.h4 back to IPCB(skb). memmove() is used for the
 * copy between the two views of the control block.
 */
1880 static void tcp_v4_restore_cb(struct sk_buff *skb)
1882 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1883 sizeof(struct inet_skb_parm));
/*
 * Populate the TCP control block of @skb from its IP and TCP headers.
 *
 * The IP control block is first moved to TCP_SKB_CB(skb)->header.h4. Then
 * seq/ack_seq are converted to host order, end_seq is computed as
 * seq + SYN + FIN + payload length (skb->len minus the TCP header length),
 * and the flags byte, DS field and RX timestamp presence are recorded;
 * tcp_tw_isn and sacked start at 0.
 */
1886 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1887 const struct tcphdr *th)
1889 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1890 * barrier() makes sure compiler wont play fool^Waliasing games.
1892 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1893 sizeof(struct inet_skb_parm));
1896 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1897 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1898 skb->len - th->doff * 4);
1899 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1900 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1901 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1902 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1903 TCP_SKB_CB(skb)->sacked = 0;
1904 TCP_SKB_CB(skb)->has_rxtstamp =
1905 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
/*
 * Main receive entry point for IPv4 TCP segments.
 *
 * Visible flow:
 *  - only PACKET_HOST packets are considered; TCP_MIB_INSEGS is counted
 *    even for bad segments;
 *  - the TCP header is pulled and its data offset validated, then the
 *    checksum state is initialised with skb_checksum_init();
 *  - the socket is looked up with __inet_lookup_skb();
 *  - TCP_TIME_WAIT sockets are handled by tcp_timewait_state_process(); a
 *    listener found via inet_lookup_listener() can take over the segment;
 *  - TCP_NEW_SYN_RECV (request socks) are validated against their listener
 *    (MD5, checksum, listener still in TCP_LISTEN, socket filter) and passed
 *    to tcp_check_req() / tcp_child_process();
 *  - other sockets are checked for minimum TTL, xfrm policy, MD5 and socket
 *    filter; LISTEN sockets go straight to tcp_v4_do_rcv(), the rest are
 *    locked with bh_lock_sock_nested() and either processed directly or,
 *    when owned by user context, queued with tcp_add_backlog();
 *  - when no socket is involved (NULL is passed for it), the policy and
 *    checksum are verified and a reset is sent with tcp_v4_send_reset().
 *
 * NOTE(review): labels, return statements and several branches are not
 * visible in this excerpt; the summary covers only the lines shown.
 */
1912 int tcp_v4_rcv(struct sk_buff *skb)
1914 struct net *net = dev_net(skb->dev);
1915 struct sk_buff *skb_to_free;
1916 int sdif = inet_sdif(skb);
1917 int dif = inet_iif(skb);
1918 const struct iphdr *iph;
1919 const struct tcphdr *th;
1924 if (skb->pkt_type != PACKET_HOST)
1927 /* Count it even if it's bad */
1928 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1930 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1933 th = (const struct tcphdr *)skb->data;
1935 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1937 if (!pskb_may_pull(skb, th->doff * 4))
1940 /* An explanation is required here, I think.
1941 * Packet length and doff are validated by header prediction,
1942 * provided case of th->doff==0 is eliminated.
1943 * So, we defer the checks. */
1945 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1948 th = (const struct tcphdr *)skb->data;
1951 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1952 th->dest, sdif, &refcounted);
1957 if (sk->sk_state == TCP_TIME_WAIT)
1960 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1961 struct request_sock *req = inet_reqsk(sk);
1962 bool req_stolen = false;
1965 sk = req->rsk_listener;
1966 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1967 sk_drops_add(sk, skb);
1971 if (tcp_checksum_complete(skb)) {
1975 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1976 inet_csk_reqsk_queue_drop_and_put(sk, req);
1979 /* We own a reference on the listener, increase it again
1980 * as we might lose it too soon.
1985 if (!tcp_filter(sk, skb)) {
1986 th = (const struct tcphdr *)skb->data;
1988 tcp_v4_fill_cb(skb, iph, th);
1989 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1994 /* Another cpu got exclusive access to req
1995 * and created a full blown socket.
1996 * Try to feed this packet to this socket
1997 * instead of discarding it.
1999 tcp_v4_restore_cb(skb);
2003 goto discard_and_relse;
2007 tcp_v4_restore_cb(skb);
2008 } else if (tcp_child_process(sk, nsk, skb)) {
2009 tcp_v4_send_reset(nsk, skb);
2010 goto discard_and_relse;
2016 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2017 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2018 goto discard_and_relse;
2021 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2022 goto discard_and_relse;
2024 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2025 goto discard_and_relse;
2029 if (tcp_filter(sk, skb))
2030 goto discard_and_relse;
2031 th = (const struct tcphdr *)skb->data;
2033 tcp_v4_fill_cb(skb, iph, th);
2037 if (sk->sk_state == TCP_LISTEN) {
2038 ret = tcp_v4_do_rcv(sk, skb);
2039 goto put_and_return;
2042 sk_incoming_cpu_update(sk);
2044 bh_lock_sock_nested(sk);
2045 tcp_segs_in(tcp_sk(sk), skb);
2047 if (!sock_owned_by_user(sk)) {
2048 skb_to_free = sk->sk_rx_skb_cache;
2049 sk->sk_rx_skb_cache = NULL;
2050 ret = tcp_v4_do_rcv(sk, skb);
2052 if (tcp_add_backlog(sk, skb))
2053 goto discard_and_relse;
2058 __kfree_skb(skb_to_free);
2067 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2070 tcp_v4_fill_cb(skb, iph, th);
2072 if (tcp_checksum_complete(skb)) {
2074 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2076 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2078 tcp_v4_send_reset(NULL, skb);
2082 /* Discard frame. */
2087 sk_drops_add(sk, skb);
2093 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2094 inet_twsk_put(inet_twsk(sk));
2098 tcp_v4_fill_cb(skb, iph, th);
2100 if (tcp_checksum_complete(skb)) {
2101 inet_twsk_put(inet_twsk(sk));
2104 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2106 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2109 iph->saddr, th->source,
2110 iph->daddr, th->dest,
2114 inet_twsk_deschedule_put(inet_twsk(sk));
2116 tcp_v4_restore_cb(skb);
2124 tcp_v4_timewait_ack(sk, skb);
2127 tcp_v4_send_reset(sk, skb);
2128 inet_twsk_deschedule_put(inet_twsk(sk));
2130 case TCP_TW_SUCCESS:;
/*
 * TIME_WAIT socket operations for TCP: object size plus the uniqueness
 * check and destructor callbacks.
 */
2135 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2136 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2137 .twsk_unique = tcp_twsk_unique,
2138 .twsk_destructor= tcp_twsk_destructor,
/*
 * Cache the route of @skb on @sk for later receive-path use.
 *
 * Nothing is cached unless the skb has a dst and dst_hold_safe() succeeds.
 * The ingress interface index is stored alongside so that
 * tcp_v4_do_rcv() and tcp_v4_early_demux() can detect an interface change.
 */
2141 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2143 struct dst_entry *dst = skb_dst(skb);
2145 if (dst && dst_hold_safe(dst)) {
2146 sk->sk_rx_dst = dst;
2147 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2150 EXPORT_SYMBOL(inet_sk_rx_dst_set);
/*
 * Address-family operations for TCP over IPv4; installed as icsk_af_ops by
 * tcp_v4_init_sock().
 */
2152 const struct inet_connection_sock_af_ops ipv4_specific = {
2153 .queue_xmit = ip_queue_xmit,
2154 .send_check = tcp_v4_send_check,
2155 .rebuild_header = inet_sk_rebuild_header,
2156 .sk_rx_dst_set = inet_sk_rx_dst_set,
2157 .conn_request = tcp_v4_conn_request,
2158 .syn_recv_sock = tcp_v4_syn_recv_sock,
2159 .net_header_len = sizeof(struct iphdr),
2160 .setsockopt = ip_setsockopt,
2161 .getsockopt = ip_getsockopt,
2162 .addr2sockaddr = inet_csk_addr2sockaddr,
2163 .sockaddr_len = sizeof(struct sockaddr_in),
2164 .mtu_reduced = tcp_v4_mtu_reduced,
2166 EXPORT_SYMBOL(ipv4_specific);
2168 #ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 MD5 signature operations (key lookup, hash computation, option
 * parsing); installed as tp->af_specific by tcp_v4_init_sock().
 */
2169 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2170 .md5_lookup = tcp_v4_md5_lookup,
2171 .calc_md5_hash = tcp_v4_md5_hash_skb,
2172 .md5_parse = tcp_v4_parse_md5_keys,
2176 /* NOTE: A lot of things are set to zero explicitly by the call to
2177 * sk_alloc(), so that need not be done here.
/*
 * Per-socket initialisation for IPv4 TCP (the .init hook of tcp_prot).
 * The visible lines install the IPv4 address-family operations and, with
 * CONFIG_TCP_MD5SIG, the IPv4 MD5 operations.
 */
2179 static int tcp_v4_init_sock(struct sock *sk)
2181 struct inet_connection_sock *icsk = inet_csk(sk);
2185 icsk->icsk_af_ops = &ipv4_specific;
2187 #ifdef CONFIG_TCP_MD5SIG
2188 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
/*
 * Release the TCP-level resources of @sk (the .destroy hook of tcp_prot).
 *
 * In order: stop the transmit timers, clean up congestion control and ULP
 * state, purge the write queue and the out-of-order queue, free the MD5 key
 * list and its container (via RCU) when CONFIG_TCP_MD5SIG is set, handle a
 * referenced bind bucket, free fast-open and saved-SYN state, and decrement
 * the allocated-sockets counter. A non-NULL tp->fastopen_rsk at this point
 * is treated as a bug (BUG_ON).
 */
2194 void tcp_v4_destroy_sock(struct sock *sk)
2196 struct tcp_sock *tp = tcp_sk(sk);
2198 trace_tcp_destroy_sock(sk);
2200 tcp_clear_xmit_timers(sk);
2202 tcp_cleanup_congestion_control(sk);
2204 tcp_cleanup_ulp(sk);
2206 /* Clean up the write buffer. */
2207 tcp_write_queue_purge(sk);
2209 /* Check if we want to disable active TFO */
2210 tcp_fastopen_active_disable_ofo_check(sk);
2212 /* Cleans up our, hopefully empty, out_of_order_queue. */
2213 skb_rbtree_purge(&tp->out_of_order_queue);
2215 #ifdef CONFIG_TCP_MD5SIG
2216 /* Clean up the MD5 key list, if any */
2217 if (tp->md5sig_info) {
2218 tcp_clear_md5_list(sk);
2219 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2220 tp->md5sig_info = NULL;
2224 /* Clean up a referenced TCP bind bucket. */
2225 if (inet_csk(sk)->icsk_bind_hash)
2228 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2230 /* If socket is aborted during connect operation */
2231 tcp_free_fastopen_req(tp);
2232 tcp_fastopen_destroy_cipher(sk);
2233 tcp_saved_syn_free(tp);
2235 sk_sockets_allocated_dec(sk);
2237 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2239 #ifdef CONFIG_PROC_FS
2240 /* Proc filesystem TCP sock list dumping. */
2243 * Get next listener socket following cur. If cur is NULL, get first socket
2244 * starting from bucket given in st->bucket; when st->bucket is zero the
2245 * very first socket in the hash table is returned.
/*
 * Iterator step over the listening hash for the /proc and BPF dumps.
 *
 * The address-family filter comes from st->bpf_seq_afinfo when set,
 * otherwise from the proc entry's private data. Sockets are accepted when
 * they belong to the seq file's network namespace and match the family
 * (AF_UNSPEC matches any). The bucket's spinlock is taken when a bucket is
 * entered and released before moving to the next bucket, so a returned
 * socket is handed back with its bucket lock still held (tcp_seq_stop()
 * drops it).
 */
2247 static void *listening_get_next(struct seq_file *seq, void *cur)
2249 struct tcp_seq_afinfo *afinfo;
2250 struct tcp_iter_state *st = seq->private;
2251 struct net *net = seq_file_net(seq);
2252 struct inet_listen_hashbucket *ilb;
2253 struct hlist_nulls_node *node;
2254 struct sock *sk = cur;
2256 if (st->bpf_seq_afinfo)
2257 afinfo = st->bpf_seq_afinfo;
2259 afinfo = PDE_DATA(file_inode(seq->file));
2263 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2264 spin_lock(&ilb->lock);
2265 sk = sk_nulls_head(&ilb->nulls_head);
2269 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2273 sk = sk_nulls_next(sk);
2275 sk_nulls_for_each_from(sk, node) {
2276 if (!net_eq(sock_net(sk), net))
2278 if (afinfo->family == AF_UNSPEC ||
2279 sk->sk_family == afinfo->family)
2282 spin_unlock(&ilb->lock);
2284 if (++st->bucket < INET_LHTABLE_SIZE)
/*
 * Position the listening iterator: start from the first listener and keep
 * calling listening_get_next() while entries remain and *@pos is non-zero.
 * (*@pos is presumably decremented per step - the statement is not visible
 * in this excerpt.)
 */
2289 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2291 struct tcp_iter_state *st = seq->private;
2296 rc = listening_get_next(seq, NULL);
2298 while (rc && *pos) {
2299 rc = listening_get_next(seq, rc);
/* True when the established-hash chain of the current bucket has no entries. */
2305 static inline bool empty_bucket(const struct tcp_iter_state *st)
2307 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2311 * Get first established socket starting from bucket given in st->bucket.
2312 * If st->bucket is zero, the very first socket in the hash is returned.
/*
 * Scan the established hash from st->bucket upwards for the first socket
 * that matches the family filter (AF_UNSPEC matches any) and the seq file's
 * network namespace. Empty buckets are skipped without taking the bucket
 * lock; the lock of a bucket without a match is released with
 * spin_unlock_bh() before moving on.
 */
2314 static void *established_get_first(struct seq_file *seq)
2316 struct tcp_seq_afinfo *afinfo;
2317 struct tcp_iter_state *st = seq->private;
2318 struct net *net = seq_file_net(seq);
2321 if (st->bpf_seq_afinfo)
2322 afinfo = st->bpf_seq_afinfo;
2324 afinfo = PDE_DATA(file_inode(seq->file));
2327 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2329 struct hlist_nulls_node *node;
2330 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2332 /* Lockless fast path for the common case of empty buckets */
2333 if (empty_bucket(st))
2337 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2338 if ((afinfo->family != AF_UNSPEC &&
2339 sk->sk_family != afinfo->family) ||
2340 !net_eq(sock_net(sk), net)) {
2346 spin_unlock_bh(lock);
/*
 * Advance the established iterator past @cur. The rest of the current chain
 * is searched for a socket matching the family filter and namespace; when
 * the chain is exhausted the bucket lock is released and the scan continues
 * through established_get_first().
 */
2352 static void *established_get_next(struct seq_file *seq, void *cur)
2354 struct tcp_seq_afinfo *afinfo;
2355 struct sock *sk = cur;
2356 struct hlist_nulls_node *node;
2357 struct tcp_iter_state *st = seq->private;
2358 struct net *net = seq_file_net(seq);
2360 if (st->bpf_seq_afinfo)
2361 afinfo = st->bpf_seq_afinfo;
2363 afinfo = PDE_DATA(file_inode(seq->file));
2368 sk = sk_nulls_next(sk);
2370 sk_nulls_for_each_from(sk, node) {
2371 if ((afinfo->family == AF_UNSPEC ||
2372 sk->sk_family == afinfo->family) &&
2373 net_eq(sock_net(sk), net))
2377 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2379 return established_get_first(seq);
/*
 * Position the established iterator at index @pos by starting at the first
 * established socket and stepping with established_get_next(). The loop
 * condition is not visible in this excerpt.
 */
2382 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2384 struct tcp_iter_state *st = seq->private;
2388 rc = established_get_first(seq);
2391 rc = established_get_next(seq, rc);
/*
 * Find the socket at overall index @pos: the listening table is walked
 * first (state TCP_SEQ_STATE_LISTENING), then the established table
 * (state TCP_SEQ_STATE_ESTABLISHED).
 */
2397 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2400 struct tcp_iter_state *st = seq->private;
2402 st->state = TCP_SEQ_STATE_LISTENING;
2403 rc = listening_get_idx(seq, &pos);
2406 st->state = TCP_SEQ_STATE_ESTABLISHED;
2407 rc = established_get_idx(seq, pos);
/*
 * Resume iteration at the bucket/offset remembered in the iterator state
 * instead of rescanning from the beginning. Depending on st->state the
 * listening or established table is re-entered at st->bucket (if still in
 * range) and st->offset entries are skipped. st->num is saved on entry in
 * orig_num; presumably it is restored afterwards - TODO confirm.
 */
2413 static void *tcp_seek_last_pos(struct seq_file *seq)
2415 struct tcp_iter_state *st = seq->private;
2416 int offset = st->offset;
2417 int orig_num = st->num;
2420 switch (st->state) {
2421 case TCP_SEQ_STATE_LISTENING:
2422 if (st->bucket >= INET_LHTABLE_SIZE)
2424 st->state = TCP_SEQ_STATE_LISTENING;
2425 rc = listening_get_next(seq, NULL);
2426 while (offset-- && rc)
2427 rc = listening_get_next(seq, rc);
2431 st->state = TCP_SEQ_STATE_ESTABLISHED;
2433 case TCP_SEQ_STATE_ESTABLISHED:
2434 if (st->bucket > tcp_hashinfo.ehash_mask)
2436 rc = established_get_first(seq);
2437 while (offset-- && rc)
2438 rc = established_get_next(seq, rc);
/*
 * seq_file .start callback. When *@pos equals the last position served, the
 * iterator tries to resume with tcp_seek_last_pos(); otherwise it restarts
 * in the listening state, returning SEQ_START_TOKEN for position 0 or the
 * entry at index *@pos - 1. The served position is remembered in
 * st->last_pos.
 */
2446 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2448 struct tcp_iter_state *st = seq->private;
2451 if (*pos && *pos == st->last_pos) {
2452 rc = tcp_seek_last_pos(seq);
2457 st->state = TCP_SEQ_STATE_LISTENING;
2461 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2464 st->last_pos = *pos;
2467 EXPORT_SYMBOL(tcp_seq_start);
/*
 * seq_file .next callback. After the start token the first entry is fetched
 * with tcp_get_idx(seq, 0). Otherwise the iterator advances within the
 * current table; the listening branch also contains the switch to the
 * established table (state change followed by established_get_first()).
 * The new position is remembered in st->last_pos.
 */
2469 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2471 struct tcp_iter_state *st = seq->private;
2474 if (v == SEQ_START_TOKEN) {
2475 rc = tcp_get_idx(seq, 0);
2479 switch (st->state) {
2480 case TCP_SEQ_STATE_LISTENING:
2481 rc = listening_get_next(seq, v);
2483 st->state = TCP_SEQ_STATE_ESTABLISHED;
2486 rc = established_get_first(seq);
2489 case TCP_SEQ_STATE_ESTABLISHED:
2490 rc = established_get_next(seq, v);
2495 st->last_pos = *pos;
2498 EXPORT_SYMBOL(tcp_seq_next);
/*
 * seq_file .stop callback: release the bucket lock still held for the
 * current entry - the listening bucket's spinlock (unless @v is the start
 * token) or the established bucket's lock, depending on st->state.
 */
2500 void tcp_seq_stop(struct seq_file *seq, void *v)
2502 struct tcp_iter_state *st = seq->private;
2504 switch (st->state) {
2505 case TCP_SEQ_STATE_LISTENING:
2506 if (v != SEQ_START_TOKEN)
2507 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2509 case TCP_SEQ_STATE_ESTABLISHED:
2511 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2515 EXPORT_SYMBOL(tcp_seq_stop);
/*
 * Print one /proc/net/tcp line for a request sock (connection in
 * TCP_NEW_SYN_RECV). Queue sizes are printed as 0, the timer column is
 * fixed to 1 (only the expire timer) with the remaining time taken from
 * req->rsk_timer, and the uid is that of the listener.
 */
2517 static void get_openreq4(const struct request_sock *req,
2518 struct seq_file *f, int i)
2520 const struct inet_request_sock *ireq = inet_rsk(req);
2521 long delta = req->rsk_timer.expires - jiffies;
2523 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2524 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2529 ntohs(ireq->ir_rmt_port),
2531 0, 0, /* could print option size, but that is af dependent. */
2532 1, /* timers active (only the expire timer) */
2533 jiffies_delta_to_clock_t(delta),
2535 from_kuid_munged(seq_user_ns(f),
2536 sock_i_uid(req->rsk_listener)),
2537 0, /* non standard timer */
2538 0, /* open_requests have no inode */
/*
 * Print one /proc/net/tcp line for a full TCP socket.
 *
 * The timer expiry is chosen by which timer is pending: retransmit, reorder
 * timeout or loss probe; zero-window probe; sk_timer; otherwise "now".
 * For a listener the rx_queue column shows sk_ack_backlog; for other states
 * it is rcv_nxt - copied_seq, clamped at 0 because the socket is not locked.
 * The last column is the fast-open queue limit for listeners, otherwise -1
 * while in initial slow start or snd_ssthresh.
 */
2543 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2546 unsigned long timer_expires;
2547 const struct tcp_sock *tp = tcp_sk(sk);
2548 const struct inet_connection_sock *icsk = inet_csk(sk);
2549 const struct inet_sock *inet = inet_sk(sk);
2550 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2551 __be32 dest = inet->inet_daddr;
2552 __be32 src = inet->inet_rcv_saddr;
2553 __u16 destp = ntohs(inet->inet_dport);
2554 __u16 srcp = ntohs(inet->inet_sport);
2558 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2559 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2560 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2562 timer_expires = icsk->icsk_timeout;
2563 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2565 timer_expires = icsk->icsk_timeout;
2566 } else if (timer_pending(&sk->sk_timer)) {
2568 timer_expires = sk->sk_timer.expires;
2571 timer_expires = jiffies;
2574 state = inet_sk_state_load(sk);
2575 if (state == TCP_LISTEN)
2576 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2578 /* Because we don't lock the socket,
2579 * we might find a transient negative value.
2581 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2582 READ_ONCE(tp->copied_seq), 0);
2584 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2585 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2586 i, src, srcp, dest, destp, state,
2587 READ_ONCE(tp->write_seq) - tp->snd_una,
2590 jiffies_delta_to_clock_t(timer_expires - jiffies),
2591 icsk->icsk_retransmits,
2592 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2593 icsk->icsk_probes_out,
2595 refcount_read(&sk->sk_refcnt), sk,
2596 jiffies_to_clock_t(icsk->icsk_rto),
2597 jiffies_to_clock_t(icsk->icsk_ack.ato),
2598 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2600 state == TCP_LISTEN ?
2601 fastopenq->max_qlen :
2602 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
/*
 * Print one /proc/net/tcp line for a TIME_WAIT socket. The state column is
 * tw_substate, the timer type is fixed to 3 with the remaining time taken
 * from tw_timer, and the queue, retransmit, uid, timeout and inode columns
 * are printed as 0.
 */
2605 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2606 struct seq_file *f, int i)
2608 long delta = tw->tw_timer.expires - jiffies;
2612 dest = tw->tw_daddr;
2613 src = tw->tw_rcv_saddr;
2614 destp = ntohs(tw->tw_dport);
2615 srcp = ntohs(tw->tw_sport);
2617 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2618 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2619 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2620 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2621 refcount_read(&tw->tw_refcnt), tw);
/*
 * seq_file .show callback for /proc/net/tcp. The start token prints the
 * column header; any other entry is dispatched on sk_state to the
 * TIME_WAIT, request-sock or full-socket formatter. Each line is padded to
 * TMPSZ - 1 characters via seq_setwidth().
 */
2626 static int tcp4_seq_show(struct seq_file *seq, void *v)
2628 struct tcp_iter_state *st;
2629 struct sock *sk = v;
2631 seq_setwidth(seq, TMPSZ - 1);
2632 if (v == SEQ_START_TOKEN) {
2633 seq_puts(seq, " sl local_address rem_address st tx_queue "
2634 "rx_queue tr tm->when retrnsmt uid timeout "
2640 if (sk->sk_state == TCP_TIME_WAIT)
2641 get_timewait4_sock(v, seq, st->num);
2642 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2643 get_openreq4(v, seq, st->num);
2645 get_tcp4_sock(v, seq, st->num);
2651 #ifdef CONFIG_BPF_SYSCALL
/*
 * Context handed to a BPF TCP iterator program: iterator metadata, the
 * socket being visited and the uid computed for it.
 */
2652 struct bpf_iter__tcp {
2653 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2654 __bpf_md_ptr(struct sock_common *, sk_common);
2655 uid_t uid __aligned(8);
/*
 * Build the iterator context for @sk_common and run @prog on it. The
 * sequence number in @meta is decremented first so that the start token is
 * not counted. Returns the result of bpf_iter_run_prog().
 */
2658 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2659 struct sock_common *sk_common, uid_t uid)
2661 struct bpf_iter__tcp ctx;
2663 meta->seq_num--; /* skip SEQ_START_TOKEN */
2665 ctx.sk_common = sk_common;
2667 return bpf_iter_run_prog(prog, &ctx);
/*
 * seq_file .show callback for the BPF TCP iterator. The uid passed to the
 * program is the listener's uid for request socks and the socket's own uid
 * for full sockets; the TIME_WAIT branch body and the start-token handling
 * are not visible in this excerpt.
 */
2670 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2672 struct bpf_iter_meta meta;
2673 struct bpf_prog *prog;
2674 struct sock *sk = v;
2677 if (v == SEQ_START_TOKEN)
2680 if (sk->sk_state == TCP_TIME_WAIT) {
2682 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2683 const struct request_sock *req = v;
2685 uid = from_kuid_munged(seq_user_ns(seq),
2686 sock_i_uid(req->rsk_listener));
2688 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2692 prog = bpf_iter_get_info(&meta, false);
2693 return tcp_prog_seq_show(prog, &meta, v, uid);
/*
 * seq_file .stop callback for the BPF TCP iterator. The program is fetched
 * with the in_stop flag set and may be run one more time (result ignored)
 * before the common tcp_seq_stop() releases the bucket lock. The conditions
 * guarding the final program run are not visible in this excerpt.
 */
2696 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2698 struct bpf_iter_meta meta;
2699 struct bpf_prog *prog;
2703 prog = bpf_iter_get_info(&meta, true);
2705 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2708 tcp_seq_stop(seq, v);
/*
 * seq_file operations used by the BPF tcp iterator (referenced from
 * tcp_seq_info).  start/next are shared with tcp4_seq_ops; show and stop
 * are the BPF-specific wrappers.
 */
2711 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2712 .show = bpf_iter_tcp_seq_show,
2713 .start = tcp_seq_start,
2714 .next = tcp_seq_next,
2715 .stop = bpf_iter_tcp_seq_stop,
/*
 * seq_file operations behind the "tcp" proc entry created in
 * tcp4_proc_init_net(): generic tcp_seq_start/next/stop traversal with the
 * IPv4 formatter tcp4_seq_show().
 */
2719 static const struct seq_operations tcp4_seq_ops = {
2720 .show = tcp4_seq_show,
2721 .start = tcp_seq_start,
2722 .next = tcp_seq_next,
2723 .stop = tcp_seq_stop,
/*
 * Address-family descriptor passed as the data argument of
 * proc_create_net_data() when the "tcp" proc entry is created.
 */
2726 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
/*
 * Per-namespace init hook of tcp4_net_ops: creates the read-only (0444)
 * "tcp" entry under net->proc_net, backed by tcp4_seq_ops, with a private
 * area of sizeof(struct tcp_iter_state) and tcp4_seq_afinfo as its data.
 * A NULL result from proc_create_net_data() is treated as failure.
 */
2730 static int __net_init tcp4_proc_init_net(struct net *net)
2732 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2733 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
/*
 * Per-namespace exit hook of tcp4_net_ops: removes the "tcp" entry from
 * net->proc_net.
 */
2738 static void __net_exit tcp4_proc_exit_net(struct net *net)
2740 remove_proc_entry("tcp", net->proc_net);
/*
 * Per-network-namespace hooks for the "tcp" proc entry; registered by
 * tcp4_proc_init() and unregistered by tcp4_proc_exit().
 */
2743 static struct pernet_operations tcp4_net_ops = {
2744 .init = tcp4_proc_init_net,
2745 .exit = tcp4_proc_exit_net,
/*
 * Register tcp4_net_ops as a pernet subsystem.
 * Returns the result of register_pernet_subsys().
 */
2748 int __init tcp4_proc_init(void)
2750 return register_pernet_subsys(&tcp4_net_ops);
/* Undo tcp4_proc_init(): unregister the tcp4_net_ops pernet subsystem. */
2753 void tcp4_proc_exit(void)
2755 unregister_pernet_subsys(&tcp4_net_ops);
2757 #endif /* CONFIG_PROC_FS */
/*
 * Protocol descriptor for TCP sockets in this (IPv4) file.  It binds the
 * address-family specific handlers (tcp_v4_*) to the shared tcp_* and
 * inet_* implementations and points at TCP's global accounting objects.
 * Exported with EXPORT_SYMBOL() below.
 */
2759 struct proto tcp_prot = {
2761 .owner = THIS_MODULE,
/* Connection setup and teardown. */
2763 .pre_connect = tcp_v4_pre_connect,
2764 .connect = tcp_v4_connect,
2765 .disconnect = tcp_disconnect,
2766 .accept = inet_csk_accept,
2768 .init = tcp_v4_init_sock,
2769 .destroy = tcp_v4_destroy_sock,
2770 .shutdown = tcp_shutdown,
/* Socket options and data transfer. */
2771 .setsockopt = tcp_setsockopt,
2772 .getsockopt = tcp_getsockopt,
2773 .keepalive = tcp_set_keepalive,
2774 .recvmsg = tcp_recvmsg,
2775 .sendmsg = tcp_sendmsg,
2776 .sendpage = tcp_sendpage,
2777 .backlog_rcv = tcp_v4_do_rcv,
2778 .release_cb = tcp_release_cb,
/* Hash-table and port management. */
2780 .unhash = inet_unhash,
2781 .get_port = inet_csk_get_port,
/* Memory-pressure callbacks and pointers to the shared TCP counters. */
2782 .enter_memory_pressure = tcp_enter_memory_pressure,
2783 .leave_memory_pressure = tcp_leave_memory_pressure,
2784 .stream_memory_free = tcp_stream_memory_free,
2785 .sockets_allocated = &tcp_sockets_allocated,
2786 .orphan_count = &tcp_orphan_count,
2787 .memory_allocated = &tcp_memory_allocated,
2788 .memory_pressure = &tcp_memory_pressure,
2789 .sysctl_mem = sysctl_tcp_mem,
/* wmem/rmem limits are located by their offset inside struct net. */
2790 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2791 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
/* Object sizing and slab behaviour for struct tcp_sock. */
2792 .max_header = MAX_TCP_HEADER,
2793 .obj_size = sizeof(struct tcp_sock),
2794 .slab_flags = SLAB_TYPESAFE_BY_RCU,
/* Companion ops for timewait and request sockets, and the TCP hash tables. */
2795 .twsk_prot = &tcp_timewait_sock_ops,
2796 .rsk_prot = &tcp_request_sock_ops,
2797 .h.hashinfo = &tcp_hashinfo,
2798 .no_autobind = true,
2799 .diag_destroy = tcp_abort,
2801 EXPORT_SYMBOL(tcp_prot);
/*
 * Per-namespace exit hook of tcp_sk_ops.  Releases, in order: the module
 * reference held on the namespace's congestion-control ops (only when the
 * pointer is set), every possible CPU's control socket, and the per-CPU
 * pointer array itself.
 */
2803 static void __net_exit tcp_sk_exit(struct net *net)
/* The pointer may be unset, hence the check before bpf_module_put(). */
2807 if (net->ipv4.tcp_congestion_control)
2808 bpf_module_put(net->ipv4.tcp_congestion_control,
2809 net->ipv4.tcp_congestion_control->owner);
2811 for_each_possible_cpu(cpu)
2812 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2813 free_percpu(net->ipv4.tcp_sk);
/*
 * Per-namespace init hook of tcp_sk_ops.
 *
 * Three phases are visible:
 *  1. allocate net->ipv4.tcp_sk, a per-CPU array of struct sock pointers,
 *     and create one PF_INET/SOCK_RAW control socket for every possible
 *     CPU;
 *  2. load the namespace's TCP sysctl fields with their default values
 *     (tcp_rmem/tcp_wmem are copied from init_net for every other
 *     namespace);
 *  3. choose the namespace's congestion control: init_net's current choice
 *     when a module reference can be taken from a non-init namespace, with
 *     tcp_reno as the other assignment.
 */
2816 static int __net_init tcp_sk_init(struct net *net)
/* Array of per-CPU control-socket pointers; a NULL result is checked. */
2820 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2821 if (!net->ipv4.tcp_sk)
/* One control socket per possible CPU, stored in that CPU's slot. */
2824 for_each_possible_cpu(cpu) {
2827 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2831 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2833 /* Please enforce IP_DF and IPID==0 for RST and
2834 * ACK sent in SYN-RECV and TIME-WAIT state.
2836 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2838 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2841 net->ipv4.sysctl_tcp_ecn = 2;
2842 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2844 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2845 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2846 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2847 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2848 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2850 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2851 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2852 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2854 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2855 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2856 net->ipv4.sysctl_tcp_syncookies = 1;
2857 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2858 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2859 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2860 net->ipv4.sysctl_tcp_orphan_retries = 0;
2861 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2862 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2863 net->ipv4.sysctl_tcp_tw_reuse = 2;
2864 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2866 cnt = tcp_hashinfo.ehash_mask + 1;
2867 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2868 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2870 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2871 net->ipv4.sysctl_tcp_sack = 1;
2872 net->ipv4.sysctl_tcp_window_scaling = 1;
2873 net->ipv4.sysctl_tcp_timestamps = 1;
2874 net->ipv4.sysctl_tcp_early_retrans = 3;
2875 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2876 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2877 net->ipv4.sysctl_tcp_retrans_collapse = 1;
2878 net->ipv4.sysctl_tcp_max_reordering = 300;
2879 net->ipv4.sysctl_tcp_dsack = 1;
2880 net->ipv4.sysctl_tcp_app_win = 31;
2881 net->ipv4.sysctl_tcp_adv_win_scale = 1;
2882 net->ipv4.sysctl_tcp_frto = 2;
2883 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2884 /* This limits the percentage of the congestion window which we
2885 * will allow a single TSO frame to consume. Building TSO frames
2886 * which are too large can cause TCP streams to be bursty.
2888 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2889 /* Default TSQ limit of 16 TSO segments */
2890 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2891 /* rfc5961 challenge ack rate limiting */
2892 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2893 net->ipv4.sysctl_tcp_min_tso_segs = 2;
2894 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2895 net->ipv4.sysctl_tcp_autocorking = 1;
2896 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2897 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2898 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
/*
 * Every namespace other than init_net starts with a copy of init_net's
 * current tcp_rmem and tcp_wmem arrays.
 */
2899 if (net != &init_net) {
2900 memcpy(net->ipv4.sysctl_tcp_rmem,
2901 init_net.ipv4.sysctl_tcp_rmem,
2902 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2903 memcpy(net->ipv4.sysctl_tcp_wmem,
2904 init_net.ipv4.sysctl_tcp_wmem,
2905 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2907 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2908 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2909 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
/* Fast Open: client side enabled, context lock and disable counter reset. */
2910 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2911 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2912 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2913 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
/*
 * A non-init namespace adopts init_net's congestion control only if
 * bpf_try_module_get() succeeds on its owner; the reference taken here is
 * the one tcp_sk_exit() drops with bpf_module_put().
 */
2915 /* Reno is always built in */
2916 if (!net_eq(net, &init_net) &&
2917 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2918 init_net.ipv4.tcp_congestion_control->owner))
2919 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2921 net->ipv4.tcp_congestion_control = &tcp_reno;
/*
 * Batched exit hook of tcp_sk_ops.  Calls inet_twsk_purge() once on
 * tcp_hashinfo for AF_INET, then destroys the Fast Open context of every
 * namespace on net_exit_list.
 */
2930 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2934 inet_twsk_purge(&tcp_hashinfo, AF_INET);
/* Namespaces are linked through their exit_list member. */
2936 list_for_each_entry(net, net_exit_list, exit_list)
2937 tcp_fastopen_ctx_destroy(net);
/*
 * Per-network-namespace hooks for TCP's control sockets and sysctl
 * defaults; registered from tcp_v4_init().
 */
2940 static struct pernet_operations __net_initdata tcp_sk_ops = {
2941 .init = tcp_sk_init,
2942 .exit = tcp_sk_exit,
2943 .exit_batch = tcp_sk_exit_batch,
2946 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/*
 * Declares the "tcp" BPF iterator function.  Its parameter list (meta,
 * sk_common, uid) mirrors the members of struct bpf_iter__tcp.
 */
2947 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2948 struct sock_common *sk_common, uid_t uid)
/*
 * init_seq_private hook of tcp_seq_info.
 *
 * Allocates a tcp_seq_afinfo for this iterator instance, marks it
 * AF_UNSPEC, stores it in the iterator state (st->bpf_seq_afinfo) and then
 * runs the generic bpf_iter_init_seq_net() setup.  The allocation is freed
 * again in bpf_iter_fini_tcp().
 */
2950 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2952 struct tcp_iter_state *st = priv_data;
2953 struct tcp_seq_afinfo *afinfo;
/* __GFP_NOWARN: an allocation failure here should not log a warning. */
2956 afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2960 afinfo->family = AF_UNSPEC;
2961 st->bpf_seq_afinfo = afinfo;
2962 ret = bpf_iter_init_seq_net(priv_data, aux);
/*
 * fini_seq_private hook of tcp_seq_info: frees the tcp_seq_afinfo
 * allocated by bpf_iter_init_tcp() and runs bpf_iter_fini_seq_net().
 */
2968 static void bpf_iter_fini_tcp(void *priv_data)
2970 struct tcp_iter_state *st = priv_data;
2972 kfree(st->bpf_seq_afinfo);
2973 bpf_iter_fini_seq_net(priv_data);
/*
 * seq_file description of the BPF tcp iterator: which seq_operations to
 * use, how to set up and tear down the private state, and how large that
 * state (struct tcp_iter_state) is.  Referenced from tcp_reg_info.
 */
2976 static const struct bpf_iter_seq_info tcp_seq_info = {
2977 .seq_ops = &bpf_iter_tcp_seq_ops,
2978 .init_seq_private = bpf_iter_init_tcp,
2979 .fini_seq_private = bpf_iter_fini_tcp,
2980 .seq_priv_size = sizeof(struct tcp_iter_state),
/*
 * Registration record handed to bpf_iter_reg_target() by
 * bpf_iter_register().  It describes a single context argument, the
 * sk_common member of struct bpf_iter__tcp, typed PTR_TO_BTF_ID_OR_NULL;
 * that entry's btf_id is filled in by bpf_iter_register() before
 * registration, which is why this object is not const.
 */
2983 static struct bpf_iter_reg tcp_reg_info = {
2985 .ctx_arg_info_size = 1,
2987 { offsetof(struct bpf_iter__tcp, sk_common),
2988 PTR_TO_BTF_ID_OR_NULL },
2990 .seq_info = &tcp_seq_info,
/*
 * Register the tcp BPF iterator target.  The BTF id for sock_common is
 * patched into tcp_reg_info first; a non-zero return from
 * bpf_iter_reg_target() is only reported with pr_warn().
 */
2993 static void __init bpf_iter_register(void)
2995 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
2996 if (bpf_iter_reg_target(&tcp_reg_info))
2997 pr_warn("Warning: could not register bpf iterator tcp\n");
/*
 * Boot-time initialisation: registers tcp_sk_ops as a pernet subsystem and
 * panics if that fails, then registers the BPF tcp iterator when both
 * CONFIG_BPF_SYSCALL and CONFIG_PROC_FS are enabled.
 */
3002 void __init tcp_v4_init(void)
3004 if (register_pernet_subsys(&tcp_sk_ops))
3005 panic("Failed to create the TCP control socket.\n");
3007 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3008 bpf_iter_register();