linux-2.6-microblaze.git: net/ipv4/tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
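/* Pick the initial sequence number for a passively opened connection
 * from the incoming SYN's 4-tuple.  The skb's daddr/dest are our local
 * address and port, hence the apparent swap relative to the connect()
 * path.
 */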
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96         return secure_tcp_seq(ip_hdr(skb)->daddr,
97                               ip_hdr(skb)->saddr,
98                               tcp_hdr(skb)->dest,
99                               tcp_hdr(skb)->source);
100 }
101
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109         const struct inet_timewait_sock *tw = inet_twsk(sktw);
110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111         struct tcp_sock *tp = tcp_sk(sk);
112         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113
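        /* sysctl_tcp_tw_reuse: 0 - disabled, 1 - enabled, 2 - enabled for
         * loopback traffic only.  In mode 2, only allow reuse when at
         * least one endpoint of the TIME-WAIT socket is a loopback
         * address (checked below).
         */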
114         if (reuse == 2) {
115                 /* Still does not detect *everything* that goes through
116                  * lo, since we require a loopback src or dst address
117                  * or direct binding to 'lo' interface.
118                  */
119                 bool loopback = false;
120                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121                         loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123                 if (tw->tw_family == AF_INET6) {
124                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128                                 loopback = true;
129                 } else
130 #endif
131                 {
132                         if (ipv4_is_loopback(tw->tw_daddr) ||
133                             ipv4_is_loopback(tw->tw_rcv_saddr))
134                                 loopback = true;
135                 }
136                 if (!loopback)
137                         reuse = 0;
138         }
139
140         /* With PAWS, it is safe from the viewpoint
141            of data integrity. Even without PAWS it is safe provided sequence
142            spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
143
144            Actually, the idea is close to VJ's: the timestamp cache is held
145            not per host but per port pair, and the TW bucket is used as the
146            state holder.
147
148            If the TW bucket has already been destroyed we fall back to VJ's
149            scheme and use the initial timestamp retrieved from the peer table.
150          */
151         if (tcptw->tw_ts_recent_stamp &&
152             (!twp || (reuse && time_after32(ktime_get_seconds(),
153                                             tcptw->tw_ts_recent_stamp)))) {
154                 /* In case of repair and re-using TIME-WAIT sockets we still
155                  * want to be sure that it is safe as above but honor the
156                  * sequence numbers and time stamps set as part of the repair
157                  * process.
158                  *
159                  * Without this check re-using a TIME-WAIT socket with TCP
160                  * repair would accumulate a -1 on the repair assigned
161                  * sequence number. The first time it is reused the sequence
162                  * is -1, the second time -2, etc. This fixes that issue
163                  * without appearing to create any others.
164                  */
165                 if (likely(!tp->repair)) {
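                        /* Start the new connection's sequence space beyond
                         * the old one: snd_nxt plus the largest unscaled
                         * window (65535) plus a small offset, presumably so
                         * stray segments from the old connection cannot land
                         * inside the new window.
                         */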
166                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167
168                         if (!seq)
169                                 seq = 1;
170                         WRITE_ONCE(tp->write_seq, seq);
171                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
172                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173                 }
174                 sock_hold(sktw);
175                 return 1;
176         }
177
178         return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183                               int addr_len)
184 {
185         /* This check is replicated from tcp_v4_connect() and intended to
186          * prevent the BPF program called below from accessing bytes that are
187          * out of the bound specified by the user in addr_len.
188          */
189         if (addr_len < sizeof(struct sockaddr_in))
190                 return -EINVAL;
191
192         sock_owned_by_me(sk);
193
194         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201         struct inet_sock *inet = inet_sk(sk);
202         struct tcp_sock *tp = tcp_sk(sk);
203         __be16 orig_sport, orig_dport;
204         __be32 daddr, nexthop;
205         struct flowi4 *fl4;
206         struct rtable *rt;
207         int err;
208         struct ip_options_rcu *inet_opt;
209         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210
211         if (addr_len < sizeof(struct sockaddr_in))
212                 return -EINVAL;
213
214         if (usin->sin_family != AF_INET)
215                 return -EAFNOSUPPORT;
216
217         nexthop = daddr = usin->sin_addr.s_addr;
218         inet_opt = rcu_dereference_protected(inet->inet_opt,
219                                              lockdep_sock_is_held(sk));
220         if (inet_opt && inet_opt->opt.srr) {
221                 if (!daddr)
222                         return -EINVAL;
223                 nexthop = inet_opt->opt.faddr;
224         }
225
226         orig_sport = inet->inet_sport;
227         orig_dport = usin->sin_port;
228         fl4 = &inet->cork.fl.u.ip4;
229         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231                               IPPROTO_TCP,
232                               orig_sport, orig_dport, sk);
233         if (IS_ERR(rt)) {
234                 err = PTR_ERR(rt);
235                 if (err == -ENETUNREACH)
236                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237                 return err;
238         }
239
240         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241                 ip_rt_put(rt);
242                 return -ENETUNREACH;
243         }
244
245         if (!inet_opt || !inet_opt->opt.srr)
246                 daddr = fl4->daddr;
247
248         if (!inet->inet_saddr)
249                 inet->inet_saddr = fl4->saddr;
250         sk_rcv_saddr_set(sk, inet->inet_saddr);
251
252         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253                 /* Reset inherited state */
254                 tp->rx_opt.ts_recent       = 0;
255                 tp->rx_opt.ts_recent_stamp = 0;
256                 if (likely(!tp->repair))
257                         WRITE_ONCE(tp->write_seq, 0);
258         }
259
260         inet->inet_dport = usin->sin_port;
261         sk_daddr_set(sk, daddr);
262
263         inet_csk(sk)->icsk_ext_hdr_len = 0;
264         if (inet_opt)
265                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266
267         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268
269         /* Socket identity is still unknown (sport may be zero).
270          * However, we set the state to SYN-SENT and, without releasing the
271          * socket lock, select a source port, enter ourselves into the hash
272          * tables and complete initialization after this.
273          */
274         tcp_set_state(sk, TCP_SYN_SENT);
275         err = inet_hash_connect(tcp_death_row, sk);
276         if (err)
277                 goto failure;
278
279         sk_set_txhash(sk);
280
281         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282                                inet->inet_sport, inet->inet_dport, sk);
283         if (IS_ERR(rt)) {
284                 err = PTR_ERR(rt);
285                 rt = NULL;
286                 goto failure;
287         }
288         /* OK, now commit destination to socket.  */
289         sk->sk_gso_type = SKB_GSO_TCPV4;
290         sk_setup_caps(sk, &rt->dst);
291         rt = NULL;
292
293         if (likely(!tp->repair)) {
294                 if (!tp->write_seq)
295                         WRITE_ONCE(tp->write_seq,
296                                    secure_tcp_seq(inet->inet_saddr,
297                                                   inet->inet_daddr,
298                                                   inet->inet_sport,
299                                                   usin->sin_port));
300                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301                                                  inet->inet_saddr,
302                                                  inet->inet_daddr);
303         }
304
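        /* Seed this connection's IP ID counter with a random value. */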
305         inet->inet_id = prandom_u32();
306
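        /* With TCP_FASTOPEN_CONNECT the actual SYN is deferred until
         * sendmsg() supplies data; in that case we return here with the
         * socket left in SYN-SENT.
         */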
307         if (tcp_fastopen_defer_connect(sk, &err))
308                 return err;
309         if (err)
310                 goto failure;
311
312         err = tcp_connect(sk);
313
314         if (err)
315                 goto failure;
316
317         return 0;
318
319 failure:
320         /*
321          * This unhashes the socket and releases the local port,
322          * if necessary.
323          */
324         tcp_set_state(sk, TCP_CLOSE);
325         ip_rt_put(rt);
326         sk->sk_route_caps = 0;
327         inet->inet_dport = 0;
328         return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
331
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334  * It can be called through tcp_release_cb() if the socket was owned by the user
335  * at the time tcp_v4_err() was called to handle the ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339         struct inet_sock *inet = inet_sk(sk);
340         struct dst_entry *dst;
341         u32 mtu;
342
343         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344                 return;
345         mtu = tcp_sk(sk)->mtu_info;
346         dst = inet_csk_update_pmtu(sk, mtu);
347         if (!dst)
348                 return;
349
350         /* Something is about to go wrong... Remember the soft error
351          * in case this connection is not able to recover.
352          */
353         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354                 sk->sk_err_soft = EMSGSIZE;
355
356         mtu = dst_mtu(dst);
357
358         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359             ip_sk_accept_pmtu(sk) &&
360             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361                 tcp_sync_mss(sk, mtu);
362
363                 /* Resend the TCP packet because it's
364                  * clear that the old packet has been
365                  * dropped. This is the new "fast" path mtu
366                  * discovery.
367                  */
368                 tcp_simple_retransmit(sk);
369         } /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372
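/* Handle an ICMP redirect by updating the cached route via its
 * ->redirect() handler.  __sk_dst_check() returns NULL if the cached
 * dst is already obsolete, in which case there is nothing to update.
 */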
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375         struct dst_entry *dst = __sk_dst_check(sk, 0);
376
377         if (dst)
378                 dst->ops->redirect(dst, sk, skb);
379 }
380
381
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385         struct request_sock *req = inet_reqsk(sk);
386         struct net *net = sock_net(sk);
387
388         /* ICMPs are not backlogged, hence we cannot get
389          * an established socket here.
390          */
391         if (seq != tcp_rsk(req)->snt_isn) {
392                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393         } else if (abort) {
394                 /*
395                  * Still in SYN_RECV, just remove it silently.
396                  * There is no good way to pass the error to the newly
397                  * created socket, and POSIX does not want network
398                  * errors returned from accept().
399                  */
400                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401                 tcp_listendrop(req->rsk_listener);
402         }
403         reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410         struct inet_connection_sock *icsk = inet_csk(sk);
411         struct tcp_sock *tp = tcp_sk(sk);
412         struct sk_buff *skb;
413         s32 remaining;
414         u32 delta_us;
415
416         if (sock_owned_by_user(sk))
417                 return;
418
419         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420             !icsk->icsk_backoff)
421                 return;
422
423         skb = tcp_rtx_queue_head(sk);
424         if (WARN_ON_ONCE(!skb))
425                 return;
426
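        /* The ICMP message suggests the earlier timeout was spurious
         * (RFC 6069): undo one step of exponential backoff and recompute
         * the RTO from the current SRTT.
         */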
427         icsk->icsk_backoff--;
428         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430
431         tcp_mstamp_refresh(tp);
432         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434
435         if (remaining > 0) {
436                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437                                           remaining, TCP_RTO_MAX);
438         } else {
439                 /* RTO revert clocked out retransmission.
440                  * Will retransmit now.
441                  */
442                 tcp_retransmit_timer(sk);
443         }
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment the
452  * header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else is accessing the socket the ICMP is simply dropped,
457  * and for some paths there is no check at all.
458  * A more general error queue for queueing errors for later handling
459  * would probably be better.
460  *
461  */
462
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465         const struct iphdr *iph = (const struct iphdr *)skb->data;
466         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467         struct tcp_sock *tp;
468         struct inet_sock *inet;
469         const int type = icmp_hdr(skb)->type;
470         const int code = icmp_hdr(skb)->code;
471         struct sock *sk;
472         struct request_sock *fastopen;
473         u32 seq, snd_una;
474         int err;
475         struct net *net = dev_net(skb->dev);
476
477         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478                                        th->dest, iph->saddr, ntohs(th->source),
479                                        inet_iif(skb), 0);
480         if (!sk) {
481                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482                 return -ENOENT;
483         }
484         if (sk->sk_state == TCP_TIME_WAIT) {
485                 inet_twsk_put(inet_twsk(sk));
486                 return 0;
487         }
488         seq = ntohl(th->seq);
489         if (sk->sk_state == TCP_NEW_SYN_RECV) {
490                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491                                      type == ICMP_TIME_EXCEEDED ||
492                                      (type == ICMP_DEST_UNREACH &&
493                                       (code == ICMP_NET_UNREACH ||
494                                        code == ICMP_HOST_UNREACH)));
495                 return 0;
496         }
497
498         bh_lock_sock(sk);
499         /* If too many ICMPs get dropped on busy
500          * servers this needs to be solved differently.
501          * We do take care of the PMTU discovery (RFC1191) special case:
502          * we can receive locally generated ICMP messages while the socket is held.
503          */
504         if (sock_owned_by_user(sk)) {
505                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507         }
508         if (sk->sk_state == TCP_CLOSE)
509                 goto out;
510
511         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513                 goto out;
514         }
515
516         tp = tcp_sk(sk);
517         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
518         fastopen = rcu_dereference(tp->fastopen_rsk);
519         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520         if (sk->sk_state != TCP_LISTEN &&
521             !between(seq, snd_una, tp->snd_nxt)) {
522                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523                 goto out;
524         }
525
526         switch (type) {
527         case ICMP_REDIRECT:
528                 if (!sock_owned_by_user(sk))
529                         do_redirect(skb, sk);
530                 goto out;
531         case ICMP_SOURCE_QUENCH:
532                 /* Just silently ignore these. */
533                 goto out;
534         case ICMP_PARAMETERPROB:
535                 err = EPROTO;
536                 break;
537         case ICMP_DEST_UNREACH:
538                 if (code > NR_ICMP_UNREACH)
539                         goto out;
540
541                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542                         /* We are not interested in TCP_LISTEN and open_requests
543                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
544                          * they should go through unfragmented).
545                          */
546                         if (sk->sk_state == TCP_LISTEN)
547                                 goto out;
548
549                         tp->mtu_info = info;
550                         if (!sock_owned_by_user(sk)) {
551                                 tcp_v4_mtu_reduced(sk);
552                         } else {
553                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554                                         sock_hold(sk);
555                         }
556                         goto out;
557                 }
558
559                 err = icmp_err_convert[code].errno;
560                 /* check if this ICMP message allows revert of backoff.
561                  * (see RFC 6069)
562                  */
563                 if (!fastopen &&
564                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565                         tcp_ld_RTO_revert(sk, seq);
566                 break;
567         case ICMP_TIME_EXCEEDED:
568                 err = EHOSTUNREACH;
569                 break;
570         default:
571                 goto out;
572         }
573
574         switch (sk->sk_state) {
575         case TCP_SYN_SENT:
576         case TCP_SYN_RECV:
577                 /* Only in fast or simultaneous open. If a fast open socket is
578                  * already accepted it is treated as a connected one below.
579                  */
580                 if (fastopen && !fastopen->sk)
581                         break;
582
583                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584
585                 if (!sock_owned_by_user(sk)) {
586                         sk->sk_err = err;
587
588                         sk->sk_error_report(sk);
589
590                         tcp_done(sk);
591                 } else {
592                         sk->sk_err_soft = err;
593                 }
594                 goto out;
595         }
596
597         /* If we've already connected we will keep trying
598          * until we time out, or the user gives up.
599          *
600          * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
601          * considered hard errors (well, FRAG_FAILED too,
602          * but it is obsoleted by pmtu discovery).
603          *
604          * Note that in the modern internet, where routing is unreliable
605          * and broken firewalls sit in every dark corner sending random
606          * errors ordered by their masters, even these two messages finally
607          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
608          *
609          * Now we are in compliance with RFCs.
610          *                                                      --ANK (980905)
611          */
612
613         inet = inet_sk(sk);
614         if (!sock_owned_by_user(sk) && inet->recverr) {
615                 sk->sk_err = err;
616                 sk->sk_error_report(sk);
617         } else  { /* Only an error on timeout */
618                 sk->sk_err_soft = err;
619         }
620
621 out:
622         bh_unlock_sock(sk);
623         sock_put(sk);
624         return 0;
625 }
626
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 {
629         struct tcphdr *th = tcp_hdr(skb);
630
631         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632         skb->csum_start = skb_transport_header(skb) - skb->head;
633         skb->csum_offset = offsetof(struct tcphdr, check);
634 }
635
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 {
639         const struct inet_sock *inet = inet_sk(sk);
640
641         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 }
643 EXPORT_SYMBOL(tcp_v4_send_check);
644
645 /*
646  *      This routine will send an RST to the other tcp.
647  *
648  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
649  *                    for the reset.
650  *      Answer: if a packet caused the RST, it is not for a socket
651  *              existing in our system; if it matched a socket,
652  *              it is just a duplicate segment or a bug in the other side's TCP.
653  *              So we build the reply based only on the parameters
654  *              that arrived with the segment.
655  *      Exception: precedence violation. We do not implement it in any case.
656  */
657
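/* Room for options in the RST reply: an MD5 signature option when
 * CONFIG_TCP_MD5SIG is enabled, otherwise a single 32-bit word, which
 * is enough for the MPTCP reset option written below.
 */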
658 #ifdef CONFIG_TCP_MD5SIG
659 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
660 #else
661 #define OPTION_BYTES sizeof(__be32)
662 #endif
663
664 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
665 {
666         const struct tcphdr *th = tcp_hdr(skb);
667         struct {
668                 struct tcphdr th;
669                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
670         } rep;
671         struct ip_reply_arg arg;
672 #ifdef CONFIG_TCP_MD5SIG
673         struct tcp_md5sig_key *key = NULL;
674         const __u8 *hash_location = NULL;
675         unsigned char newhash[16];
676         int genhash;
677         struct sock *sk1 = NULL;
678 #endif
679         u64 transmit_time = 0;
680         struct sock *ctl_sk;
681         struct net *net;
682
683         /* Never send a reset in response to a reset. */
684         if (th->rst)
685                 return;
686
687         /* If sk is not NULL, it means we did a successful lookup and the
688          * incoming route had to be correct. prequeue might have dropped our dst.
689          */
690         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
691                 return;
692
693         /* Swap the send and the receive. */
694         memset(&rep, 0, sizeof(rep));
695         rep.th.dest   = th->source;
696         rep.th.source = th->dest;
697         rep.th.doff   = sizeof(struct tcphdr) / 4;
698         rep.th.rst    = 1;
699
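        /* RFC 793 reset generation: if the offending segment carried an
         * ACK, the RST takes its sequence number from that ACK and needs
         * no ACK of its own; otherwise send SEQ=0 with an ACK covering
         * everything the incoming segment occupied.
         */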
700         if (th->ack) {
701                 rep.th.seq = th->ack_seq;
702         } else {
703                 rep.th.ack = 1;
704                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
705                                        skb->len - (th->doff << 2));
706         }
707
708         memset(&arg, 0, sizeof(arg));
709         arg.iov[0].iov_base = (unsigned char *)&rep;
710         arg.iov[0].iov_len  = sizeof(rep.th);
711
712         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
713 #ifdef CONFIG_TCP_MD5SIG
714         rcu_read_lock();
715         hash_location = tcp_parse_md5sig_option(th);
716         if (sk && sk_fullsock(sk)) {
717                 const union tcp_md5_addr *addr;
718                 int l3index;
719
720                 /* If sdif is set, the packet ingressed via a device
721                  * in an L3 domain and inet_iif is set to it.
722                  */
723                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
724                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
725                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
726         } else if (hash_location) {
727                 const union tcp_md5_addr *addr;
728                 int sdif = tcp_v4_sdif(skb);
729                 int dif = inet_iif(skb);
730                 int l3index;
731
732                 /*
733                  * The active side is lost. Try to find the listening socket
734                  * through the source port, and then find the md5 key through
735                  * the listening socket. We do not loosen security here:
736                  * the incoming packet is checked against the md5 hash of the
737                  * found key, and no RST is generated if the hash doesn't match.
738                  */
739                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
740                                              ip_hdr(skb)->saddr,
741                                              th->source, ip_hdr(skb)->daddr,
742                                              ntohs(th->source), dif, sdif);
743                 /* don't send an rst if we can't find a key */
744                 if (!sk1)
745                         goto out;
746
747                 /* If sdif is set, the packet ingressed via a device
748                  * in an L3 domain and dif is set to it.
749                  */
750                 l3index = sdif ? dif : 0;
751                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
752                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
753                 if (!key)
754                         goto out;
755
756
757                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
758                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
759                         goto out;
760
761         }
762
763         if (key) {
764                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
765                                    (TCPOPT_NOP << 16) |
766                                    (TCPOPT_MD5SIG << 8) |
767                                    TCPOLEN_MD5SIG);
768                 /* Update length and the length the header thinks exists */
769                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
770                 rep.th.doff = arg.iov[0].iov_len / 4;
771
772                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
773                                      key, ip_hdr(skb)->saddr,
774                                      ip_hdr(skb)->daddr, &rep.th);
775         }
776 #endif
777         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
778         if (rep.opt[0] == 0) {
779                 __be32 mrst = mptcp_reset_option(skb);
780
781                 if (mrst) {
782                         rep.opt[0] = mrst;
783                         arg.iov[0].iov_len += sizeof(mrst);
784                         rep.th.doff = arg.iov[0].iov_len / 4;
785                 }
786         }
787
788         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
789                                       ip_hdr(skb)->saddr, /* XXX */
790                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
791         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
792         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
793
794         /* When the socket is gone, all binding information is lost, and
795          * routing might fail in this case. No choice here: if we choose to force
796          * the input interface, we will misroute in case of an asymmetric route.
797          */
798         if (sk) {
799                 arg.bound_dev_if = sk->sk_bound_dev_if;
800                 if (sk_fullsock(sk))
801                         trace_tcp_send_reset(sk, skb);
802         }
803
804         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
805                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
806
807         arg.tos = ip_hdr(skb)->tos;
808         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
809         local_bh_disable();
810         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
811         if (sk) {
812                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
813                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
814                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
815                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
816                 transmit_time = tcp_transmit_time(sk);
817         }
818         ip_send_unicast_reply(ctl_sk,
819                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
820                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
821                               &arg, arg.iov[0].iov_len,
822                               transmit_time);
823
824         ctl_sk->sk_mark = 0;
825         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
826         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
827         local_bh_enable();
828
829 #ifdef CONFIG_TCP_MD5SIG
830 out:
831         rcu_read_unlock();
832 #endif
833 }
834
835 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
836    outside socket context, is certainly ugly. What can I do?
837  */
838
839 static void tcp_v4_send_ack(const struct sock *sk,
840                             struct sk_buff *skb, u32 seq, u32 ack,
841                             u32 win, u32 tsval, u32 tsecr, int oif,
842                             struct tcp_md5sig_key *key,
843                             int reply_flags, u8 tos)
844 {
845         const struct tcphdr *th = tcp_hdr(skb);
846         struct {
847                 struct tcphdr th;
848                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
849 #ifdef CONFIG_TCP_MD5SIG
850                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
851 #endif
852                         ];
853         } rep;
854         struct net *net = sock_net(sk);
855         struct ip_reply_arg arg;
856         struct sock *ctl_sk;
857         u64 transmit_time;
858
859         memset(&rep.th, 0, sizeof(struct tcphdr));
860         memset(&arg, 0, sizeof(arg));
861
862         arg.iov[0].iov_base = (unsigned char *)&rep;
863         arg.iov[0].iov_len  = sizeof(rep.th);
864         if (tsecr) {
865                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
866                                    (TCPOPT_TIMESTAMP << 8) |
867                                    TCPOLEN_TIMESTAMP);
868                 rep.opt[1] = htonl(tsval);
869                 rep.opt[2] = htonl(tsecr);
870                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
871         }
872
873         /* Swap the send and the receive. */
874         rep.th.dest    = th->source;
875         rep.th.source  = th->dest;
876         rep.th.doff    = arg.iov[0].iov_len / 4;
877         rep.th.seq     = htonl(seq);
878         rep.th.ack_seq = htonl(ack);
879         rep.th.ack     = 1;
880         rep.th.window  = htons(win);
881
882 #ifdef CONFIG_TCP_MD5SIG
883         if (key) {
884                 int offset = (tsecr) ? 3 : 0;
885
886                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
887                                           (TCPOPT_NOP << 16) |
888                                           (TCPOPT_MD5SIG << 8) |
889                                           TCPOLEN_MD5SIG);
890                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
891                 rep.th.doff = arg.iov[0].iov_len/4;
892
893                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
894                                     key, ip_hdr(skb)->saddr,
895                                     ip_hdr(skb)->daddr, &rep.th);
896         }
897 #endif
898         arg.flags = reply_flags;
899         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
900                                       ip_hdr(skb)->saddr, /* XXX */
901                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
902         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
903         if (oif)
904                 arg.bound_dev_if = oif;
905         arg.tos = tos;
906         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
907         local_bh_disable();
908         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
909         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
910                            inet_twsk(sk)->tw_mark : sk->sk_mark;
911         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
912                            inet_twsk(sk)->tw_priority : sk->sk_priority;
913         transmit_time = tcp_transmit_time(sk);
914         ip_send_unicast_reply(ctl_sk,
915                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
916                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
917                               &arg, arg.iov[0].iov_len,
918                               transmit_time);
919
920         ctl_sk->sk_mark = 0;
921         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
922         local_bh_enable();
923 }
924
925 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
926 {
927         struct inet_timewait_sock *tw = inet_twsk(sk);
928         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
929
930         tcp_v4_send_ack(sk, skb,
931                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
932                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
933                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
934                         tcptw->tw_ts_recent,
935                         tw->tw_bound_dev_if,
936                         tcp_twsk_md5_key(tcptw),
937                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
938                         tw->tw_tos
939                         );
940
941         inet_twsk_put(tw);
942 }
943
944 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
945                                   struct request_sock *req)
946 {
947         const union tcp_md5_addr *addr;
948         int l3index;
949
950         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
951          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
952          */
953         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
954                                              tcp_sk(sk)->snd_nxt;
955
956         /* RFC 7323 2.3
957          * The window field (SEG.WND) of every outgoing segment, with the
958          * exception of <SYN> segments, MUST be right-shifted by
959          * Rcv.Wind.Shift bits:
960          */
961         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
962         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
963         tcp_v4_send_ack(sk, skb, seq,
964                         tcp_rsk(req)->rcv_nxt,
965                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
966                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
967                         req->ts_recent,
968                         0,
969                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
970                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
971                         ip_hdr(skb)->tos);
972 }
973
974 /*
975  *      Send a SYN-ACK after having received a SYN.
976  *      This still operates on a request_sock only, not on a big
977  *      socket.
978  */
979 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
980                               struct flowi *fl,
981                               struct request_sock *req,
982                               struct tcp_fastopen_cookie *foc,
983                               enum tcp_synack_type synack_type,
984                               struct sk_buff *syn_skb)
985 {
986         const struct inet_request_sock *ireq = inet_rsk(req);
987         struct flowi4 fl4;
988         int err = -1;
989         struct sk_buff *skb;
990         u8 tos;
991
992         /* First, grab a route. */
993         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
994                 return -1;
995
996         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
997
998         if (skb) {
999                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1000
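                /* With sysctl_tcp_reflect_tos, echo the DSCP of the
                 * incoming SYN in the SYN-ACK while keeping our own ECN
                 * bits; otherwise use the listener's TOS unchanged.
                 */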
1001                 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1002                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1003                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1004                                 inet_sk(sk)->tos;
1005
1006                 if (!INET_ECN_is_capable(tos) &&
1007                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1008                         tos |= INET_ECN_ECT_0;
1009
1010                 rcu_read_lock();
1011                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1012                                             ireq->ir_rmt_addr,
1013                                             rcu_dereference(ireq->ireq_opt),
1014                                             tos);
1015                 rcu_read_unlock();
1016                 err = net_xmit_eval(err);
1017         }
1018
1019         return err;
1020 }
1021
1022 /*
1023  *      IPv4 request_sock destructor.
1024  */
1025 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1026 {
1027         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1028 }
1029
1030 #ifdef CONFIG_TCP_MD5SIG
1031 /*
1032  * RFC2385 MD5 checksumming requires a mapping of
1033  * IP address->MD5 Key.
1034  * We need to maintain these in the sk structure.
1035  */
1036
1037 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1038 EXPORT_SYMBOL(tcp_md5_needed);
1039
1040 /* Find the Key structure for an address.  */
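/* Keys may cover address prefixes; among all matching entries the one
 * with the longest prefix wins.  A key bound to an L3 domain (l3index)
 * matches only lookups for that domain, while a key with l3index 0
 * matches any.
 */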
1041 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1042                                            const union tcp_md5_addr *addr,
1043                                            int family)
1044 {
1045         const struct tcp_sock *tp = tcp_sk(sk);
1046         struct tcp_md5sig_key *key;
1047         const struct tcp_md5sig_info *md5sig;
1048         __be32 mask;
1049         struct tcp_md5sig_key *best_match = NULL;
1050         bool match;
1051
1052         /* caller either holds rcu_read_lock() or socket lock */
1053         md5sig = rcu_dereference_check(tp->md5sig_info,
1054                                        lockdep_sock_is_held(sk));
1055         if (!md5sig)
1056                 return NULL;
1057
1058         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1059                                  lockdep_sock_is_held(sk)) {
1060                 if (key->family != family)
1061                         continue;
1062                 if (key->l3index && key->l3index != l3index)
1063                         continue;
1064                 if (family == AF_INET) {
1065                         mask = inet_make_mask(key->prefixlen);
1066                         match = (key->addr.a4.s_addr & mask) ==
1067                                 (addr->a4.s_addr & mask);
1068 #if IS_ENABLED(CONFIG_IPV6)
1069                 } else if (family == AF_INET6) {
1070                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1071                                                   key->prefixlen);
1072 #endif
1073                 } else {
1074                         match = false;
1075                 }
1076
1077                 if (match && (!best_match ||
1078                               key->prefixlen > best_match->prefixlen))
1079                         best_match = key;
1080         }
1081         return best_match;
1082 }
1083 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1084
1085 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1086                                                       const union tcp_md5_addr *addr,
1087                                                       int family, u8 prefixlen,
1088                                                       int l3index)
1089 {
1090         const struct tcp_sock *tp = tcp_sk(sk);
1091         struct tcp_md5sig_key *key;
1092         unsigned int size = sizeof(struct in_addr);
1093         const struct tcp_md5sig_info *md5sig;
1094
1095         /* caller either holds rcu_read_lock() or socket lock */
1096         md5sig = rcu_dereference_check(tp->md5sig_info,
1097                                        lockdep_sock_is_held(sk));
1098         if (!md5sig)
1099                 return NULL;
1100 #if IS_ENABLED(CONFIG_IPV6)
1101         if (family == AF_INET6)
1102                 size = sizeof(struct in6_addr);
1103 #endif
1104         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1105                                  lockdep_sock_is_held(sk)) {
1106                 if (key->family != family)
1107                         continue;
1108                 if (key->l3index && key->l3index != l3index)
1109                         continue;
1110                 if (!memcmp(&key->addr, addr, size) &&
1111                     key->prefixlen == prefixlen)
1112                         return key;
1113         }
1114         return NULL;
1115 }
1116
1117 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1118                                          const struct sock *addr_sk)
1119 {
1120         const union tcp_md5_addr *addr;
1121         int l3index;
1122
1123         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1124                                                  addr_sk->sk_bound_dev_if);
1125         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1126         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1127 }
1128 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1129
1130 /* This can be called on a newly created socket, from other files */
1131 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1132                    int family, u8 prefixlen, int l3index,
1133                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1134 {
1135         /* Add Key to the list */
1136         struct tcp_md5sig_key *key;
1137         struct tcp_sock *tp = tcp_sk(sk);
1138         struct tcp_md5sig_info *md5sig;
1139
1140         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1141         if (key) {
1142                 /* Pre-existing entry - just update that one.
1143                  * Note that the key might be used concurrently.
1144                  * data_race() is telling kcsan that we do not care about
1145                  * key mismatches, since changing the MD5 key on live flows
1146                  * can lead to packet drops.
1147                  */
1148                 data_race(memcpy(key->key, newkey, newkeylen));
1149
1150                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1151                  * Also note that a reader could catch the new key->keylen value
1152                  * but the old key->key[]; this is the reason we use __GFP_ZERO
1153                  * at sock_kmalloc() time below these lines.
1154                  */
1155                 WRITE_ONCE(key->keylen, newkeylen);
1156
1157                 return 0;
1158         }
1159
1160         md5sig = rcu_dereference_protected(tp->md5sig_info,
1161                                            lockdep_sock_is_held(sk));
1162         if (!md5sig) {
1163                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1164                 if (!md5sig)
1165                         return -ENOMEM;
1166
1167                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1168                 INIT_HLIST_HEAD(&md5sig->head);
1169                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1170         }
1171
1172         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1173         if (!key)
1174                 return -ENOMEM;
1175         if (!tcp_alloc_md5sig_pool()) {
1176                 sock_kfree_s(sk, key, sizeof(*key));
1177                 return -ENOMEM;
1178         }
1179
1180         memcpy(key->key, newkey, newkeylen);
1181         key->keylen = newkeylen;
1182         key->family = family;
1183         key->prefixlen = prefixlen;
1184         key->l3index = l3index;
1185         memcpy(&key->addr, addr,
1186                (family == AF_INET6) ? sizeof(struct in6_addr) :
1187                                       sizeof(struct in_addr));
1188         hlist_add_head_rcu(&key->node, &md5sig->head);
1189         return 0;
1190 }
1191 EXPORT_SYMBOL(tcp_md5_do_add);
1192
1193 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1194                    u8 prefixlen, int l3index)
1195 {
1196         struct tcp_md5sig_key *key;
1197
1198         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1199         if (!key)
1200                 return -ENOENT;
1201         hlist_del_rcu(&key->node);
1202         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1203         kfree_rcu(key, rcu);
1204         return 0;
1205 }
1206 EXPORT_SYMBOL(tcp_md5_do_del);
1207
1208 static void tcp_clear_md5_list(struct sock *sk)
1209 {
1210         struct tcp_sock *tp = tcp_sk(sk);
1211         struct tcp_md5sig_key *key;
1212         struct hlist_node *n;
1213         struct tcp_md5sig_info *md5sig;
1214
1215         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1216
1217         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1218                 hlist_del_rcu(&key->node);
1219                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1220                 kfree_rcu(key, rcu);
1221         }
1222 }
1223
1224 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1225                                  sockptr_t optval, int optlen)
1226 {
1227         struct tcp_md5sig cmd;
1228         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1229         const union tcp_md5_addr *addr;
1230         u8 prefixlen = 32;
1231         int l3index = 0;
1232
1233         if (optlen < sizeof(cmd))
1234                 return -EINVAL;
1235
1236         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1237                 return -EFAULT;
1238
1239         if (sin->sin_family != AF_INET)
1240                 return -EINVAL;
1241
1242         if (optname == TCP_MD5SIG_EXT &&
1243             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1244                 prefixlen = cmd.tcpm_prefixlen;
1245                 if (prefixlen > 32)
1246                         return -EINVAL;
1247         }
1248
1249         if (optname == TCP_MD5SIG_EXT &&
1250             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1251                 struct net_device *dev;
1252
1253                 rcu_read_lock();
1254                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1255                 if (dev && netif_is_l3_master(dev))
1256                         l3index = dev->ifindex;
1257
1258                 rcu_read_unlock();
1259
1260                 /* ok to check whether dev was set or not outside of rcu;
1261                  * right now the device MUST be an L3 master
1262                  */
1263                 if (!dev || !l3index)
1264                         return -EINVAL;
1265         }
1266
1267         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1268
1269         if (!cmd.tcpm_keylen)
1270                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1271
1272         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1273                 return -EINVAL;
1274
1275         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1276                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1277 }
1278
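/* Hash the TCP-MD5 (RFC 2385) pseudo-header - addresses, protocol and
 * segment length - followed by a copy of the base TCP header with its
 * checksum field zeroed.
 */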
1279 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1280                                    __be32 daddr, __be32 saddr,
1281                                    const struct tcphdr *th, int nbytes)
1282 {
1283         struct tcp4_pseudohdr *bp;
1284         struct scatterlist sg;
1285         struct tcphdr *_th;
1286
1287         bp = hp->scratch;
1288         bp->saddr = saddr;
1289         bp->daddr = daddr;
1290         bp->pad = 0;
1291         bp->protocol = IPPROTO_TCP;
1292         bp->len = cpu_to_be16(nbytes);
1293
1294         _th = (struct tcphdr *)(bp + 1);
1295         memcpy(_th, th, sizeof(*th));
1296         _th->check = 0;
1297
1298         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1299         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1300                                 sizeof(*bp) + sizeof(*th));
1301         return crypto_ahash_update(hp->md5_req);
1302 }
1303
1304 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1305                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1306 {
1307         struct tcp_md5sig_pool *hp;
1308         struct ahash_request *req;
1309
1310         hp = tcp_get_md5sig_pool();
1311         if (!hp)
1312                 goto clear_hash_noput;
1313         req = hp->md5_req;
1314
1315         if (crypto_ahash_init(req))
1316                 goto clear_hash;
1317         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1318                 goto clear_hash;
1319         if (tcp_md5_hash_key(hp, key))
1320                 goto clear_hash;
1321         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1322         if (crypto_ahash_final(req))
1323                 goto clear_hash;
1324
1325         tcp_put_md5sig_pool();
1326         return 0;
1327
1328 clear_hash:
1329         tcp_put_md5sig_pool();
1330 clear_hash_noput:
1331         memset(md5_hash, 0, 16);
1332         return 1;
1333 }
1334
1335 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1336                         const struct sock *sk,
1337                         const struct sk_buff *skb)
1338 {
1339         struct tcp_md5sig_pool *hp;
1340         struct ahash_request *req;
1341         const struct tcphdr *th = tcp_hdr(skb);
1342         __be32 saddr, daddr;
1343
1344         if (sk) { /* valid for establish/request sockets */
1345                 saddr = sk->sk_rcv_saddr;
1346                 daddr = sk->sk_daddr;
1347         } else {
1348                 const struct iphdr *iph = ip_hdr(skb);
1349                 saddr = iph->saddr;
1350                 daddr = iph->daddr;
1351         }
1352
1353         hp = tcp_get_md5sig_pool();
1354         if (!hp)
1355                 goto clear_hash_noput;
1356         req = hp->md5_req;
1357
1358         if (crypto_ahash_init(req))
1359                 goto clear_hash;
1360
1361         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1362                 goto clear_hash;
1363         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1364                 goto clear_hash;
1365         if (tcp_md5_hash_key(hp, key))
1366                 goto clear_hash;
1367         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1368         if (crypto_ahash_final(req))
1369                 goto clear_hash;
1370
1371         tcp_put_md5sig_pool();
1372         return 0;
1373
1374 clear_hash:
1375         tcp_put_md5sig_pool();
1376 clear_hash_noput:
1377         memset(md5_hash, 0, 16);
1378         return 1;
1379 }
1380 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1381
1382 #endif
1383
1384 /* Called with rcu_read_lock() */
1385 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1386                                     const struct sk_buff *skb,
1387                                     int dif, int sdif)
1388 {
1389 #ifdef CONFIG_TCP_MD5SIG
1390         /*
1391          * This gets called for each TCP segment that arrives
1392          * so we want to be efficient.
1393          * We have 3 drop cases:
1394          * o No MD5 hash and one expected.
1395          * o MD5 hash and we're not expecting one.
1396          * o MD5 hash and it's wrong.
1397          */
1398         const __u8 *hash_location = NULL;
1399         struct tcp_md5sig_key *hash_expected;
1400         const struct iphdr *iph = ip_hdr(skb);
1401         const struct tcphdr *th = tcp_hdr(skb);
1402         const union tcp_md5_addr *addr;
1403         unsigned char newhash[16];
1404         int genhash, l3index;
1405
1406         /* If sdif is set, the packet ingressed via a device
1407          * in an L3 domain and dif is set to the l3mdev index.
1408          */
1409         l3index = sdif ? dif : 0;
1410
1411         addr = (union tcp_md5_addr *)&iph->saddr;
1412         hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1413         hash_location = tcp_parse_md5sig_option(th);
1414
1415         /* We've parsed the options - do we have a hash? */
1416         if (!hash_expected && !hash_location)
1417                 return false;
1418
1419         if (hash_expected && !hash_location) {
1420                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1421                 return true;
1422         }
1423
1424         if (!hash_expected && hash_location) {
1425                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1426                 return true;
1427         }
1428
1429         /* Both hash_expected and hash_location are present,
1430          * so we need to calculate the MD5 hash and compare them.
1431          */
1432         genhash = tcp_v4_md5_hash_skb(newhash,
1433                                       hash_expected,
1434                                       NULL, skb);
1435
1436         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1437                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1438                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1439                                      &iph->saddr, ntohs(th->source),
1440                                      &iph->daddr, ntohs(th->dest),
1441                                      genhash ? " tcp_v4_calc_md5_hash failed"
1442                                      : "", l3index);
1443                 return true;
1444         }
1445         return false;
1446 #endif
1447         return false;
1448 }
1449
1450 static void tcp_v4_init_req(struct request_sock *req,
1451                             const struct sock *sk_listener,
1452                             struct sk_buff *skb)
1453 {
1454         struct inet_request_sock *ireq = inet_rsk(req);
1455         struct net *net = sock_net(sk_listener);
1456
1457         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1458         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1459         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1460 }
1461
1462 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1463                                           struct sk_buff *skb,
1464                                           struct flowi *fl,
1465                                           struct request_sock *req)
1466 {
1467         tcp_v4_init_req(req, sk, skb);
1468
1469         if (security_inet_conn_request(sk, skb, req))
1470                 return NULL;
1471
1472         return inet_csk_route_req(sk, &fl->u.ip4, req);
1473 }
1474
1475 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1476         .family         =       PF_INET,
1477         .obj_size       =       sizeof(struct tcp_request_sock),
1478         .rtx_syn_ack    =       tcp_rtx_synack,
1479         .send_ack       =       tcp_v4_reqsk_send_ack,
1480         .destructor     =       tcp_v4_reqsk_destructor,
1481         .send_reset     =       tcp_v4_send_reset,
1482         .syn_ack_timeout =      tcp_syn_ack_timeout,
1483 };
1484
1485 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1486         .mss_clamp      =       TCP_MSS_DEFAULT,
1487 #ifdef CONFIG_TCP_MD5SIG
1488         .req_md5_lookup =       tcp_v4_md5_lookup,
1489         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1490 #endif
1491 #ifdef CONFIG_SYN_COOKIES
1492         .cookie_init_seq =      cookie_v4_init_sequence,
1493 #endif
1494         .route_req      =       tcp_v4_route_req,
1495         .init_seq       =       tcp_v4_init_seq,
1496         .init_ts_off    =       tcp_v4_init_ts_off,
1497         .send_synack    =       tcp_v4_send_synack,
1498 };
1499
1500 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1501 {
1502         /* Never answer SYNs sent to broadcast or multicast addresses */
1503         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1504                 goto drop;
1505
1506         return tcp_conn_request(&tcp_request_sock_ops,
1507                                 &tcp_request_sock_ipv4_ops, sk, skb);
1508
1509 drop:
1510         tcp_listendrop(sk);
1511         return 0;
1512 }
1513 EXPORT_SYMBOL(tcp_v4_conn_request);
1514
1515
1516 /*
1517  * The three-way handshake has completed - we received a valid final ACK
1518  * for our SYN-ACK - now create the new socket.
1519  */
1520 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1521                                   struct request_sock *req,
1522                                   struct dst_entry *dst,
1523                                   struct request_sock *req_unhash,
1524                                   bool *own_req)
1525 {
1526         struct inet_request_sock *ireq;
1527         bool found_dup_sk = false;
1528         struct inet_sock *newinet;
1529         struct tcp_sock *newtp;
1530         struct sock *newsk;
1531 #ifdef CONFIG_TCP_MD5SIG
1532         const union tcp_md5_addr *addr;
1533         struct tcp_md5sig_key *key;
1534         int l3index;
1535 #endif
1536         struct ip_options_rcu *inet_opt;
1537
1538         if (sk_acceptq_is_full(sk))
1539                 goto exit_overflow;
1540
1541         newsk = tcp_create_openreq_child(sk, req, skb);
1542         if (!newsk)
1543                 goto exit_nonewsk;
1544
1545         newsk->sk_gso_type = SKB_GSO_TCPV4;
1546         inet_sk_rx_dst_set(newsk, skb);
1547
1548         newtp                 = tcp_sk(newsk);
1549         newinet               = inet_sk(newsk);
1550         ireq                  = inet_rsk(req);
1551         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1552         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1553         newsk->sk_bound_dev_if = ireq->ir_iif;
1554         newinet->inet_saddr   = ireq->ir_loc_addr;
1555         inet_opt              = rcu_dereference(ireq->ireq_opt);
1556         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1557         newinet->mc_index     = inet_iif(skb);
1558         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1559         newinet->rcv_tos      = ip_hdr(skb)->tos;
1560         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1561         if (inet_opt)
1562                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1563         newinet->inet_id = prandom_u32();
1564
1565         /* Set ToS of the new socket based upon the value of incoming SYN.
1566          * ECT bits are set later in tcp_init_transfer().
1567          */
1568         if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1569                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1570
1571         if (!dst) {
1572                 dst = inet_csk_route_child_sock(sk, newsk, req);
1573                 if (!dst)
1574                         goto put_and_exit;
1575         } else {
1576                 /* syncookie case : see end of cookie_v4_check() */
1577         }
1578         sk_setup_caps(newsk, dst);
1579
1580         tcp_ca_openreq_child(newsk, dst);
1581
1582         tcp_sync_mss(newsk, dst_mtu(dst));
1583         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1584
1585         tcp_initialize_rcv_mss(newsk);
1586
1587 #ifdef CONFIG_TCP_MD5SIG
1588         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1589         /* Copy over the MD5 key from the original socket */
1590         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1591         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1592         if (key) {
1593                 /*
1594                  * We're using one, so create a matching key
1595                  * on the newsk structure. If we fail to get
1596                  * memory, then we end up not copying the key
1597                  * across. Shucks.
1598                  */
1599                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1600                                key->key, key->keylen, GFP_ATOMIC);
1601                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1602         }
1603 #endif
1604
1605         if (__inet_inherit_port(sk, newsk) < 0)
1606                 goto put_and_exit;
1607         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1608                                        &found_dup_sk);
1609         if (likely(*own_req)) {
1610                 tcp_move_syn(newtp, req);
1611                 ireq->ireq_opt = NULL;
1612         } else {
1613                 newinet->inet_opt = NULL;
1614
1615                 if (!req_unhash && found_dup_sk) {
1616                         /* This code path should only be executed in the
1617                          * syncookie case
1618                          */
1619                         bh_unlock_sock(newsk);
1620                         sock_put(newsk);
1621                         newsk = NULL;
1622                 }
1623         }
1624         return newsk;
1625
1626 exit_overflow:
1627         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1628 exit_nonewsk:
1629         dst_release(dst);
1630 exit:
1631         tcp_listendrop(sk);
1632         return NULL;
1633 put_and_exit:
1634         newinet->inet_opt = NULL;
1635         inet_csk_prepare_forced_close(newsk);
1636         tcp_done(newsk);
1637         goto exit;
1638 }
1639 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
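/*
 * Rough shape of the construction above: the child socket inherits its
 * addresses, bound device, IP options and (optionally) TOS from the
 * request/SYN, gets a route and MSS set up, copies the listener's MD5 key
 * for the peer address if one exists, inherits the local port and is
 * finally hashed into the established table.  Failures after the child
 * has been created go through put_and_exit so the half-initialized socket
 * is force-closed rather than leaked.
 */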
1640
1641 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1642 {
1643 #ifdef CONFIG_SYN_COOKIES
1644         const struct tcphdr *th = tcp_hdr(skb);
1645
1646         if (!th->syn)
1647                 sk = cookie_v4_check(sk, skb);
1648 #endif
1649         return sk;
1650 }
1651
1652 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1653                          struct tcphdr *th, u32 *cookie)
1654 {
1655         u16 mss = 0;
1656 #ifdef CONFIG_SYN_COOKIES
1657         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1658                                     &tcp_request_sock_ipv4_ops, sk, th);
1659         if (mss) {
1660                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1661                 tcp_synq_overflow(sk);
1662         }
1663 #endif
1664         return mss;
1665 }
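/*
 * The helper above returns 0 when a SYN cookie cannot be produced (for
 * example when CONFIG_SYN_COOKIES is off), otherwise the clamped MSS,
 * with *cookie set to the encoded initial sequence number.  It also marks
 * the listener's SYN queue as recently overflowed so that subsequent ACKs
 * are validated against cookies.
 */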
1666
1667 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1668                                                            u32));
1669 /* The socket must have its spinlock held when we get
1670  * here, unless it is a TCP_LISTEN socket.
1671  *
1672  * We have a potential double-lock case here, so even when
1673  * doing backlog processing we use the BH locking scheme.
1674  * This is because we cannot sleep with the original spinlock
1675  * held.
1676  */
1677 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1678 {
1679         struct sock *rsk;
1680
1681         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1682                 struct dst_entry *dst = sk->sk_rx_dst;
1683
1684                 sock_rps_save_rxhash(sk, skb);
1685                 sk_mark_napi_id(sk, skb);
1686                 if (dst) {
1687                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1688                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1689                                              dst, 0)) {
1690                                 dst_release(dst);
1691                                 sk->sk_rx_dst = NULL;
1692                         }
1693                 }
1694                 tcp_rcv_established(sk, skb);
1695                 return 0;
1696         }
1697
1698         if (tcp_checksum_complete(skb))
1699                 goto csum_err;
1700
1701         if (sk->sk_state == TCP_LISTEN) {
1702                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1703
1704                 if (!nsk)
1705                         goto discard;
1706                 if (nsk != sk) {
1707                         if (tcp_child_process(sk, nsk, skb)) {
1708                                 rsk = nsk;
1709                                 goto reset;
1710                         }
1711                         return 0;
1712                 }
1713         } else
1714                 sock_rps_save_rxhash(sk, skb);
1715
1716         if (tcp_rcv_state_process(sk, skb)) {
1717                 rsk = sk;
1718                 goto reset;
1719         }
1720         return 0;
1721
1722 reset:
1723         tcp_v4_send_reset(rsk, skb);
1724 discard:
1725         kfree_skb(skb);
1726         /* Be careful here. If this function gets more complicated and
1727          * gcc suffers from register pressure on the x86, sk (in %ebx)
1728          * might be destroyed here. This current version compiles correctly,
1729          * but you have been warned.
1730          */
1731         return 0;
1732
1733 csum_err:
1734         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1735         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1736         goto discard;
1737 }
1738 EXPORT_SYMBOL(tcp_v4_do_rcv);
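/*
 * Note on the ESTABLISHED fast path above: the cached sk->sk_rx_dst is
 * only trusted while packets keep arriving on the interface recorded in
 * rx_dst_ifindex and the route validity check still succeeds; otherwise
 * the dst is released and typically re-learned from a later packet.
 */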
1739
1740 int tcp_v4_early_demux(struct sk_buff *skb)
1741 {
1742         const struct iphdr *iph;
1743         const struct tcphdr *th;
1744         struct sock *sk;
1745
1746         if (skb->pkt_type != PACKET_HOST)
1747                 return 0;
1748
1749         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1750                 return 0;
1751
1752         iph = ip_hdr(skb);
1753         th = tcp_hdr(skb);
1754
1755         if (th->doff < sizeof(struct tcphdr) / 4)
1756                 return 0;
1757
1758         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1759                                        iph->saddr, th->source,
1760                                        iph->daddr, ntohs(th->dest),
1761                                        skb->skb_iif, inet_sdif(skb));
1762         if (sk) {
1763                 skb->sk = sk;
1764                 skb->destructor = sock_edemux;
1765                 if (sk_fullsock(sk)) {
1766                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1767
1768                         if (dst)
1769                                 dst = dst_check(dst, 0);
1770                         if (dst &&
1771                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1772                                 skb_dst_set_noref(skb, dst);
1773                 }
1774         }
1775         return 0;
1776 }
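/*
 * Early demux (above) runs from the IP receive path before the routing
 * decision: when an established socket is found, the skb is pinned to it
 * and, if the cached rx dst is still valid for the incoming interface,
 * that dst is attached as well so the stack can skip a full route lookup
 * for this packet.
 */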
1777
1778 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1779 {
1780         u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1781         u32 tail_gso_size, tail_gso_segs;
1782         struct skb_shared_info *shinfo;
1783         const struct tcphdr *th;
1784         struct tcphdr *thtail;
1785         struct sk_buff *tail;
1786         unsigned int hdrlen;
1787         bool fragstolen;
1788         u32 gso_segs;
1789         u32 gso_size;
1790         int delta;
1791
1792         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1793          * we can fix skb->truesize to its real value to avoid future drops.
1794          * This is valid because skb is not yet charged to the socket.
1795          * It has been noticed that pure SACK packets were sometimes dropped
1796          * (if cooked by drivers without the copybreak feature).
1797          */
1798         skb_condense(skb);
1799
1800         skb_dst_drop(skb);
1801
1802         if (unlikely(tcp_checksum_complete(skb))) {
1803                 bh_unlock_sock(sk);
1804                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1805                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1806                 return true;
1807         }
1808
1809         /* Attempt coalescing to last skb in backlog, even if we are
1810          * above the limits.
1811          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1812          */
1813         th = (const struct tcphdr *)skb->data;
1814         hdrlen = th->doff * 4;
1815
1816         tail = sk->sk_backlog.tail;
1817         if (!tail)
1818                 goto no_coalesce;
1819         thtail = (struct tcphdr *)tail->data;
1820
1821         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1822             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1823             ((TCP_SKB_CB(tail)->tcp_flags |
1824               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1825             !((TCP_SKB_CB(tail)->tcp_flags &
1826               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1827             ((TCP_SKB_CB(tail)->tcp_flags ^
1828               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1829 #ifdef CONFIG_TLS_DEVICE
1830             tail->decrypted != skb->decrypted ||
1831 #endif
1832             thtail->doff != th->doff ||
1833             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1834                 goto no_coalesce;
1835
1836         __skb_pull(skb, hdrlen);
1837
1838         shinfo = skb_shinfo(skb);
1839         gso_size = shinfo->gso_size ?: skb->len;
1840         gso_segs = shinfo->gso_segs ?: 1;
1841
1842         shinfo = skb_shinfo(tail);
1843         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1844         tail_gso_segs = shinfo->gso_segs ?: 1;
1845
1846         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1847                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1848
1849                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1850                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1851                         thtail->window = th->window;
1852                 }
1853
1854                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1855                  * thtail->fin, so that the fast path in tcp_rcv_established()
1856                  * is not entered if we append a packet with a FIN.
1857                  * SYN, RST, URG are not present.
1858                  * ACK is set on both packets.
1859                  * PSH : we do not really care in TCP stack,
1860                  *       at least for 'GRO' packets.
1861                  */
1862                 thtail->fin |= th->fin;
1863                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1864
1865                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1866                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1867                         tail->tstamp = skb->tstamp;
1868                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1869                 }
1870
1871                 /* Not as strict as GRO. We only need to carry the max mss value */
1872                 shinfo->gso_size = max(gso_size, tail_gso_size);
1873                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1874
1875                 sk->sk_backlog.len += delta;
1876                 __NET_INC_STATS(sock_net(sk),
1877                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1878                 kfree_skb_partial(skb, fragstolen);
1879                 return false;
1880         }
1881         __skb_push(skb, hdrlen);
1882
1883 no_coalesce:
1884         /* Only the socket owner can try to collapse/prune rx queues
1885          * to reduce memory overhead, so add a little headroom here.
1886          * Only a few socket backlogs are likely to be non-empty at once.
1887          */
1888         limit += 64*1024;
1889
1890         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1891                 bh_unlock_sock(sk);
1892                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1893                 return true;
1894         }
1895         return false;
1896 }
1897 EXPORT_SYMBOL(tcp_add_backlog);
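/*
 * Sizing example for the backlog limit used above, with illustrative
 * numbers: for sk_rcvbuf = sk_sndbuf = 1 MB, the backlog may hold roughly
 * 2 MB + 64 KB of truesize before sk_add_backlog() fails and the segment
 * is dropped (counted as LINUX_MIB_TCPBACKLOGDROP).  Coalescing into the
 * tail skb is still attempted even past that limit.
 */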
1898
1899 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1900 {
1901         struct tcphdr *th = (struct tcphdr *)skb->data;
1902
1903         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1904 }
1905 EXPORT_SYMBOL(tcp_filter);
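/*
 * The wrapper above runs the socket's attached filter (if any) with a
 * trim cap of th->doff * 4, i.e. a filter may shorten the skb but never
 * below the TCP header, so subsequent header accesses remain valid.
 */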
1906
1907 static void tcp_v4_restore_cb(struct sk_buff *skb)
1908 {
1909         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1910                 sizeof(struct inet_skb_parm));
1911 }
1912
1913 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1914                            const struct tcphdr *th)
1915 {
1916         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1917          * barrier() makes sure the compiler won't play fool^Waliasing games.
1918          */
1919         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1920                 sizeof(struct inet_skb_parm));
1921         barrier();
1922
1923         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1924         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1925                                     skb->len - th->doff * 4);
1926         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1927         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1928         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1929         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1930         TCP_SKB_CB(skb)->sacked  = 0;
1931         TCP_SKB_CB(skb)->has_rxtstamp =
1932                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1933 }
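/*
 * Worked example for the end_seq arithmetic above: SYN and FIN each
 * occupy one unit of sequence space, so a pure SYN yields
 * end_seq = seq + 1, while a segment carrying 100 bytes of payload plus a
 * FIN yields end_seq = seq + 101.
 */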
1934
1935 /*
1936  *      From tcp_input.c
1937  */
1938
1939 int tcp_v4_rcv(struct sk_buff *skb)
1940 {
1941         struct net *net = dev_net(skb->dev);
1942         struct sk_buff *skb_to_free;
1943         int sdif = inet_sdif(skb);
1944         int dif = inet_iif(skb);
1945         const struct iphdr *iph;
1946         const struct tcphdr *th;
1947         bool refcounted;
1948         struct sock *sk;
1949         int ret;
1950
1951         if (skb->pkt_type != PACKET_HOST)
1952                 goto discard_it;
1953
1954         /* Count it even if it's bad */
1955         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1956
1957         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1958                 goto discard_it;
1959
1960         th = (const struct tcphdr *)skb->data;
1961
1962         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1963                 goto bad_packet;
1964         if (!pskb_may_pull(skb, th->doff * 4))
1965                 goto discard_it;
1966
1967         /* An explanation is required here, I think.
1968          * Packet length and doff are validated by header prediction,
1969          * provided the case of th->doff == 0 is eliminated.
1970          * So, we defer the checks. */
1971
1972         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1973                 goto csum_error;
1974
1975         th = (const struct tcphdr *)skb->data;
1976         iph = ip_hdr(skb);
1977 lookup:
1978         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1979                                th->dest, sdif, &refcounted);
1980         if (!sk)
1981                 goto no_tcp_socket;
1982
1983 process:
1984         if (sk->sk_state == TCP_TIME_WAIT)
1985                 goto do_time_wait;
1986
1987         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1988                 struct request_sock *req = inet_reqsk(sk);
1989                 bool req_stolen = false;
1990                 struct sock *nsk;
1991
1992                 sk = req->rsk_listener;
1993                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1994                         sk_drops_add(sk, skb);
1995                         reqsk_put(req);
1996                         goto discard_it;
1997                 }
1998                 if (tcp_checksum_complete(skb)) {
1999                         reqsk_put(req);
2000                         goto csum_error;
2001                 }
2002                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2003                         inet_csk_reqsk_queue_drop_and_put(sk, req);
2004                         goto lookup;
2005                 }
2006                 /* We own a reference on the listener, increase it again
2007                  * as we might lose it too soon.
2008                  */
2009                 sock_hold(sk);
2010                 refcounted = true;
2011                 nsk = NULL;
2012                 if (!tcp_filter(sk, skb)) {
2013                         th = (const struct tcphdr *)skb->data;
2014                         iph = ip_hdr(skb);
2015                         tcp_v4_fill_cb(skb, iph, th);
2016                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2017                 }
2018                 if (!nsk) {
2019                         reqsk_put(req);
2020                         if (req_stolen) {
2021                                 /* Another cpu got exclusive access to req
2022                                  * and created a full blown socket.
2023                                  * Try to feed this packet to this socket
2024                                  * instead of discarding it.
2025                                  */
2026                                 tcp_v4_restore_cb(skb);
2027                                 sock_put(sk);
2028                                 goto lookup;
2029                         }
2030                         goto discard_and_relse;
2031                 }
2032                 if (nsk == sk) {
2033                         reqsk_put(req);
2034                         tcp_v4_restore_cb(skb);
2035                 } else if (tcp_child_process(sk, nsk, skb)) {
2036                         tcp_v4_send_reset(nsk, skb);
2037                         goto discard_and_relse;
2038                 } else {
2039                         sock_put(sk);
2040                         return 0;
2041                 }
2042         }
2043         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2044                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2045                 goto discard_and_relse;
2046         }
2047
2048         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2049                 goto discard_and_relse;
2050
2051         if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2052                 goto discard_and_relse;
2053
2054         nf_reset_ct(skb);
2055
2056         if (tcp_filter(sk, skb))
2057                 goto discard_and_relse;
2058         th = (const struct tcphdr *)skb->data;
2059         iph = ip_hdr(skb);
2060         tcp_v4_fill_cb(skb, iph, th);
2061
2062         skb->dev = NULL;
2063
2064         if (sk->sk_state == TCP_LISTEN) {
2065                 ret = tcp_v4_do_rcv(sk, skb);
2066                 goto put_and_return;
2067         }
2068
2069         sk_incoming_cpu_update(sk);
2070
2071         bh_lock_sock_nested(sk);
2072         tcp_segs_in(tcp_sk(sk), skb);
2073         ret = 0;
2074         if (!sock_owned_by_user(sk)) {
2075                 skb_to_free = sk->sk_rx_skb_cache;
2076                 sk->sk_rx_skb_cache = NULL;
2077                 ret = tcp_v4_do_rcv(sk, skb);
2078         } else {
2079                 if (tcp_add_backlog(sk, skb))
2080                         goto discard_and_relse;
2081                 skb_to_free = NULL;
2082         }
2083         bh_unlock_sock(sk);
2084         if (skb_to_free)
2085                 __kfree_skb(skb_to_free);
2086
2087 put_and_return:
2088         if (refcounted)
2089                 sock_put(sk);
2090
2091         return ret;
2092
2093 no_tcp_socket:
2094         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2095                 goto discard_it;
2096
2097         tcp_v4_fill_cb(skb, iph, th);
2098
2099         if (tcp_checksum_complete(skb)) {
2100 csum_error:
2101                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2102 bad_packet:
2103                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2104         } else {
2105                 tcp_v4_send_reset(NULL, skb);
2106         }
2107
2108 discard_it:
2109         /* Discard frame. */
2110         kfree_skb(skb);
2111         return 0;
2112
2113 discard_and_relse:
2114         sk_drops_add(sk, skb);
2115         if (refcounted)
2116                 sock_put(sk);
2117         goto discard_it;
2118
2119 do_time_wait:
2120         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2121                 inet_twsk_put(inet_twsk(sk));
2122                 goto discard_it;
2123         }
2124
2125         tcp_v4_fill_cb(skb, iph, th);
2126
2127         if (tcp_checksum_complete(skb)) {
2128                 inet_twsk_put(inet_twsk(sk));
2129                 goto csum_error;
2130         }
2131         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2132         case TCP_TW_SYN: {
2133                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2134                                                         &tcp_hashinfo, skb,
2135                                                         __tcp_hdrlen(th),
2136                                                         iph->saddr, th->source,
2137                                                         iph->daddr, th->dest,
2138                                                         inet_iif(skb),
2139                                                         sdif);
2140                 if (sk2) {
2141                         inet_twsk_deschedule_put(inet_twsk(sk));
2142                         sk = sk2;
2143                         tcp_v4_restore_cb(skb);
2144                         refcounted = false;
2145                         goto process;
2146                 }
2147         }
2148                 /* to ACK */
2149                 fallthrough;
2150         case TCP_TW_ACK:
2151                 tcp_v4_timewait_ack(sk, skb);
2152                 break;
2153         case TCP_TW_RST:
2154                 tcp_v4_send_reset(sk, skb);
2155                 inet_twsk_deschedule_put(inet_twsk(sk));
2156                 goto discard_it;
2157         case TCP_TW_SUCCESS:;
2158         }
2159         goto discard_it;
2160 }
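/*
 * High-level flow of tcp_v4_rcv() above: validate the header and
 * checksum, look up the socket (request, TIME_WAIT or full socket), apply
 * the MD5, XFRM policy and socket filter checks, then either process the
 * segment directly when the socket is not owned by user context or queue
 * it to the backlog.  A SYN hitting a TIME_WAIT socket may be re-targeted
 * to a matching listener (TCP_TW_SYN).
 */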
2161
2162 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2163         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2164         .twsk_unique    = tcp_twsk_unique,
2165         .twsk_destructor= tcp_twsk_destructor,
2166 };
2167
2168 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2169 {
2170         struct dst_entry *dst = skb_dst(skb);
2171
2172         if (dst && dst_hold_safe(dst)) {
2173                 sk->sk_rx_dst = dst;
2174                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2175         }
2176 }
2177 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2178
2179 const struct inet_connection_sock_af_ops ipv4_specific = {
2180         .queue_xmit        = ip_queue_xmit,
2181         .send_check        = tcp_v4_send_check,
2182         .rebuild_header    = inet_sk_rebuild_header,
2183         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2184         .conn_request      = tcp_v4_conn_request,
2185         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2186         .net_header_len    = sizeof(struct iphdr),
2187         .setsockopt        = ip_setsockopt,
2188         .getsockopt        = ip_getsockopt,
2189         .addr2sockaddr     = inet_csk_addr2sockaddr,
2190         .sockaddr_len      = sizeof(struct sockaddr_in),
2191         .mtu_reduced       = tcp_v4_mtu_reduced,
2192 };
2193 EXPORT_SYMBOL(ipv4_specific);
2194
2195 #ifdef CONFIG_TCP_MD5SIG
2196 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2197         .md5_lookup             = tcp_v4_md5_lookup,
2198         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2199         .md5_parse              = tcp_v4_parse_md5_keys,
2200 };
2201 #endif
2202
2203 /* NOTE: A lot of things are set to zero explicitly by the call to
2204  *       sk_alloc(), so they need not be done here.
2205  */
2206 static int tcp_v4_init_sock(struct sock *sk)
2207 {
2208         struct inet_connection_sock *icsk = inet_csk(sk);
2209
2210         tcp_init_sock(sk);
2211
2212         icsk->icsk_af_ops = &ipv4_specific;
2213
2214 #ifdef CONFIG_TCP_MD5SIG
2215         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2216 #endif
2217
2218         return 0;
2219 }
2220
2221 void tcp_v4_destroy_sock(struct sock *sk)
2222 {
2223         struct tcp_sock *tp = tcp_sk(sk);
2224
2225         trace_tcp_destroy_sock(sk);
2226
2227         tcp_clear_xmit_timers(sk);
2228
2229         tcp_cleanup_congestion_control(sk);
2230
2231         tcp_cleanup_ulp(sk);
2232
2233         /* Clean up the write buffer. */
2234         tcp_write_queue_purge(sk);
2235
2236         /* Check if we want to disable active TFO */
2237         tcp_fastopen_active_disable_ofo_check(sk);
2238
2239         /* Cleans up our, hopefully empty, out_of_order_queue. */
2240         skb_rbtree_purge(&tp->out_of_order_queue);
2241
2242 #ifdef CONFIG_TCP_MD5SIG
2243         /* Clean up the MD5 key list, if any */
2244         if (tp->md5sig_info) {
2245                 tcp_clear_md5_list(sk);
2246                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2247                 tp->md5sig_info = NULL;
2248         }
2249 #endif
2250
2251         /* Clean up a referenced TCP bind bucket. */
2252         if (inet_csk(sk)->icsk_bind_hash)
2253                 inet_put_port(sk);
2254
2255         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2256
2257         /* If socket is aborted during connect operation */
2258         tcp_free_fastopen_req(tp);
2259         tcp_fastopen_destroy_cipher(sk);
2260         tcp_saved_syn_free(tp);
2261
2262         sk_sockets_allocated_dec(sk);
2263 }
2264 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2265
2266 #ifdef CONFIG_PROC_FS
2267 /* Proc filesystem TCP sock list dumping. */
2268
2269 /*
2270  * Get the next listener socket following cur.  If cur is NULL, get the first
2271  * socket starting from the bucket given in st->bucket; when st->bucket is zero
2272  * the very first socket in the hash table is returned.
2273  */
2274 static void *listening_get_next(struct seq_file *seq, void *cur)
2275 {
2276         struct tcp_seq_afinfo *afinfo;
2277         struct tcp_iter_state *st = seq->private;
2278         struct net *net = seq_file_net(seq);
2279         struct inet_listen_hashbucket *ilb;
2280         struct hlist_nulls_node *node;
2281         struct sock *sk = cur;
2282
2283         if (st->bpf_seq_afinfo)
2284                 afinfo = st->bpf_seq_afinfo;
2285         else
2286                 afinfo = PDE_DATA(file_inode(seq->file));
2287
2288         if (!sk) {
2289 get_head:
2290                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2291                 spin_lock(&ilb->lock);
2292                 sk = sk_nulls_head(&ilb->nulls_head);
2293                 st->offset = 0;
2294                 goto get_sk;
2295         }
2296         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2297         ++st->num;
2298         ++st->offset;
2299
2300         sk = sk_nulls_next(sk);
2301 get_sk:
2302         sk_nulls_for_each_from(sk, node) {
2303                 if (!net_eq(sock_net(sk), net))
2304                         continue;
2305                 if (afinfo->family == AF_UNSPEC ||
2306                     sk->sk_family == afinfo->family)
2307                         return sk;
2308         }
2309         spin_unlock(&ilb->lock);
2310         st->offset = 0;
2311         if (++st->bucket < INET_LHTABLE_SIZE)
2312                 goto get_head;
2313         return NULL;
2314 }
2315
2316 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2317 {
2318         struct tcp_iter_state *st = seq->private;
2319         void *rc;
2320
2321         st->bucket = 0;
2322         st->offset = 0;
2323         rc = listening_get_next(seq, NULL);
2324
2325         while (rc && *pos) {
2326                 rc = listening_get_next(seq, rc);
2327                 --*pos;
2328         }
2329         return rc;
2330 }
2331
2332 static inline bool empty_bucket(const struct tcp_iter_state *st)
2333 {
2334         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2335 }
2336
2337 /*
2338  * Get first established socket starting from bucket given in st->bucket.
2339  * If st->bucket is zero, the very first socket in the hash is returned.
2340  */
2341 static void *established_get_first(struct seq_file *seq)
2342 {
2343         struct tcp_seq_afinfo *afinfo;
2344         struct tcp_iter_state *st = seq->private;
2345         struct net *net = seq_file_net(seq);
2346         void *rc = NULL;
2347
2348         if (st->bpf_seq_afinfo)
2349                 afinfo = st->bpf_seq_afinfo;
2350         else
2351                 afinfo = PDE_DATA(file_inode(seq->file));
2352
2353         st->offset = 0;
2354         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2355                 struct sock *sk;
2356                 struct hlist_nulls_node *node;
2357                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2358
2359                 /* Lockless fast path for the common case of empty buckets */
2360                 if (empty_bucket(st))
2361                         continue;
2362
2363                 spin_lock_bh(lock);
2364                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2365                         if ((afinfo->family != AF_UNSPEC &&
2366                              sk->sk_family != afinfo->family) ||
2367                             !net_eq(sock_net(sk), net)) {
2368                                 continue;
2369                         }
2370                         rc = sk;
2371                         goto out;
2372                 }
2373                 spin_unlock_bh(lock);
2374         }
2375 out:
2376         return rc;
2377 }
2378
2379 static void *established_get_next(struct seq_file *seq, void *cur)
2380 {
2381         struct tcp_seq_afinfo *afinfo;
2382         struct sock *sk = cur;
2383         struct hlist_nulls_node *node;
2384         struct tcp_iter_state *st = seq->private;
2385         struct net *net = seq_file_net(seq);
2386
2387         if (st->bpf_seq_afinfo)
2388                 afinfo = st->bpf_seq_afinfo;
2389         else
2390                 afinfo = PDE_DATA(file_inode(seq->file));
2391
2392         ++st->num;
2393         ++st->offset;
2394
2395         sk = sk_nulls_next(sk);
2396
2397         sk_nulls_for_each_from(sk, node) {
2398                 if ((afinfo->family == AF_UNSPEC ||
2399                      sk->sk_family == afinfo->family) &&
2400                     net_eq(sock_net(sk), net))
2401                         return sk;
2402         }
2403
2404         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2405         ++st->bucket;
2406         return established_get_first(seq);
2407 }
2408
2409 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2410 {
2411         struct tcp_iter_state *st = seq->private;
2412         void *rc;
2413
2414         st->bucket = 0;
2415         rc = established_get_first(seq);
2416
2417         while (rc && pos) {
2418                 rc = established_get_next(seq, rc);
2419                 --pos;
2420         }
2421         return rc;
2422 }
2423
2424 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2425 {
2426         void *rc;
2427         struct tcp_iter_state *st = seq->private;
2428
2429         st->state = TCP_SEQ_STATE_LISTENING;
2430         rc        = listening_get_idx(seq, &pos);
2431
2432         if (!rc) {
2433                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2434                 rc        = established_get_idx(seq, pos);
2435         }
2436
2437         return rc;
2438 }
2439
2440 static void *tcp_seek_last_pos(struct seq_file *seq)
2441 {
2442         struct tcp_iter_state *st = seq->private;
2443         int offset = st->offset;
2444         int orig_num = st->num;
2445         void *rc = NULL;
2446
2447         switch (st->state) {
2448         case TCP_SEQ_STATE_LISTENING:
2449                 if (st->bucket >= INET_LHTABLE_SIZE)
2450                         break;
2451                 st->state = TCP_SEQ_STATE_LISTENING;
2452                 rc = listening_get_next(seq, NULL);
2453                 while (offset-- && rc)
2454                         rc = listening_get_next(seq, rc);
2455                 if (rc)
2456                         break;
2457                 st->bucket = 0;
2458                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2459                 fallthrough;
2460         case TCP_SEQ_STATE_ESTABLISHED:
2461                 if (st->bucket > tcp_hashinfo.ehash_mask)
2462                         break;
2463                 rc = established_get_first(seq);
2464                 while (offset-- && rc)
2465                         rc = established_get_next(seq, rc);
2466         }
2467
2468         st->num = orig_num;
2469
2470         return rc;
2471 }
2472
2473 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2474 {
2475         struct tcp_iter_state *st = seq->private;
2476         void *rc;
2477
2478         if (*pos && *pos == st->last_pos) {
2479                 rc = tcp_seek_last_pos(seq);
2480                 if (rc)
2481                         goto out;
2482         }
2483
2484         st->state = TCP_SEQ_STATE_LISTENING;
2485         st->num = 0;
2486         st->bucket = 0;
2487         st->offset = 0;
2488         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2489
2490 out:
2491         st->last_pos = *pos;
2492         return rc;
2493 }
2494 EXPORT_SYMBOL(tcp_seq_start);
2495
2496 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2497 {
2498         struct tcp_iter_state *st = seq->private;
2499         void *rc = NULL;
2500
2501         if (v == SEQ_START_TOKEN) {
2502                 rc = tcp_get_idx(seq, 0);
2503                 goto out;
2504         }
2505
2506         switch (st->state) {
2507         case TCP_SEQ_STATE_LISTENING:
2508                 rc = listening_get_next(seq, v);
2509                 if (!rc) {
2510                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2511                         st->bucket = 0;
2512                         st->offset = 0;
2513                         rc        = established_get_first(seq);
2514                 }
2515                 break;
2516         case TCP_SEQ_STATE_ESTABLISHED:
2517                 rc = established_get_next(seq, v);
2518                 break;
2519         }
2520 out:
2521         ++*pos;
2522         st->last_pos = *pos;
2523         return rc;
2524 }
2525 EXPORT_SYMBOL(tcp_seq_next);
2526
2527 void tcp_seq_stop(struct seq_file *seq, void *v)
2528 {
2529         struct tcp_iter_state *st = seq->private;
2530
2531         switch (st->state) {
2532         case TCP_SEQ_STATE_LISTENING:
2533                 if (v != SEQ_START_TOKEN)
2534                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2535                 break;
2536         case TCP_SEQ_STATE_ESTABLISHED:
2537                 if (v)
2538                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2539                 break;
2540         }
2541 }
2542 EXPORT_SYMBOL(tcp_seq_stop);
2543
2544 static void get_openreq4(const struct request_sock *req,
2545                          struct seq_file *f, int i)
2546 {
2547         const struct inet_request_sock *ireq = inet_rsk(req);
2548         long delta = req->rsk_timer.expires - jiffies;
2549
2550         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2551                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2552                 i,
2553                 ireq->ir_loc_addr,
2554                 ireq->ir_num,
2555                 ireq->ir_rmt_addr,
2556                 ntohs(ireq->ir_rmt_port),
2557                 TCP_SYN_RECV,
2558                 0, 0, /* could print option size, but that is af dependent. */
2559                 1,    /* timers active (only the expire timer) */
2560                 jiffies_delta_to_clock_t(delta),
2561                 req->num_timeout,
2562                 from_kuid_munged(seq_user_ns(f),
2563                                  sock_i_uid(req->rsk_listener)),
2564                 0,  /* non standard timer */
2565                 0, /* open_requests have no inode */
2566                 0,
2567                 req);
2568 }
2569
2570 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2571 {
2572         int timer_active;
2573         unsigned long timer_expires;
2574         const struct tcp_sock *tp = tcp_sk(sk);
2575         const struct inet_connection_sock *icsk = inet_csk(sk);
2576         const struct inet_sock *inet = inet_sk(sk);
2577         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2578         __be32 dest = inet->inet_daddr;
2579         __be32 src = inet->inet_rcv_saddr;
2580         __u16 destp = ntohs(inet->inet_dport);
2581         __u16 srcp = ntohs(inet->inet_sport);
2582         int rx_queue;
2583         int state;
2584
2585         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2586             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2587             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2588                 timer_active    = 1;
2589                 timer_expires   = icsk->icsk_timeout;
2590         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2591                 timer_active    = 4;
2592                 timer_expires   = icsk->icsk_timeout;
2593         } else if (timer_pending(&sk->sk_timer)) {
2594                 timer_active    = 2;
2595                 timer_expires   = sk->sk_timer.expires;
2596         } else {
2597                 timer_active    = 0;
2598                 timer_expires = jiffies;
2599         }
2600
2601         state = inet_sk_state_load(sk);
2602         if (state == TCP_LISTEN)
2603                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2604         else
2605                 /* Because we don't lock the socket,
2606                  * we might find a transient negative value.
2607                  */
2608                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2609                                       READ_ONCE(tp->copied_seq), 0);
2610
2611         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2612                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2613                 i, src, srcp, dest, destp, state,
2614                 READ_ONCE(tp->write_seq) - tp->snd_una,
2615                 rx_queue,
2616                 timer_active,
2617                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2618                 icsk->icsk_retransmits,
2619                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2620                 icsk->icsk_probes_out,
2621                 sock_i_ino(sk),
2622                 refcount_read(&sk->sk_refcnt), sk,
2623                 jiffies_to_clock_t(icsk->icsk_rto),
2624                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2625                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2626                 tp->snd_cwnd,
2627                 state == TCP_LISTEN ?
2628                     fastopenq->max_qlen :
2629                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2630 }
2631
2632 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2633                                struct seq_file *f, int i)
2634 {
2635         long delta = tw->tw_timer.expires - jiffies;
2636         __be32 dest, src;
2637         __u16 destp, srcp;
2638
2639         dest  = tw->tw_daddr;
2640         src   = tw->tw_rcv_saddr;
2641         destp = ntohs(tw->tw_dport);
2642         srcp  = ntohs(tw->tw_sport);
2643
2644         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2645                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2646                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2647                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2648                 refcount_read(&tw->tw_refcnt), tw);
2649 }
2650
2651 #define TMPSZ 150
2652
2653 static int tcp4_seq_show(struct seq_file *seq, void *v)
2654 {
2655         struct tcp_iter_state *st;
2656         struct sock *sk = v;
2657
2658         seq_setwidth(seq, TMPSZ - 1);
2659         if (v == SEQ_START_TOKEN) {
2660                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2661                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2662                            "inode");
2663                 goto out;
2664         }
2665         st = seq->private;
2666
2667         if (sk->sk_state == TCP_TIME_WAIT)
2668                 get_timewait4_sock(v, seq, st->num);
2669         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2670                 get_openreq4(v, seq, st->num);
2671         else
2672                 get_tcp4_sock(v, seq, st->num);
2673 out:
2674         seq_pad(seq, '\n');
2675         return 0;
2676 }
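/*
 * Illustrative /proc/net/tcp line as produced above, with hypothetical
 * values on a little-endian host - a listener on 127.0.0.1:3306 would
 * start like:
 *
 *    0: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 ...
 *
 * Addresses are the raw __be32 printed with %08X (hence 127.0.0.1 shows
 * up byte-swapped on little-endian machines), ports are host-order hex,
 * and the "st" column is the numeric TCP state (0A == TCP_LISTEN).
 */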
2677
2678 #ifdef CONFIG_BPF_SYSCALL
2679 struct bpf_iter__tcp {
2680         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2681         __bpf_md_ptr(struct sock_common *, sk_common);
2682         uid_t uid __aligned(8);
2683 };
2684
2685 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2686                              struct sock_common *sk_common, uid_t uid)
2687 {
2688         struct bpf_iter__tcp ctx;
2689
2690         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2691         ctx.meta = meta;
2692         ctx.sk_common = sk_common;
2693         ctx.uid = uid;
2694         return bpf_iter_run_prog(prog, &ctx);
2695 }
2696
2697 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2698 {
2699         struct bpf_iter_meta meta;
2700         struct bpf_prog *prog;
2701         struct sock *sk = v;
2702         uid_t uid;
2703
2704         if (v == SEQ_START_TOKEN)
2705                 return 0;
2706
2707         if (sk->sk_state == TCP_TIME_WAIT) {
2708                 uid = 0;
2709         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2710                 const struct request_sock *req = v;
2711
2712                 uid = from_kuid_munged(seq_user_ns(seq),
2713                                        sock_i_uid(req->rsk_listener));
2714         } else {
2715                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2716         }
2717
2718         meta.seq = seq;
2719         prog = bpf_iter_get_info(&meta, false);
2720         return tcp_prog_seq_show(prog, &meta, v, uid);
2721 }
2722
2723 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2724 {
2725         struct bpf_iter_meta meta;
2726         struct bpf_prog *prog;
2727
2728         if (!v) {
2729                 meta.seq = seq;
2730                 prog = bpf_iter_get_info(&meta, true);
2731                 if (prog)
2732                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
2733         }
2734
2735         tcp_seq_stop(seq, v);
2736 }
2737
2738 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2739         .show           = bpf_iter_tcp_seq_show,
2740         .start          = tcp_seq_start,
2741         .next           = tcp_seq_next,
2742         .stop           = bpf_iter_tcp_seq_stop,
2743 };
2744 #endif
2745
2746 static const struct seq_operations tcp4_seq_ops = {
2747         .show           = tcp4_seq_show,
2748         .start          = tcp_seq_start,
2749         .next           = tcp_seq_next,
2750         .stop           = tcp_seq_stop,
2751 };
2752
2753 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2754         .family         = AF_INET,
2755 };
2756
2757 static int __net_init tcp4_proc_init_net(struct net *net)
2758 {
2759         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2760                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2761                 return -ENOMEM;
2762         return 0;
2763 }
2764
2765 static void __net_exit tcp4_proc_exit_net(struct net *net)
2766 {
2767         remove_proc_entry("tcp", net->proc_net);
2768 }
2769
2770 static struct pernet_operations tcp4_net_ops = {
2771         .init = tcp4_proc_init_net,
2772         .exit = tcp4_proc_exit_net,
2773 };
2774
2775 int __init tcp4_proc_init(void)
2776 {
2777         return register_pernet_subsys(&tcp4_net_ops);
2778 }
2779
2780 void tcp4_proc_exit(void)
2781 {
2782         unregister_pernet_subsys(&tcp4_net_ops);
2783 }
2784 #endif /* CONFIG_PROC_FS */
2785
2786 /* @wake is one when sk_stream_write_space() calls us.
2787  * In that case EPOLLOUT is sent only once notsent_bytes drops below half the limit.
2788  * This mimics the strategy used in sock_def_write_space().
2789  */
2790 bool tcp_stream_memory_free(const struct sock *sk, int wake)
2791 {
2792         const struct tcp_sock *tp = tcp_sk(sk);
2793         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
2794                             READ_ONCE(tp->snd_nxt);
2795
2796         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
2797 }
2798 EXPORT_SYMBOL(tcp_stream_memory_free);
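/*
 * Worked example for the check above: with @wake == 1 (called from
 * sk_stream_write_space()) and TCP_NOTSENT_LOWAT set to 128 KB, EPOLLOUT
 * is only signalled once the not-yet-sent bytes (write_seq - snd_nxt)
 * drop below 64 KB, because the shift doubles the value before it is
 * compared against the limit.
 */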
2799
2800 struct proto tcp_prot = {
2801         .name                   = "TCP",
2802         .owner                  = THIS_MODULE,
2803         .close                  = tcp_close,
2804         .pre_connect            = tcp_v4_pre_connect,
2805         .connect                = tcp_v4_connect,
2806         .disconnect             = tcp_disconnect,
2807         .accept                 = inet_csk_accept,
2808         .ioctl                  = tcp_ioctl,
2809         .init                   = tcp_v4_init_sock,
2810         .destroy                = tcp_v4_destroy_sock,
2811         .shutdown               = tcp_shutdown,
2812         .setsockopt             = tcp_setsockopt,
2813         .getsockopt             = tcp_getsockopt,
2814         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
2815         .keepalive              = tcp_set_keepalive,
2816         .recvmsg                = tcp_recvmsg,
2817         .sendmsg                = tcp_sendmsg,
2818         .sendpage               = tcp_sendpage,
2819         .backlog_rcv            = tcp_v4_do_rcv,
2820         .release_cb             = tcp_release_cb,
2821         .hash                   = inet_hash,
2822         .unhash                 = inet_unhash,
2823         .get_port               = inet_csk_get_port,
2824 #ifdef CONFIG_BPF_SYSCALL
2825         .psock_update_sk_prot   = tcp_bpf_update_proto,
2826 #endif
2827         .enter_memory_pressure  = tcp_enter_memory_pressure,
2828         .leave_memory_pressure  = tcp_leave_memory_pressure,
2829         .stream_memory_free     = tcp_stream_memory_free,
2830         .sockets_allocated      = &tcp_sockets_allocated,
2831         .orphan_count           = &tcp_orphan_count,
2832         .memory_allocated       = &tcp_memory_allocated,
2833         .memory_pressure        = &tcp_memory_pressure,
2834         .sysctl_mem             = sysctl_tcp_mem,
2835         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2836         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2837         .max_header             = MAX_TCP_HEADER,
2838         .obj_size               = sizeof(struct tcp_sock),
2839         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2840         .twsk_prot              = &tcp_timewait_sock_ops,
2841         .rsk_prot               = &tcp_request_sock_ops,
2842         .h.hashinfo             = &tcp_hashinfo,
2843         .no_autobind            = true,
2844         .diag_destroy           = tcp_abort,
2845 };
2846 EXPORT_SYMBOL(tcp_prot);
2847
2848 static void __net_exit tcp_sk_exit(struct net *net)
2849 {
2850         int cpu;
2851
2852         if (net->ipv4.tcp_congestion_control)
2853                 bpf_module_put(net->ipv4.tcp_congestion_control,
2854                                net->ipv4.tcp_congestion_control->owner);
2855
2856         for_each_possible_cpu(cpu)
2857                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2858         free_percpu(net->ipv4.tcp_sk);
2859 }
2860
2861 static int __net_init tcp_sk_init(struct net *net)
2862 {
2863         int res, cpu, cnt;
2864
2865         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2866         if (!net->ipv4.tcp_sk)
2867                 return -ENOMEM;
2868
2869         for_each_possible_cpu(cpu) {
2870                 struct sock *sk;
2871
2872                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2873                                            IPPROTO_TCP, net);
2874                 if (res)
2875                         goto fail;
2876                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2877
2878         /* Enforce IP_DF and IPID==0 on RSTs and
2879          * ACKs sent in SYN-RECV and TIME-WAIT state.
2880          */
2881                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2882
2883                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2884         }
2885
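             /* Defaults for the per-namespace TCP sysctls. */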
2886         net->ipv4.sysctl_tcp_ecn = 2;
2887         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2888
2889         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2890         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2891         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2892         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2893         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2894
2895         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2896         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2897         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2898
2899         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2900         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2901         net->ipv4.sysctl_tcp_syncookies = 1;
2902         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2903         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2904         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2905         net->ipv4.sysctl_tcp_orphan_retries = 0;
2906         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2907         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2908         net->ipv4.sysctl_tcp_tw_reuse = 2;
2909         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2910
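             /* Scale the TIME-WAIT bucket and SYN backlog limits with the
              * size of the established-connections hash table.
              */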
2911         cnt = tcp_hashinfo.ehash_mask + 1;
2912         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2913         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2914
2915         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2916         net->ipv4.sysctl_tcp_sack = 1;
2917         net->ipv4.sysctl_tcp_window_scaling = 1;
2918         net->ipv4.sysctl_tcp_timestamps = 1;
2919         net->ipv4.sysctl_tcp_early_retrans = 3;
2920         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2921         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2922         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2923         net->ipv4.sysctl_tcp_max_reordering = 300;
2924         net->ipv4.sysctl_tcp_dsack = 1;
2925         net->ipv4.sysctl_tcp_app_win = 31;
2926         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2927         net->ipv4.sysctl_tcp_frto = 2;
2928         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2929         /* This limits the percentage of the congestion window which we
2930          * will allow a single TSO frame to consume.  Building TSO frames
2931          * which are too large can cause TCP streams to be bursty.
2932          */
2933         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2934         /* Default TSQ limit of 16 TSO segments */
2935         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2936         /* RFC 5961 challenge ACK rate limiting */
2937         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2938         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2939         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2940         net->ipv4.sysctl_tcp_autocorking = 1;
2941         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2942         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2943         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
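             /* Child namespaces start from init_net's current tcp_rmem and
              * tcp_wmem values.
              */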
2944         if (net != &init_net) {
2945                 memcpy(net->ipv4.sysctl_tcp_rmem,
2946                        init_net.ipv4.sysctl_tcp_rmem,
2947                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2948                 memcpy(net->ipv4.sysctl_tcp_wmem,
2949                        init_net.ipv4.sysctl_tcp_wmem,
2950                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2951         }
2952         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2953         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2954         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2955         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2956         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2957         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2958         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2959
2960         /* Reno is always built in */
2961         if (!net_eq(net, &init_net) &&
2962             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2963                                init_net.ipv4.tcp_congestion_control->owner))
2964                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2965         else
2966                 net->ipv4.tcp_congestion_control = &tcp_reno;
2967
2968         return 0;
2969 fail:
2970         tcp_sk_exit(net);
2971
2972         return res;
2973 }
2974
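     /* Batched exit path: flush remaining TIME-WAIT sockets once for the
      * whole batch of dying namespaces, then free each namespace's TCP Fast
      * Open context.
      */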
2975 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2976 {
2977         struct net *net;
2978
2979         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2980
2981         list_for_each_entry(net, net_exit_list, exit_list)
2982                 tcp_fastopen_ctx_destroy(net);
2983 }
2984
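     /* Hooks invoked when a network namespace is created, destroyed, or torn
      * down as part of a batch.
      */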
2985 static struct pernet_operations __net_initdata tcp_sk_ops = {
2986        .init       = tcp_sk_init,
2987        .exit       = tcp_sk_exit,
2988        .exit_batch = tcp_sk_exit_batch,
2989 };
2990
2991 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2992 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2993                      struct sock_common *sk_common, uid_t uid)
2994
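     /* Private-state constructor for the "tcp" BPF iterator.  AF_UNSPEC makes
      * the underlying seq walker visit both IPv4 and IPv6 sockets.
      */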
2995 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2996 {
2997         struct tcp_iter_state *st = priv_data;
2998         struct tcp_seq_afinfo *afinfo;
2999         int ret;
3000
3001         afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
3002         if (!afinfo)
3003                 return -ENOMEM;
3004
3005         afinfo->family = AF_UNSPEC;
3006         st->bpf_seq_afinfo = afinfo;
3007         ret = bpf_iter_init_seq_net(priv_data, aux);
3008         if (ret)
3009                 kfree(afinfo);
3010         return ret;
3011 }
3012
3013 static void bpf_iter_fini_tcp(void *priv_data)
3014 {
3015         struct tcp_iter_state *st = priv_data;
3016
3017         kfree(st->bpf_seq_afinfo);
3018         bpf_iter_fini_seq_net(priv_data);
3019 }
3020
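     /* seq_file glue handed to the BPF iterator core for the "tcp" target. */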
3021 static const struct bpf_iter_seq_info tcp_seq_info = {
3022         .seq_ops                = &bpf_iter_tcp_seq_ops,
3023         .init_seq_private       = bpf_iter_init_tcp,
3024         .fini_seq_private       = bpf_iter_fini_tcp,
3025         .seq_priv_size          = sizeof(struct tcp_iter_state),
3026 };
3027
3028 static struct bpf_iter_reg tcp_reg_info = {
3029         .target                 = "tcp",
3030         .ctx_arg_info_size      = 1,
3031         .ctx_arg_info           = {
3032                 { offsetof(struct bpf_iter__tcp, sk_common),
3033                   PTR_TO_BTF_ID_OR_NULL },
3034         },
3035         .seq_info               = &tcp_seq_info,
3036 };
3037
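     /* Register the "tcp" iterator target at boot; failure only warns, since
      * the rest of the TCP stack does not depend on it.
      */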
3038 static void __init bpf_iter_register(void)
3039 {
3040         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3041         if (bpf_iter_reg_target(&tcp_reg_info))
3042                 pr_warn("Warning: could not register bpf iterator tcp\n");
3043 }
3044
3045 #endif
3046
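     /* Boot-time initialization: register the per-netns init/exit hooks
      * (fatal on failure) and, when BPF and procfs are configured, the "tcp"
      * BPF iterator.
      */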
3047 void __init tcp_v4_init(void)
3048 {
3049         if (register_pernet_subsys(&tcp_sk_ops))
3050                 panic("Failed to create the TCP control socket.\n");
3051
3052 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3053         bpf_iter_register();
3054 #endif
3055 }