net/ipv4/tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
40  *                                      year in a coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
45  *                                      to a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
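/* Global TCP connection hash tables (established, bind and listening hashes),
 * shared by the IPv4 and IPv6 code.
 */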
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
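/* Generate the initial sequence number for an incoming connection request
 * from the 4-tuple of the received SYN (see secure_tcp_seq()).
 */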
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96         return secure_tcp_seq(ip_hdr(skb)->daddr,
97                               ip_hdr(skb)->saddr,
98                               tcp_hdr(skb)->dest,
99                               tcp_hdr(skb)->source);
100 }
101
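/* Per-connection randomization offset applied to TCP timestamp values,
 * derived from the addresses of the received segment (see secure_tcp_ts_off()).
 */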
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109         const struct inet_timewait_sock *tw = inet_twsk(sktw);
110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111         struct tcp_sock *tp = tcp_sk(sk);
112         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113
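        /* net.ipv4.tcp_tw_reuse: 0 = disabled, 1 = enabled globally,
         * 2 = enabled for loopback traffic only (the default).
         */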
114         if (reuse == 2) {
115                 /* Still does not detect *everything* that goes through
116                  * lo, since we require a loopback src or dst address
 117                  * or direct binding to the 'lo' interface.
118                  */
119                 bool loopback = false;
120                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121                         loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123                 if (tw->tw_family == AF_INET6) {
124                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128                                 loopback = true;
129                 } else
130 #endif
131                 {
132                         if (ipv4_is_loopback(tw->tw_daddr) ||
133                             ipv4_is_loopback(tw->tw_rcv_saddr))
134                                 loopback = true;
135                 }
136                 if (!loopback)
137                         reuse = 0;
138         }
139
 140         /* With PAWS, it is safe from the viewpoint
 141            of data integrity. Even without PAWS it is safe provided sequence
 142            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
 143
 144            Actually, the idea is close to VJ's: the timestamp cache is
 145            held not per host but per port pair, and the TW bucket is used
 146            as the state holder.
 147
 148            If the TW bucket has already been destroyed we fall back to VJ's
 149            scheme and use the initial timestamp retrieved from the peer table.
 150          */
151         if (tcptw->tw_ts_recent_stamp &&
152             (!twp || (reuse && time_after32(ktime_get_seconds(),
153                                             tcptw->tw_ts_recent_stamp)))) {
154                 /* In case of repair and re-using TIME-WAIT sockets we still
155                  * want to be sure that it is safe as above but honor the
156                  * sequence numbers and time stamps set as part of the repair
157                  * process.
158                  *
159                  * Without this check re-using a TIME-WAIT socket with TCP
160                  * repair would accumulate a -1 on the repair assigned
161                  * sequence number. The first time it is reused the sequence
162                  * is -1, the second time -2, etc. This fixes that issue
163                  * without appearing to create any others.
164                  */
165                 if (likely(!tp->repair)) {
166                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167
168                         if (!seq)
169                                 seq = 1;
170                         WRITE_ONCE(tp->write_seq, seq);
171                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
172                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173                 }
174                 sock_hold(sktw);
175                 return 1;
176         }
177
178         return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183                               int addr_len)
184 {
185         /* This check is replicated from tcp_v4_connect() and intended to
186          * prevent BPF program called below from accessing bytes that are out
187          * of the bound specified by user in addr_len.
188          */
189         if (addr_len < sizeof(struct sockaddr_in))
190                 return -EINVAL;
191
192         sock_owned_by_me(sk);
193
194         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196
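/* For reference, a minimal userspace sketch of the path that ends up in
 * tcp_v4_connect() (via inet_stream_connect() and sk->sk_prot->connect);
 * the address below is illustrative only:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */
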
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201         struct inet_sock *inet = inet_sk(sk);
202         struct tcp_sock *tp = tcp_sk(sk);
203         __be16 orig_sport, orig_dport;
204         __be32 daddr, nexthop;
205         struct flowi4 *fl4;
206         struct rtable *rt;
207         int err;
208         struct ip_options_rcu *inet_opt;
209         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210
211         if (addr_len < sizeof(struct sockaddr_in))
212                 return -EINVAL;
213
214         if (usin->sin_family != AF_INET)
215                 return -EAFNOSUPPORT;
216
217         nexthop = daddr = usin->sin_addr.s_addr;
218         inet_opt = rcu_dereference_protected(inet->inet_opt,
219                                              lockdep_sock_is_held(sk));
220         if (inet_opt && inet_opt->opt.srr) {
221                 if (!daddr)
222                         return -EINVAL;
223                 nexthop = inet_opt->opt.faddr;
224         }
225
226         orig_sport = inet->inet_sport;
227         orig_dport = usin->sin_port;
228         fl4 = &inet->cork.fl.u.ip4;
229         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231                               IPPROTO_TCP,
232                               orig_sport, orig_dport, sk);
233         if (IS_ERR(rt)) {
234                 err = PTR_ERR(rt);
235                 if (err == -ENETUNREACH)
236                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237                 return err;
238         }
239
240         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241                 ip_rt_put(rt);
242                 return -ENETUNREACH;
243         }
244
245         if (!inet_opt || !inet_opt->opt.srr)
246                 daddr = fl4->daddr;
247
248         if (!inet->inet_saddr)
249                 inet->inet_saddr = fl4->saddr;
250         sk_rcv_saddr_set(sk, inet->inet_saddr);
251
252         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253                 /* Reset inherited state */
254                 tp->rx_opt.ts_recent       = 0;
255                 tp->rx_opt.ts_recent_stamp = 0;
256                 if (likely(!tp->repair))
257                         WRITE_ONCE(tp->write_seq, 0);
258         }
259
260         inet->inet_dport = usin->sin_port;
261         sk_daddr_set(sk, daddr);
262
263         inet_csk(sk)->icsk_ext_hdr_len = 0;
264         if (inet_opt)
265                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266
267         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268
269         /* Socket identity is still unknown (sport may be zero).
 270          * However we set the state to SYN-SENT and, without releasing the
 271          * socket lock, select a source port, enter ourselves into the hash
 272          * tables and complete initialization after this.
273          */
274         tcp_set_state(sk, TCP_SYN_SENT);
275         err = inet_hash_connect(tcp_death_row, sk);
276         if (err)
277                 goto failure;
278
279         sk_set_txhash(sk);
280
281         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282                                inet->inet_sport, inet->inet_dport, sk);
283         if (IS_ERR(rt)) {
284                 err = PTR_ERR(rt);
285                 rt = NULL;
286                 goto failure;
287         }
288         /* OK, now commit destination to socket.  */
289         sk->sk_gso_type = SKB_GSO_TCPV4;
290         sk_setup_caps(sk, &rt->dst);
291         rt = NULL;
292
293         if (likely(!tp->repair)) {
294                 if (!tp->write_seq)
295                         WRITE_ONCE(tp->write_seq,
296                                    secure_tcp_seq(inet->inet_saddr,
297                                                   inet->inet_daddr,
298                                                   inet->inet_sport,
299                                                   usin->sin_port));
300                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301                                                  inet->inet_saddr,
302                                                  inet->inet_daddr);
303         }
304
305         inet->inet_id = prandom_u32();
306
307         if (tcp_fastopen_defer_connect(sk, &err))
308                 return err;
309         if (err)
310                 goto failure;
311
312         err = tcp_connect(sk);
313
314         if (err)
315                 goto failure;
316
317         return 0;
318
319 failure:
320         /*
321          * This unhashes the socket and releases the local port,
322          * if necessary.
323          */
324         tcp_set_state(sk, TCP_CLOSE);
325         ip_rt_put(rt);
326         sk->sk_route_caps = 0;
327         inet->inet_dport = 0;
328         return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
331
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334  * It can be called through tcp_release_cb() if socket was owned by user
335  * at the time tcp_v4_err() was called to handle ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339         struct inet_sock *inet = inet_sk(sk);
340         struct dst_entry *dst;
341         u32 mtu;
342
343         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344                 return;
345         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
346         dst = inet_csk_update_pmtu(sk, mtu);
347         if (!dst)
348                 return;
349
 350         /* Something is about to go wrong... Remember the soft error
 351          * in case this connection is not able to recover.
352          */
353         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354                 sk->sk_err_soft = EMSGSIZE;
355
356         mtu = dst_mtu(dst);
357
358         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359             ip_sk_accept_pmtu(sk) &&
360             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361                 tcp_sync_mss(sk, mtu);
362
363                 /* Resend the TCP packet because it's
364                  * clear that the old packet has been
365                  * dropped. This is the new "fast" path mtu
366                  * discovery.
367                  */
368                 tcp_simple_retransmit(sk);
369         } /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372
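/* ICMP redirect handling: if we still hold a cached route for this socket,
 * let its ->redirect() method switch it to the new next hop.
 */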
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375         struct dst_entry *dst = __sk_dst_check(sk, 0);
376
377         if (dst)
378                 dst->ops->redirect(dst, sk, skb);
379 }
380
381
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385         struct request_sock *req = inet_reqsk(sk);
386         struct net *net = sock_net(sk);
387
388         /* ICMPs are not backlogged, hence we cannot get
389          * an established socket here.
390          */
391         if (seq != tcp_rsk(req)->snt_isn) {
392                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393         } else if (abort) {
394                 /*
395                  * Still in SYN_RECV, just remove it silently.
396                  * There is no good way to pass the error to the newly
397                  * created socket, and POSIX does not want network
398                  * errors returned from accept().
399                  */
400                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401                 tcp_listendrop(req->rsk_listener);
402         }
403         reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410         struct inet_connection_sock *icsk = inet_csk(sk);
411         struct tcp_sock *tp = tcp_sk(sk);
412         struct sk_buff *skb;
413         s32 remaining;
414         u32 delta_us;
415
416         if (sock_owned_by_user(sk))
417                 return;
418
419         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420             !icsk->icsk_backoff)
421                 return;
422
423         skb = tcp_rtx_queue_head(sk);
424         if (WARN_ON_ONCE(!skb))
425                 return;
426
427         icsk->icsk_backoff--;
428         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430
431         tcp_mstamp_refresh(tp);
432         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434
435         if (remaining > 0) {
436                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437                                           remaining, TCP_RTO_MAX);
438         } else {
439                 /* RTO revert clocked out retransmission.
440                  * Will retransmit now.
441                  */
442                 tcp_retransmit_timer(sk);
443         }
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment
 452  * the header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else accesses the socket the ICMP is just dropped
457  * and for some paths there is no check at all.
 458  * A more general error queue, queueing errors for later handling,
 459  * would probably be better.
460  *
461  */
462
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465         const struct iphdr *iph = (const struct iphdr *)skb->data;
466         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467         struct tcp_sock *tp;
468         struct inet_sock *inet;
469         const int type = icmp_hdr(skb)->type;
470         const int code = icmp_hdr(skb)->code;
471         struct sock *sk;
472         struct request_sock *fastopen;
473         u32 seq, snd_una;
474         int err;
475         struct net *net = dev_net(skb->dev);
476
477         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478                                        th->dest, iph->saddr, ntohs(th->source),
479                                        inet_iif(skb), 0);
480         if (!sk) {
481                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482                 return -ENOENT;
483         }
484         if (sk->sk_state == TCP_TIME_WAIT) {
485                 inet_twsk_put(inet_twsk(sk));
486                 return 0;
487         }
488         seq = ntohl(th->seq);
489         if (sk->sk_state == TCP_NEW_SYN_RECV) {
490                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491                                      type == ICMP_TIME_EXCEEDED ||
492                                      (type == ICMP_DEST_UNREACH &&
493                                       (code == ICMP_NET_UNREACH ||
494                                        code == ICMP_HOST_UNREACH)));
495                 return 0;
496         }
497
498         bh_lock_sock(sk);
499         /* If too many ICMPs get dropped on busy
500          * servers this needs to be solved differently.
 501          * We do take care of the PMTU discovery (RFC 1191) special case:
 502          * we can receive locally generated ICMP messages while the socket is held.
503          */
504         if (sock_owned_by_user(sk)) {
505                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507         }
508         if (sk->sk_state == TCP_CLOSE)
509                 goto out;
510
511         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513                 goto out;
514         }
515
516         tp = tcp_sk(sk);
 517         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
518         fastopen = rcu_dereference(tp->fastopen_rsk);
519         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520         if (sk->sk_state != TCP_LISTEN &&
521             !between(seq, snd_una, tp->snd_nxt)) {
522                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523                 goto out;
524         }
525
526         switch (type) {
527         case ICMP_REDIRECT:
528                 if (!sock_owned_by_user(sk))
529                         do_redirect(skb, sk);
530                 goto out;
531         case ICMP_SOURCE_QUENCH:
532                 /* Just silently ignore these. */
533                 goto out;
534         case ICMP_PARAMETERPROB:
535                 err = EPROTO;
536                 break;
537         case ICMP_DEST_UNREACH:
538                 if (code > NR_ICMP_UNREACH)
539                         goto out;
540
541                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542                         /* We are not interested in TCP_LISTEN and open_requests
 543                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
544                          * they should go through unfragmented).
545                          */
546                         if (sk->sk_state == TCP_LISTEN)
547                                 goto out;
548
549                         WRITE_ONCE(tp->mtu_info, info);
550                         if (!sock_owned_by_user(sk)) {
551                                 tcp_v4_mtu_reduced(sk);
552                         } else {
553                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554                                         sock_hold(sk);
555                         }
556                         goto out;
557                 }
558
559                 err = icmp_err_convert[code].errno;
 560                 /* check if this ICMP message allows reverting the backoff.
561                  * (see RFC 6069)
562                  */
563                 if (!fastopen &&
564                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565                         tcp_ld_RTO_revert(sk, seq);
566                 break;
567         case ICMP_TIME_EXCEEDED:
568                 err = EHOSTUNREACH;
569                 break;
570         default:
571                 goto out;
572         }
573
574         switch (sk->sk_state) {
575         case TCP_SYN_SENT:
576         case TCP_SYN_RECV:
577                 /* Only in fast or simultaneous open. If a fast open socket is
578                  * already accepted it is treated as a connected one below.
579                  */
580                 if (fastopen && !fastopen->sk)
581                         break;
582
583                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584
585                 if (!sock_owned_by_user(sk)) {
586                         sk->sk_err = err;
587
588                         sk_error_report(sk);
589
590                         tcp_done(sk);
591                 } else {
592                         sk->sk_err_soft = err;
593                 }
594                 goto out;
595         }
596
597         /* If we've already connected we will keep trying
598          * until we time out, or the user gives up.
599          *
 600          * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
 601          * to be considered hard errors (well, FRAG_FAILED too,
 602          * but it is obsoleted by pmtu discovery).
 603          *
 604          * Note that in the modern internet, where routing is unreliable
 605          * and broken firewalls sit in every dark corner sending random
 606          * errors as ordered by their masters, even these two messages
 607          * finally lose their original sense (even Linux sends invalid PORT_UNREACHs).
608          *
609          * Now we are in compliance with RFCs.
610          *                                                      --ANK (980905)
611          */
612
613         inet = inet_sk(sk);
614         if (!sock_owned_by_user(sk) && inet->recverr) {
615                 sk->sk_err = err;
616                 sk_error_report(sk);
617         } else  { /* Only an error on timeout */
618                 sk->sk_err_soft = err;
619         }
620
621 out:
622         bh_unlock_sock(sk);
623         sock_put(sk);
624         return 0;
625 }
626
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 {
629         struct tcphdr *th = tcp_hdr(skb);
630
631         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632         skb->csum_start = skb_transport_header(skb) - skb->head;
633         skb->csum_offset = offsetof(struct tcphdr, check);
634 }
635
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 {
639         const struct inet_sock *inet = inet_sk(sk);
640
641         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 }
643 EXPORT_SYMBOL(tcp_v4_send_check);
644
645 /*
646  *      This routine will send an RST to the other tcp.
647  *
 648  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 649  *                    for the reset?
 650  *      Answer: if a packet caused the RST, it is not for a socket
 651  *              existing in our system; if it is matched to a socket,
 652  *              it is just a duplicate segment or a bug in the other side's TCP.
 653  *              So we build the reply based only on the parameters that
 654  *              arrived with the segment.
655  *      Exception: precedence violation. We do not implement it in any case.
656  */
657
658 #ifdef CONFIG_TCP_MD5SIG
659 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
660 #else
661 #define OPTION_BYTES sizeof(__be32)
662 #endif
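
/* Room for the largest TCP option the RST reply may carry: an MD5 signature
 * option when CONFIG_TCP_MD5SIG is enabled, otherwise a single 32-bit option
 * word (enough for the MPTCP reset option).
 */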
663
664 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
665 {
666         const struct tcphdr *th = tcp_hdr(skb);
667         struct {
668                 struct tcphdr th;
669                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
670         } rep;
671         struct ip_reply_arg arg;
672 #ifdef CONFIG_TCP_MD5SIG
673         struct tcp_md5sig_key *key = NULL;
674         const __u8 *hash_location = NULL;
675         unsigned char newhash[16];
676         int genhash;
677         struct sock *sk1 = NULL;
678 #endif
679         u64 transmit_time = 0;
680         struct sock *ctl_sk;
681         struct net *net;
682
683         /* Never send a reset in response to a reset. */
684         if (th->rst)
685                 return;
686
 687         /* If sk is not NULL, it means we did a successful lookup and the
 688          * incoming route had to be correct. prequeue might have dropped our dst.
689          */
690         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
691                 return;
692
693         /* Swap the send and the receive. */
694         memset(&rep, 0, sizeof(rep));
695         rep.th.dest   = th->source;
696         rep.th.source = th->dest;
697         rep.th.doff   = sizeof(struct tcphdr) / 4;
698         rep.th.rst    = 1;
699
700         if (th->ack) {
701                 rep.th.seq = th->ack_seq;
702         } else {
703                 rep.th.ack = 1;
704                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
705                                        skb->len - (th->doff << 2));
706         }
707
708         memset(&arg, 0, sizeof(arg));
709         arg.iov[0].iov_base = (unsigned char *)&rep;
710         arg.iov[0].iov_len  = sizeof(rep.th);
711
712         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
713 #ifdef CONFIG_TCP_MD5SIG
714         rcu_read_lock();
715         hash_location = tcp_parse_md5sig_option(th);
716         if (sk && sk_fullsock(sk)) {
717                 const union tcp_md5_addr *addr;
718                 int l3index;
719
 720                 /* If sdif is set, the packet ingressed via a device
721                  * in an L3 domain and inet_iif is set to it.
722                  */
723                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
724                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
725                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
726         } else if (hash_location) {
727                 const union tcp_md5_addr *addr;
728                 int sdif = tcp_v4_sdif(skb);
729                 int dif = inet_iif(skb);
730                 int l3index;
731
732                 /*
 733                  * The active side is lost. Try to find the listening socket
 734                  * through the source port, and then find the md5 key through
 735                  * the listening socket. We do not lose security here:
 736                  * the incoming packet is checked against the md5 hash of the
 737                  * key found; no RST is generated if the md5 hash doesn't match.
738                  */
739                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
740                                              ip_hdr(skb)->saddr,
741                                              th->source, ip_hdr(skb)->daddr,
742                                              ntohs(th->source), dif, sdif);
 743                 /* don't send an rst if we can't find the key */
744                 if (!sk1)
745                         goto out;
746
 747                 /* If sdif is set, the packet ingressed via a device
748                  * in an L3 domain and dif is set to it.
749                  */
750                 l3index = sdif ? dif : 0;
751                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
752                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
753                 if (!key)
754                         goto out;
755
756
757                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
758                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
759                         goto out;
760
761         }
762
763         if (key) {
764                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
765                                    (TCPOPT_NOP << 16) |
766                                    (TCPOPT_MD5SIG << 8) |
767                                    TCPOLEN_MD5SIG);
768                 /* Update length and the length the header thinks exists */
769                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
770                 rep.th.doff = arg.iov[0].iov_len / 4;
771
772                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
773                                      key, ip_hdr(skb)->saddr,
774                                      ip_hdr(skb)->daddr, &rep.th);
775         }
776 #endif
777         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
778         if (rep.opt[0] == 0) {
779                 __be32 mrst = mptcp_reset_option(skb);
780
781                 if (mrst) {
782                         rep.opt[0] = mrst;
783                         arg.iov[0].iov_len += sizeof(mrst);
784                         rep.th.doff = arg.iov[0].iov_len / 4;
785                 }
786         }
787
788         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
789                                       ip_hdr(skb)->saddr, /* XXX */
790                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
791         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
792         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
793
 794         /* When the socket is gone, all binding information is lost and
 795          * routing might fail. No choice here: if we choose to force the
 796          * input interface, we will misroute in case of an asymmetric route.
797          */
798         if (sk) {
799                 arg.bound_dev_if = sk->sk_bound_dev_if;
800                 if (sk_fullsock(sk))
801                         trace_tcp_send_reset(sk, skb);
802         }
803
804         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
805                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
806
807         arg.tos = ip_hdr(skb)->tos;
808         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
809         local_bh_disable();
810         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
811         if (sk) {
812                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
813                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
814                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
815                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
816                 transmit_time = tcp_transmit_time(sk);
817         }
818         ip_send_unicast_reply(ctl_sk,
819                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
820                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
821                               &arg, arg.iov[0].iov_len,
822                               transmit_time);
823
824         ctl_sk->sk_mark = 0;
825         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
826         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
827         local_bh_enable();
828
829 #ifdef CONFIG_TCP_MD5SIG
830 out:
831         rcu_read_unlock();
832 #endif
833 }
834
 835 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 836    outside of socket context, is certainly ugly. What can I do?
837  */
838
839 static void tcp_v4_send_ack(const struct sock *sk,
840                             struct sk_buff *skb, u32 seq, u32 ack,
841                             u32 win, u32 tsval, u32 tsecr, int oif,
842                             struct tcp_md5sig_key *key,
843                             int reply_flags, u8 tos)
844 {
845         const struct tcphdr *th = tcp_hdr(skb);
846         struct {
847                 struct tcphdr th;
848                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
849 #ifdef CONFIG_TCP_MD5SIG
850                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
851 #endif
852                         ];
853         } rep;
854         struct net *net = sock_net(sk);
855         struct ip_reply_arg arg;
856         struct sock *ctl_sk;
857         u64 transmit_time;
858
859         memset(&rep.th, 0, sizeof(struct tcphdr));
860         memset(&arg, 0, sizeof(arg));
861
862         arg.iov[0].iov_base = (unsigned char *)&rep;
863         arg.iov[0].iov_len  = sizeof(rep.th);
864         if (tsecr) {
865                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
866                                    (TCPOPT_TIMESTAMP << 8) |
867                                    TCPOLEN_TIMESTAMP);
868                 rep.opt[1] = htonl(tsval);
869                 rep.opt[2] = htonl(tsecr);
870                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
871         }
872
873         /* Swap the send and the receive. */
874         rep.th.dest    = th->source;
875         rep.th.source  = th->dest;
876         rep.th.doff    = arg.iov[0].iov_len / 4;
877         rep.th.seq     = htonl(seq);
878         rep.th.ack_seq = htonl(ack);
879         rep.th.ack     = 1;
880         rep.th.window  = htons(win);
881
882 #ifdef CONFIG_TCP_MD5SIG
883         if (key) {
884                 int offset = (tsecr) ? 3 : 0;
885
886                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
887                                           (TCPOPT_NOP << 16) |
888                                           (TCPOPT_MD5SIG << 8) |
889                                           TCPOLEN_MD5SIG);
890                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
891                 rep.th.doff = arg.iov[0].iov_len/4;
892
893                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
894                                     key, ip_hdr(skb)->saddr,
895                                     ip_hdr(skb)->daddr, &rep.th);
896         }
897 #endif
898         arg.flags = reply_flags;
899         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
900                                       ip_hdr(skb)->saddr, /* XXX */
901                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
902         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
903         if (oif)
904                 arg.bound_dev_if = oif;
905         arg.tos = tos;
906         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
907         local_bh_disable();
908         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
909         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
910                            inet_twsk(sk)->tw_mark : sk->sk_mark;
911         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
912                            inet_twsk(sk)->tw_priority : sk->sk_priority;
913         transmit_time = tcp_transmit_time(sk);
914         ip_send_unicast_reply(ctl_sk,
915                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
916                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
917                               &arg, arg.iov[0].iov_len,
918                               transmit_time);
919
920         ctl_sk->sk_mark = 0;
921         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
922         local_bh_enable();
923 }
924
925 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
926 {
927         struct inet_timewait_sock *tw = inet_twsk(sk);
928         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
929
930         tcp_v4_send_ack(sk, skb,
931                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
932                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
933                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
934                         tcptw->tw_ts_recent,
935                         tw->tw_bound_dev_if,
936                         tcp_twsk_md5_key(tcptw),
937                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
938                         tw->tw_tos
939                         );
940
941         inet_twsk_put(tw);
942 }
943
944 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
945                                   struct request_sock *req)
946 {
947         const union tcp_md5_addr *addr;
948         int l3index;
949
950         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
951          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
952          */
953         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
954                                              tcp_sk(sk)->snd_nxt;
955
956         /* RFC 7323 2.3
957          * The window field (SEG.WND) of every outgoing segment, with the
958          * exception of <SYN> segments, MUST be right-shifted by
959          * Rcv.Wind.Shift bits:
960          */
961         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
962         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
963         tcp_v4_send_ack(sk, skb, seq,
964                         tcp_rsk(req)->rcv_nxt,
965                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
966                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
967                         req->ts_recent,
968                         0,
969                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
970                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
971                         ip_hdr(skb)->tos);
972 }
973
974 /*
975  *      Send a SYN-ACK after having received a SYN.
976  *      This still operates on a request_sock only, not on a big
977  *      socket.
978  */
979 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
980                               struct flowi *fl,
981                               struct request_sock *req,
982                               struct tcp_fastopen_cookie *foc,
983                               enum tcp_synack_type synack_type,
984                               struct sk_buff *syn_skb)
985 {
986         const struct inet_request_sock *ireq = inet_rsk(req);
987         struct flowi4 fl4;
988         int err = -1;
989         struct sk_buff *skb;
990         u8 tos;
991
992         /* First, grab a route. */
993         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
994                 return -1;
995
996         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
997
998         if (skb) {
999                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1000
1001                 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1002                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1003                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1004                                 inet_sk(sk)->tos;
1005
1006                 if (!INET_ECN_is_capable(tos) &&
1007                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1008                         tos |= INET_ECN_ECT_0;
1009
1010                 rcu_read_lock();
1011                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1012                                             ireq->ir_rmt_addr,
1013                                             rcu_dereference(ireq->ireq_opt),
1014                                             tos);
1015                 rcu_read_unlock();
1016                 err = net_xmit_eval(err);
1017         }
1018
1019         return err;
1020 }
1021
1022 /*
1023  *      IPv4 request_sock destructor.
1024  */
1025 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1026 {
1027         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1028 }
1029
1030 #ifdef CONFIG_TCP_MD5SIG
1031 /*
1032  * RFC2385 MD5 checksumming requires a mapping of
1033  * IP address->MD5 Key.
1034  * We need to maintain these in the sk structure.
1035  */
1036
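/* Static key used so the receive fast path can skip MD5 lookups entirely
 * while no MD5 keys are in use on the system.
 */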
1037 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1038 EXPORT_SYMBOL(tcp_md5_needed);
1039
1040 /* Find the Key structure for an address.  */
1041 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1042                                            const union tcp_md5_addr *addr,
1043                                            int family)
1044 {
1045         const struct tcp_sock *tp = tcp_sk(sk);
1046         struct tcp_md5sig_key *key;
1047         const struct tcp_md5sig_info *md5sig;
1048         __be32 mask;
1049         struct tcp_md5sig_key *best_match = NULL;
1050         bool match;
1051
1052         /* caller either holds rcu_read_lock() or socket lock */
1053         md5sig = rcu_dereference_check(tp->md5sig_info,
1054                                        lockdep_sock_is_held(sk));
1055         if (!md5sig)
1056                 return NULL;
1057
1058         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1059                                  lockdep_sock_is_held(sk)) {
1060                 if (key->family != family)
1061                         continue;
1062                 if (key->l3index && key->l3index != l3index)
1063                         continue;
1064                 if (family == AF_INET) {
1065                         mask = inet_make_mask(key->prefixlen);
1066                         match = (key->addr.a4.s_addr & mask) ==
1067                                 (addr->a4.s_addr & mask);
1068 #if IS_ENABLED(CONFIG_IPV6)
1069                 } else if (family == AF_INET6) {
1070                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1071                                                   key->prefixlen);
1072 #endif
1073                 } else {
1074                         match = false;
1075                 }
1076
1077                 if (match && (!best_match ||
1078                               key->prefixlen > best_match->prefixlen))
1079                         best_match = key;
1080         }
1081         return best_match;
1082 }
1083 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1084
1085 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1086                                                       const union tcp_md5_addr *addr,
1087                                                       int family, u8 prefixlen,
1088                                                       int l3index)
1089 {
1090         const struct tcp_sock *tp = tcp_sk(sk);
1091         struct tcp_md5sig_key *key;
1092         unsigned int size = sizeof(struct in_addr);
1093         const struct tcp_md5sig_info *md5sig;
1094
1095         /* caller either holds rcu_read_lock() or socket lock */
1096         md5sig = rcu_dereference_check(tp->md5sig_info,
1097                                        lockdep_sock_is_held(sk));
1098         if (!md5sig)
1099                 return NULL;
1100 #if IS_ENABLED(CONFIG_IPV6)
1101         if (family == AF_INET6)
1102                 size = sizeof(struct in6_addr);
1103 #endif
1104         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1105                                  lockdep_sock_is_held(sk)) {
1106                 if (key->family != family)
1107                         continue;
1108                 if (key->l3index && key->l3index != l3index)
1109                         continue;
1110                 if (!memcmp(&key->addr, addr, size) &&
1111                     key->prefixlen == prefixlen)
1112                         return key;
1113         }
1114         return NULL;
1115 }
1116
1117 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1118                                          const struct sock *addr_sk)
1119 {
1120         const union tcp_md5_addr *addr;
1121         int l3index;
1122
1123         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1124                                                  addr_sk->sk_bound_dev_if);
1125         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1126         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1127 }
1128 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1129
1130 /* This can be called on a newly created socket, from other files */
1131 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1132                    int family, u8 prefixlen, int l3index,
1133                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1134 {
1135         /* Add Key to the list */
1136         struct tcp_md5sig_key *key;
1137         struct tcp_sock *tp = tcp_sk(sk);
1138         struct tcp_md5sig_info *md5sig;
1139
1140         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1141         if (key) {
1142                 /* Pre-existing entry - just update that one.
1143                  * Note that the key might be used concurrently.
 1144                  * data_race() is telling kcsan that we do not care about
 1145                  * key mismatches, since changing the MD5 key on live flows
1146                  * can lead to packet drops.
1147                  */
1148                 data_race(memcpy(key->key, newkey, newkeylen));
1149
1150                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
 1151                  * Also note that a reader could catch the new key->keylen value
 1152                  * but the old key->key[]; this is the reason we use __GFP_ZERO
1153                  * at sock_kmalloc() time below these lines.
1154                  */
1155                 WRITE_ONCE(key->keylen, newkeylen);
1156
1157                 return 0;
1158         }
1159
1160         md5sig = rcu_dereference_protected(tp->md5sig_info,
1161                                            lockdep_sock_is_held(sk));
1162         if (!md5sig) {
1163                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1164                 if (!md5sig)
1165                         return -ENOMEM;
1166
1167                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1168                 INIT_HLIST_HEAD(&md5sig->head);
1169                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1170         }
1171
1172         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1173         if (!key)
1174                 return -ENOMEM;
1175         if (!tcp_alloc_md5sig_pool()) {
1176                 sock_kfree_s(sk, key, sizeof(*key));
1177                 return -ENOMEM;
1178         }
1179
1180         memcpy(key->key, newkey, newkeylen);
1181         key->keylen = newkeylen;
1182         key->family = family;
1183         key->prefixlen = prefixlen;
1184         key->l3index = l3index;
1185         memcpy(&key->addr, addr,
1186                (family == AF_INET6) ? sizeof(struct in6_addr) :
1187                                       sizeof(struct in_addr));
1188         hlist_add_head_rcu(&key->node, &md5sig->head);
1189         return 0;
1190 }
1191 EXPORT_SYMBOL(tcp_md5_do_add);
1192
1193 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1194                    u8 prefixlen, int l3index)
1195 {
1196         struct tcp_md5sig_key *key;
1197
1198         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1199         if (!key)
1200                 return -ENOENT;
1201         hlist_del_rcu(&key->node);
1202         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1203         kfree_rcu(key, rcu);
1204         return 0;
1205 }
1206 EXPORT_SYMBOL(tcp_md5_do_del);
1207
1208 static void tcp_clear_md5_list(struct sock *sk)
1209 {
1210         struct tcp_sock *tp = tcp_sk(sk);
1211         struct tcp_md5sig_key *key;
1212         struct hlist_node *n;
1213         struct tcp_md5sig_info *md5sig;
1214
1215         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1216
1217         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1218                 hlist_del_rcu(&key->node);
1219                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1220                 kfree_rcu(key, rcu);
1221         }
1222 }
1223
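/* Keys are configured from userspace with the TCP_MD5SIG / TCP_MD5SIG_EXT
 * socket options.  A minimal sketch (IPv4, error handling omitted; the
 * address and key below are illustrative only):
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */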
1224 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1225                                  sockptr_t optval, int optlen)
1226 {
1227         struct tcp_md5sig cmd;
1228         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1229         const union tcp_md5_addr *addr;
1230         u8 prefixlen = 32;
1231         int l3index = 0;
1232
1233         if (optlen < sizeof(cmd))
1234                 return -EINVAL;
1235
1236         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1237                 return -EFAULT;
1238
1239         if (sin->sin_family != AF_INET)
1240                 return -EINVAL;
1241
1242         if (optname == TCP_MD5SIG_EXT &&
1243             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1244                 prefixlen = cmd.tcpm_prefixlen;
1245                 if (prefixlen > 32)
1246                         return -EINVAL;
1247         }
1248
1249         if (optname == TCP_MD5SIG_EXT &&
1250             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1251                 struct net_device *dev;
1252
1253                 rcu_read_lock();
1254                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1255                 if (dev && netif_is_l3_master(dev))
1256                         l3index = dev->ifindex;
1257
1258                 rcu_read_unlock();
1259
 1260                 /* ok to reference set/not-set outside of rcu;
 1261                  * right now the device MUST be an L3 master
1262                  */
1263                 if (!dev || !l3index)
1264                         return -EINVAL;
1265         }
1266
1267         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1268
1269         if (!cmd.tcpm_keylen)
1270                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1271
1272         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1273                 return -EINVAL;
1274
1275         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1276                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1277 }
1278
1279 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1280                                    __be32 daddr, __be32 saddr,
1281                                    const struct tcphdr *th, int nbytes)
1282 {
1283         struct tcp4_pseudohdr *bp;
1284         struct scatterlist sg;
1285         struct tcphdr *_th;
1286
1287         bp = hp->scratch;
1288         bp->saddr = saddr;
1289         bp->daddr = daddr;
1290         bp->pad = 0;
1291         bp->protocol = IPPROTO_TCP;
1292         bp->len = cpu_to_be16(nbytes);
1293
1294         _th = (struct tcphdr *)(bp + 1);
1295         memcpy(_th, th, sizeof(*th));
1296         _th->check = 0;
1297
1298         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1299         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1300                                 sizeof(*bp) + sizeof(*th));
1301         return crypto_ahash_update(hp->md5_req);
1302 }
1303
1304 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1305                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1306 {
1307         struct tcp_md5sig_pool *hp;
1308         struct ahash_request *req;
1309
1310         hp = tcp_get_md5sig_pool();
1311         if (!hp)
1312                 goto clear_hash_noput;
1313         req = hp->md5_req;
1314
1315         if (crypto_ahash_init(req))
1316                 goto clear_hash;
1317         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1318                 goto clear_hash;
1319         if (tcp_md5_hash_key(hp, key))
1320                 goto clear_hash;
1321         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1322         if (crypto_ahash_final(req))
1323                 goto clear_hash;
1324
1325         tcp_put_md5sig_pool();
1326         return 0;
1327
1328 clear_hash:
1329         tcp_put_md5sig_pool();
1330 clear_hash_noput:
1331         memset(md5_hash, 0, 16);
1332         return 1;
1333 }
1334
1335 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1336                         const struct sock *sk,
1337                         const struct sk_buff *skb)
1338 {
1339         struct tcp_md5sig_pool *hp;
1340         struct ahash_request *req;
1341         const struct tcphdr *th = tcp_hdr(skb);
1342         __be32 saddr, daddr;
1343
1344         if (sk) { /* valid for establish/request sockets */
1345                 saddr = sk->sk_rcv_saddr;
1346                 daddr = sk->sk_daddr;
1347         } else {
1348                 const struct iphdr *iph = ip_hdr(skb);
1349                 saddr = iph->saddr;
1350                 daddr = iph->daddr;
1351         }
1352
1353         hp = tcp_get_md5sig_pool();
1354         if (!hp)
1355                 goto clear_hash_noput;
1356         req = hp->md5_req;
1357
1358         if (crypto_ahash_init(req))
1359                 goto clear_hash;
1360
1361         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1362                 goto clear_hash;
1363         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1364                 goto clear_hash;
1365         if (tcp_md5_hash_key(hp, key))
1366                 goto clear_hash;
1367         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1368         if (crypto_ahash_final(req))
1369                 goto clear_hash;
1370
1371         tcp_put_md5sig_pool();
1372         return 0;
1373
1374 clear_hash:
1375         tcp_put_md5sig_pool();
1376 clear_hash_noput:
1377         memset(md5_hash, 0, 16);
1378         return 1;
1379 }
1380 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1381
1382 #endif
1383
1384 /* Called with rcu_read_lock() */
1385 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1386                                     const struct sk_buff *skb,
1387                                     int dif, int sdif)
1388 {
1389 #ifdef CONFIG_TCP_MD5SIG
1390         /*
1391          * This gets called for each TCP segment that arrives
1392          * so we want to be efficient.
1393          * We have 3 drop cases:
1394          * o No MD5 hash and one expected.
1395          * o MD5 hash and we're not expecting one.
1396          * o MD5 hash and it's wrong.
1397          */
1398         const __u8 *hash_location = NULL;
1399         struct tcp_md5sig_key *hash_expected;
1400         const struct iphdr *iph = ip_hdr(skb);
1401         const struct tcphdr *th = tcp_hdr(skb);
1402         const union tcp_md5_addr *addr;
1403         unsigned char newhash[16];
1404         int genhash, l3index;
1405
1406         /* sdif set means the packet ingressed via a device
1407          * in an L3 domain and dif is set to the l3mdev
1408          */
1409         l3index = sdif ? dif : 0;
1410
1411         addr = (union tcp_md5_addr *)&iph->saddr;
1412         hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1413         hash_location = tcp_parse_md5sig_option(th);
1414
1415         /* We've parsed the options - do we have a hash? */
1416         if (!hash_expected && !hash_location)
1417                 return false;
1418
1419         if (hash_expected && !hash_location) {
1420                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1421                 return true;
1422         }
1423
1424         if (!hash_expected && hash_location) {
1425                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1426                 return true;
1427         }
1428
1429         /* Okay, so both hash_expected and hash_location are set -
1430          * we need to calculate the hash and compare it.
1431          */
1432         genhash = tcp_v4_md5_hash_skb(newhash,
1433                                       hash_expected,
1434                                       NULL, skb);
1435
1436         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1437                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1438                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1439                                      &iph->saddr, ntohs(th->source),
1440                                      &iph->daddr, ntohs(th->dest),
1441                                      genhash ? " tcp_v4_calc_md5_hash failed"
1442                                      : "", l3index);
1443                 return true;
1444         }
1445         return false;
1446 #endif
1447         return false;
1448 }
1449
1450 static void tcp_v4_init_req(struct request_sock *req,
1451                             const struct sock *sk_listener,
1452                             struct sk_buff *skb)
1453 {
1454         struct inet_request_sock *ireq = inet_rsk(req);
1455         struct net *net = sock_net(sk_listener);
1456
1457         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1458         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1459         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1460 }
1461
1462 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1463                                           struct sk_buff *skb,
1464                                           struct flowi *fl,
1465                                           struct request_sock *req)
1466 {
1467         tcp_v4_init_req(req, sk, skb);
1468
1469         if (security_inet_conn_request(sk, skb, req))
1470                 return NULL;
1471
1472         return inet_csk_route_req(sk, &fl->u.ip4, req);
1473 }
1474
1475 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1476         .family         =       PF_INET,
1477         .obj_size       =       sizeof(struct tcp_request_sock),
1478         .rtx_syn_ack    =       tcp_rtx_synack,
1479         .send_ack       =       tcp_v4_reqsk_send_ack,
1480         .destructor     =       tcp_v4_reqsk_destructor,
1481         .send_reset     =       tcp_v4_send_reset,
1482         .syn_ack_timeout =      tcp_syn_ack_timeout,
1483 };
1484
1485 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1486         .mss_clamp      =       TCP_MSS_DEFAULT,
1487 #ifdef CONFIG_TCP_MD5SIG
1488         .req_md5_lookup =       tcp_v4_md5_lookup,
1489         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1490 #endif
1491 #ifdef CONFIG_SYN_COOKIES
1492         .cookie_init_seq =      cookie_v4_init_sequence,
1493 #endif
1494         .route_req      =       tcp_v4_route_req,
1495         .init_seq       =       tcp_v4_init_seq,
1496         .init_ts_off    =       tcp_v4_init_ts_off,
1497         .send_synack    =       tcp_v4_send_synack,
1498 };
1499
1500 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1501 {
1502         /* Never answer SYNs sent to broadcast or multicast */
1503         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1504                 goto drop;
1505
1506         return tcp_conn_request(&tcp_request_sock_ops,
1507                                 &tcp_request_sock_ipv4_ops, sk, skb);
1508
1509 drop:
1510         tcp_listendrop(sk);
1511         return 0;
1512 }
1513 EXPORT_SYMBOL(tcp_v4_conn_request);
1514
1515
1516 /*
1517  * The three way handshake has completed - we got a valid synack -
1518  * now create the new socket.
1519  */
1520 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1521                                   struct request_sock *req,
1522                                   struct dst_entry *dst,
1523                                   struct request_sock *req_unhash,
1524                                   bool *own_req)
1525 {
1526         struct inet_request_sock *ireq;
1527         bool found_dup_sk = false;
1528         struct inet_sock *newinet;
1529         struct tcp_sock *newtp;
1530         struct sock *newsk;
1531 #ifdef CONFIG_TCP_MD5SIG
1532         const union tcp_md5_addr *addr;
1533         struct tcp_md5sig_key *key;
1534         int l3index;
1535 #endif
1536         struct ip_options_rcu *inet_opt;
1537
1538         if (sk_acceptq_is_full(sk))
1539                 goto exit_overflow;
1540
1541         newsk = tcp_create_openreq_child(sk, req, skb);
1542         if (!newsk)
1543                 goto exit_nonewsk;
1544
1545         newsk->sk_gso_type = SKB_GSO_TCPV4;
1546         inet_sk_rx_dst_set(newsk, skb);
1547
1548         newtp                 = tcp_sk(newsk);
1549         newinet               = inet_sk(newsk);
1550         ireq                  = inet_rsk(req);
1551         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1552         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1553         newsk->sk_bound_dev_if = ireq->ir_iif;
1554         newinet->inet_saddr   = ireq->ir_loc_addr;
1555         inet_opt              = rcu_dereference(ireq->ireq_opt);
1556         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1557         newinet->mc_index     = inet_iif(skb);
1558         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1559         newinet->rcv_tos      = ip_hdr(skb)->tos;
1560         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1561         if (inet_opt)
1562                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1563         newinet->inet_id = prandom_u32();
1564
1565         /* Set ToS of the new socket based upon the value of incoming SYN.
1566          * ECT bits are set later in tcp_init_transfer().
1567          */
1568         if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1569                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1570
1571         if (!dst) {
1572                 dst = inet_csk_route_child_sock(sk, newsk, req);
1573                 if (!dst)
1574                         goto put_and_exit;
1575         } else {
1576                 /* syncookie case : see end of cookie_v4_check() */
1577         }
1578         sk_setup_caps(newsk, dst);
1579
1580         tcp_ca_openreq_child(newsk, dst);
1581
1582         tcp_sync_mss(newsk, dst_mtu(dst));
1583         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1584
1585         tcp_initialize_rcv_mss(newsk);
1586
1587 #ifdef CONFIG_TCP_MD5SIG
1588         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1589         /* Copy over the MD5 key from the original socket */
1590         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1591         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1592         if (key) {
1593                 /*
1594                  * We're using one, so create a matching key
1595                  * on the newsk structure. If we fail to get
1596                  * memory, then we end up not copying the key
1597                  * across. Shucks.
1598                  */
1599                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1600                                key->key, key->keylen, GFP_ATOMIC);
1601                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1602         }
1603 #endif
1604
1605         if (__inet_inherit_port(sk, newsk) < 0)
1606                 goto put_and_exit;
1607         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1608                                        &found_dup_sk);
1609         if (likely(*own_req)) {
1610                 tcp_move_syn(newtp, req);
1611                 ireq->ireq_opt = NULL;
1612         } else {
1613                 newinet->inet_opt = NULL;
1614
1615                 if (!req_unhash && found_dup_sk) {
1616                         /* This code path should be executed only in the
1617                          * syncookie case.
1618                          */
1619                         bh_unlock_sock(newsk);
1620                         sock_put(newsk);
1621                         newsk = NULL;
1622                 }
1623         }
1624         return newsk;
1625
1626 exit_overflow:
1627         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1628 exit_nonewsk:
1629         dst_release(dst);
1630 exit:
1631         tcp_listendrop(sk);
1632         return NULL;
1633 put_and_exit:
1634         newinet->inet_opt = NULL;
1635         inet_csk_prepare_forced_close(newsk);
1636         tcp_done(newsk);
1637         goto exit;
1638 }
1639 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1640
1641 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1642 {
1643 #ifdef CONFIG_SYN_COOKIES
1644         const struct tcphdr *th = tcp_hdr(skb);
1645
1646         if (!th->syn)
1647                 sk = cookie_v4_check(sk, skb);
1648 #endif
1649         return sk;
1650 }
1651
1652 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1653                          struct tcphdr *th, u32 *cookie)
1654 {
1655         u16 mss = 0;
1656 #ifdef CONFIG_SYN_COOKIES
1657         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1658                                     &tcp_request_sock_ipv4_ops, sk, th);
1659         if (mss) {
1660                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1661                 tcp_synq_overflow(sk);
1662         }
1663 #endif
1664         return mss;
1665 }
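
/* (Editor's note.)  Whether cookies are produced at all is governed by the
 * per-netns syncookies sysctl, initialized to 1 in tcp_sk_init() below and
 * inspectable from a shell, e.g.:
 *
 *	sysctl net.ipv4.tcp_syncookies
 */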
1666
1667 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1668                                                            u32));
1669 /* The socket must have its spinlock held when we get
1670  * here, unless it is a TCP_LISTEN socket.
1671  *
1672  * We have a potential double-lock case here, so even when
1673  * doing backlog processing we use the BH locking scheme.
1674  * This is because we cannot sleep with the original spinlock
1675  * held.
1676  */
1677 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1678 {
1679         struct sock *rsk;
1680
1681         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1682                 struct dst_entry *dst = sk->sk_rx_dst;
1683
1684                 sock_rps_save_rxhash(sk, skb);
1685                 sk_mark_napi_id(sk, skb);
1686                 if (dst) {
1687                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1688                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1689                                              dst, 0)) {
1690                                 dst_release(dst);
1691                                 sk->sk_rx_dst = NULL;
1692                         }
1693                 }
1694                 tcp_rcv_established(sk, skb);
1695                 return 0;
1696         }
1697
1698         if (tcp_checksum_complete(skb))
1699                 goto csum_err;
1700
1701         if (sk->sk_state == TCP_LISTEN) {
1702                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1703
1704                 if (!nsk)
1705                         goto discard;
1706                 if (nsk != sk) {
1707                         if (tcp_child_process(sk, nsk, skb)) {
1708                                 rsk = nsk;
1709                                 goto reset;
1710                         }
1711                         return 0;
1712                 }
1713         } else
1714                 sock_rps_save_rxhash(sk, skb);
1715
1716         if (tcp_rcv_state_process(sk, skb)) {
1717                 rsk = sk;
1718                 goto reset;
1719         }
1720         return 0;
1721
1722 reset:
1723         tcp_v4_send_reset(rsk, skb);
1724 discard:
1725         kfree_skb(skb);
1726         /* Be careful here. If this function gets more complicated and
1727          * gcc suffers from register pressure on the x86, sk (in %ebx)
1728          * might be destroyed here. This current version compiles correctly,
1729          * but you have been warned.
1730          */
1731         return 0;
1732
1733 csum_err:
1734         trace_tcp_bad_csum(skb);
1735         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1736         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1737         goto discard;
1738 }
1739 EXPORT_SYMBOL(tcp_v4_do_rcv);
1740
1741 int tcp_v4_early_demux(struct sk_buff *skb)
1742 {
1743         const struct iphdr *iph;
1744         const struct tcphdr *th;
1745         struct sock *sk;
1746
1747         if (skb->pkt_type != PACKET_HOST)
1748                 return 0;
1749
1750         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1751                 return 0;
1752
1753         iph = ip_hdr(skb);
1754         th = tcp_hdr(skb);
1755
1756         if (th->doff < sizeof(struct tcphdr) / 4)
1757                 return 0;
1758
1759         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1760                                        iph->saddr, th->source,
1761                                        iph->daddr, ntohs(th->dest),
1762                                        skb->skb_iif, inet_sdif(skb));
1763         if (sk) {
1764                 skb->sk = sk;
1765                 skb->destructor = sock_edemux;
1766                 if (sk_fullsock(sk)) {
1767                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1768
1769                         if (dst)
1770                                 dst = dst_check(dst, 0);
1771                         if (dst &&
1772                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1773                                 skb_dst_set_noref(skb, dst);
1774                 }
1775         }
1776         return 0;
1777 }
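
/* (Editor's note.)  Early demux can be toggled at runtime; workloads that
 * mostly forward traffic sometimes disable it, e.g.:
 *
 *	sysctl -w net.ipv4.tcp_early_demux=0
 */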
1778
1779 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1780 {
1781         u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1782         u32 tail_gso_size, tail_gso_segs;
1783         struct skb_shared_info *shinfo;
1784         const struct tcphdr *th;
1785         struct tcphdr *thtail;
1786         struct sk_buff *tail;
1787         unsigned int hdrlen;
1788         bool fragstolen;
1789         u32 gso_segs;
1790         u32 gso_size;
1791         int delta;
1792
1793         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1794          * we can fix skb->truesize to its real value to avoid future drops.
1795          * This is valid because skb is not yet charged to the socket.
1796          * It has been noticed that pure SACK packets were sometimes dropped
1797          * (if cooked by drivers without the copybreak feature).
1798          */
1799         skb_condense(skb);
1800
1801         skb_dst_drop(skb);
1802
1803         if (unlikely(tcp_checksum_complete(skb))) {
1804                 bh_unlock_sock(sk);
1805                 trace_tcp_bad_csum(skb);
1806                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1807                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1808                 return true;
1809         }
1810
1811         /* Attempt coalescing to last skb in backlog, even if we are
1812          * above the limits.
1813          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1814          */
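        /* Coalescing below requires the new segment to be contiguous with
         * the tail (tail end_seq == skb seq), to carry the same DSCP/ECN
         * byte and the same ECE/CWR flags, to have ACK set and SYN/RST/URG
         * clear on both skbs, and to use an identical TCP header (same doff
         * and option bytes).
         */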
1815         th = (const struct tcphdr *)skb->data;
1816         hdrlen = th->doff * 4;
1817
1818         tail = sk->sk_backlog.tail;
1819         if (!tail)
1820                 goto no_coalesce;
1821         thtail = (struct tcphdr *)tail->data;
1822
1823         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1824             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1825             ((TCP_SKB_CB(tail)->tcp_flags |
1826               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1827             !((TCP_SKB_CB(tail)->tcp_flags &
1828               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1829             ((TCP_SKB_CB(tail)->tcp_flags ^
1830               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1831 #ifdef CONFIG_TLS_DEVICE
1832             tail->decrypted != skb->decrypted ||
1833 #endif
1834             thtail->doff != th->doff ||
1835             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1836                 goto no_coalesce;
1837
1838         __skb_pull(skb, hdrlen);
1839
1840         shinfo = skb_shinfo(skb);
1841         gso_size = shinfo->gso_size ?: skb->len;
1842         gso_segs = shinfo->gso_segs ?: 1;
1843
1844         shinfo = skb_shinfo(tail);
1845         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1846         tail_gso_segs = shinfo->gso_segs ?: 1;
1847
1848         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1849                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1850
1851                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1852                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1853                         thtail->window = th->window;
1854                 }
1855
1856                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1857                  * thtail->fin, so that the fast path in tcp_rcv_established()
1858                  * is not entered if we append a packet with a FIN.
1859                  * SYN, RST, URG are not present.
1860                  * ACK is set on both packets.
1861                  * PSH : we do not really care in TCP stack,
1862                  *       at least for 'GRO' packets.
1863                  */
1864                 thtail->fin |= th->fin;
1865                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1866
1867                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1868                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1869                         tail->tstamp = skb->tstamp;
1870                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1871                 }
1872
1873                 /* Not as strict as GRO. We only need to carry mss max value */
1874                 shinfo->gso_size = max(gso_size, tail_gso_size);
1875                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1876
1877                 sk->sk_backlog.len += delta;
1878                 __NET_INC_STATS(sock_net(sk),
1879                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1880                 kfree_skb_partial(skb, fragstolen);
1881                 return false;
1882         }
1883         __skb_push(skb, hdrlen);
1884
1885 no_coalesce:
1886         /* Only the socket owner can try to collapse/prune rx queues
1887          * to reduce memory overhead, so add a little headroom here.
1888          * Only a few socket backlogs are likely to be non-empty at the same time.
1889          */
1890         limit += 64*1024;
1891
1892         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1893                 bh_unlock_sock(sk);
1894                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1895                 return true;
1896         }
1897         return false;
1898 }
1899 EXPORT_SYMBOL(tcp_add_backlog);
1900
1901 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1902 {
1903         struct tcphdr *th = (struct tcphdr *)skb->data;
1904
1905         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1906 }
1907 EXPORT_SYMBOL(tcp_filter);
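
/* (Editor's sketch.)  tcp_filter() runs the socket filter, if any, with a
 * trim cap of the TCP header length, so a filter may shorten the payload
 * but can never cut into the header.  A classic-BPF filter that truncates
 * accepted packets to 64 bytes could be attached from userspace roughly as
 * follows (illustrative only):
 *
 *	struct sock_filter code[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 64 },	// accept, keep 64 bytes
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = code };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */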
1908
1909 static void tcp_v4_restore_cb(struct sk_buff *skb)
1910 {
1911         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1912                 sizeof(struct inet_skb_parm));
1913 }
1914
1915 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1916                            const struct tcphdr *th)
1917 {
1918         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1919          * barrier() makes sure the compiler won't play fool^Waliasing games.
1920          */
1921         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1922                 sizeof(struct inet_skb_parm));
1923         barrier();
1924
1925         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1926         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1927                                     skb->len - th->doff * 4);
1928         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1929         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1930         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1931         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1932         TCP_SKB_CB(skb)->sacked  = 0;
1933         TCP_SKB_CB(skb)->has_rxtstamp =
1934                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1935 }
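
/* (Editor's example.)  end_seq above counts the SYN and FIN flags as one
 * sequence number each, in addition to the payload.  A segment with
 * seq = 1000, 100 bytes of payload and FIN set therefore gets:
 *
 *	end_seq = 1000 + 0 (syn) + 1 (fin) + 100 = 1101
 */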
1936
1937 /*
1938  *      From tcp_input.c
1939  */
1940
1941 int tcp_v4_rcv(struct sk_buff *skb)
1942 {
1943         struct net *net = dev_net(skb->dev);
1944         struct sk_buff *skb_to_free;
1945         int sdif = inet_sdif(skb);
1946         int dif = inet_iif(skb);
1947         const struct iphdr *iph;
1948         const struct tcphdr *th;
1949         bool refcounted;
1950         struct sock *sk;
1951         int ret;
1952
1953         if (skb->pkt_type != PACKET_HOST)
1954                 goto discard_it;
1955
1956         /* Count it even if it's bad */
1957         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1958
1959         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1960                 goto discard_it;
1961
1962         th = (const struct tcphdr *)skb->data;
1963
1964         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1965                 goto bad_packet;
1966         if (!pskb_may_pull(skb, th->doff * 4))
1967                 goto discard_it;
1968
1969         /* An explanation is required here, I think.
1970          * Packet length and doff are validated by header prediction,
1971          * provided the case of th->doff == 0 is eliminated.
1972          * So, we defer the checks. */
1973
1974         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1975                 goto csum_error;
1976
1977         th = (const struct tcphdr *)skb->data;
1978         iph = ip_hdr(skb);
1979 lookup:
1980         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1981                                th->dest, sdif, &refcounted);
1982         if (!sk)
1983                 goto no_tcp_socket;
1984
1985 process:
1986         if (sk->sk_state == TCP_TIME_WAIT)
1987                 goto do_time_wait;
1988
1989         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1990                 struct request_sock *req = inet_reqsk(sk);
1991                 bool req_stolen = false;
1992                 struct sock *nsk;
1993
1994                 sk = req->rsk_listener;
1995                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1996                         sk_drops_add(sk, skb);
1997                         reqsk_put(req);
1998                         goto discard_it;
1999                 }
2000                 if (tcp_checksum_complete(skb)) {
2001                         reqsk_put(req);
2002                         goto csum_error;
2003                 }
2004                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2005                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2006                         if (!nsk) {
2007                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2008                                 goto lookup;
2009                         }
2010                         sk = nsk;
2011                         /* reuseport_migrate_sock() has already taken one sk_refcnt
2012                          * reference before returning.
2013                          */
2014                 } else {
2015                         /* We own a reference on the listener, increase it again
2016                          * as we might lose it too soon.
2017                          */
2018                         sock_hold(sk);
2019                 }
2020                 refcounted = true;
2021                 nsk = NULL;
2022                 if (!tcp_filter(sk, skb)) {
2023                         th = (const struct tcphdr *)skb->data;
2024                         iph = ip_hdr(skb);
2025                         tcp_v4_fill_cb(skb, iph, th);
2026                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2027                 }
2028                 if (!nsk) {
2029                         reqsk_put(req);
2030                         if (req_stolen) {
2031                                 /* Another cpu got exclusive access to req
2032                                  * and created a full-blown socket.
2033                                  * Try to feed this packet to this socket
2034                                  * instead of discarding it.
2035                                  */
2036                                 tcp_v4_restore_cb(skb);
2037                                 sock_put(sk);
2038                                 goto lookup;
2039                         }
2040                         goto discard_and_relse;
2041                 }
2042                 if (nsk == sk) {
2043                         reqsk_put(req);
2044                         tcp_v4_restore_cb(skb);
2045                 } else if (tcp_child_process(sk, nsk, skb)) {
2046                         tcp_v4_send_reset(nsk, skb);
2047                         goto discard_and_relse;
2048                 } else {
2049                         sock_put(sk);
2050                         return 0;
2051                 }
2052         }
2053         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2054                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2055                 goto discard_and_relse;
2056         }
2057
2058         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2059                 goto discard_and_relse;
2060
2061         if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2062                 goto discard_and_relse;
2063
2064         nf_reset_ct(skb);
2065
2066         if (tcp_filter(sk, skb))
2067                 goto discard_and_relse;
2068         th = (const struct tcphdr *)skb->data;
2069         iph = ip_hdr(skb);
2070         tcp_v4_fill_cb(skb, iph, th);
2071
2072         skb->dev = NULL;
2073
2074         if (sk->sk_state == TCP_LISTEN) {
2075                 ret = tcp_v4_do_rcv(sk, skb);
2076                 goto put_and_return;
2077         }
2078
2079         sk_incoming_cpu_update(sk);
2080
2081         bh_lock_sock_nested(sk);
2082         tcp_segs_in(tcp_sk(sk), skb);
2083         ret = 0;
2084         if (!sock_owned_by_user(sk)) {
2085                 skb_to_free = sk->sk_rx_skb_cache;
2086                 sk->sk_rx_skb_cache = NULL;
2087                 ret = tcp_v4_do_rcv(sk, skb);
2088         } else {
2089                 if (tcp_add_backlog(sk, skb))
2090                         goto discard_and_relse;
2091                 skb_to_free = NULL;
2092         }
2093         bh_unlock_sock(sk);
2094         if (skb_to_free)
2095                 __kfree_skb(skb_to_free);
2096
2097 put_and_return:
2098         if (refcounted)
2099                 sock_put(sk);
2100
2101         return ret;
2102
2103 no_tcp_socket:
2104         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2105                 goto discard_it;
2106
2107         tcp_v4_fill_cb(skb, iph, th);
2108
2109         if (tcp_checksum_complete(skb)) {
2110 csum_error:
2111                 trace_tcp_bad_csum(skb);
2112                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2113 bad_packet:
2114                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2115         } else {
2116                 tcp_v4_send_reset(NULL, skb);
2117         }
2118
2119 discard_it:
2120         /* Discard frame. */
2121         kfree_skb(skb);
2122         return 0;
2123
2124 discard_and_relse:
2125         sk_drops_add(sk, skb);
2126         if (refcounted)
2127                 sock_put(sk);
2128         goto discard_it;
2129
2130 do_time_wait:
2131         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2132                 inet_twsk_put(inet_twsk(sk));
2133                 goto discard_it;
2134         }
2135
2136         tcp_v4_fill_cb(skb, iph, th);
2137
2138         if (tcp_checksum_complete(skb)) {
2139                 inet_twsk_put(inet_twsk(sk));
2140                 goto csum_error;
2141         }
2142         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2143         case TCP_TW_SYN: {
2144                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2145                                                         &tcp_hashinfo, skb,
2146                                                         __tcp_hdrlen(th),
2147                                                         iph->saddr, th->source,
2148                                                         iph->daddr, th->dest,
2149                                                         inet_iif(skb),
2150                                                         sdif);
2151                 if (sk2) {
2152                         inet_twsk_deschedule_put(inet_twsk(sk));
2153                         sk = sk2;
2154                         tcp_v4_restore_cb(skb);
2155                         refcounted = false;
2156                         goto process;
2157                 }
2158         }
2159                 /* to ACK */
2160                 fallthrough;
2161         case TCP_TW_ACK:
2162                 tcp_v4_timewait_ack(sk, skb);
2163                 break;
2164         case TCP_TW_RST:
2165                 tcp_v4_send_reset(sk, skb);
2166                 inet_twsk_deschedule_put(inet_twsk(sk));
2167                 goto discard_it;
2168         case TCP_TW_SUCCESS:;
2169         }
2170         goto discard_it;
2171 }
2172
2173 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2174         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2175         .twsk_unique    = tcp_twsk_unique,
2176         .twsk_destructor= tcp_twsk_destructor,
2177 };
2178
2179 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2180 {
2181         struct dst_entry *dst = skb_dst(skb);
2182
2183         if (dst && dst_hold_safe(dst)) {
2184                 sk->sk_rx_dst = dst;
2185                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2186         }
2187 }
2188 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2189
2190 const struct inet_connection_sock_af_ops ipv4_specific = {
2191         .queue_xmit        = ip_queue_xmit,
2192         .send_check        = tcp_v4_send_check,
2193         .rebuild_header    = inet_sk_rebuild_header,
2194         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2195         .conn_request      = tcp_v4_conn_request,
2196         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2197         .net_header_len    = sizeof(struct iphdr),
2198         .setsockopt        = ip_setsockopt,
2199         .getsockopt        = ip_getsockopt,
2200         .addr2sockaddr     = inet_csk_addr2sockaddr,
2201         .sockaddr_len      = sizeof(struct sockaddr_in),
2202         .mtu_reduced       = tcp_v4_mtu_reduced,
2203 };
2204 EXPORT_SYMBOL(ipv4_specific);
2205
2206 #ifdef CONFIG_TCP_MD5SIG
2207 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2208         .md5_lookup             = tcp_v4_md5_lookup,
2209         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2210         .md5_parse              = tcp_v4_parse_md5_keys,
2211 };
2212 #endif
2213
2214 /* NOTE: A lot of things are set to zero explicitly by the call to
2215  *       sk_alloc(), so they need not be done here.
2216  */
2217 static int tcp_v4_init_sock(struct sock *sk)
2218 {
2219         struct inet_connection_sock *icsk = inet_csk(sk);
2220
2221         tcp_init_sock(sk);
2222
2223         icsk->icsk_af_ops = &ipv4_specific;
2224
2225 #ifdef CONFIG_TCP_MD5SIG
2226         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2227 #endif
2228
2229         return 0;
2230 }
2231
2232 void tcp_v4_destroy_sock(struct sock *sk)
2233 {
2234         struct tcp_sock *tp = tcp_sk(sk);
2235
2236         trace_tcp_destroy_sock(sk);
2237
2238         tcp_clear_xmit_timers(sk);
2239
2240         tcp_cleanup_congestion_control(sk);
2241
2242         tcp_cleanup_ulp(sk);
2243
2244         /* Clean up the write buffer. */
2245         tcp_write_queue_purge(sk);
2246
2247         /* Check if we want to disable active TFO */
2248         tcp_fastopen_active_disable_ofo_check(sk);
2249
2250         /* Cleans up our, hopefully empty, out_of_order_queue. */
2251         skb_rbtree_purge(&tp->out_of_order_queue);
2252
2253 #ifdef CONFIG_TCP_MD5SIG
2254         /* Clean up the MD5 key list, if any */
2255         if (tp->md5sig_info) {
2256                 tcp_clear_md5_list(sk);
2257                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2258                 tp->md5sig_info = NULL;
2259         }
2260 #endif
2261
2262         /* Clean up a referenced TCP bind bucket. */
2263         if (inet_csk(sk)->icsk_bind_hash)
2264                 inet_put_port(sk);
2265
2266         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2267
2268         /* If socket is aborted during connect operation */
2269         tcp_free_fastopen_req(tp);
2270         tcp_fastopen_destroy_cipher(sk);
2271         tcp_saved_syn_free(tp);
2272
2273         sk_sockets_allocated_dec(sk);
2274 }
2275 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2276
2277 #ifdef CONFIG_PROC_FS
2278 /* Proc filesystem TCP sock list dumping. */
2279
2280 /*
2281  * Get the next listener socket following cur.  If cur is NULL, get the first socket
2282  * starting from bucket given in st->bucket; when st->bucket is zero the
2283  * very first socket in the hash table is returned.
2284  */
2285 static void *listening_get_next(struct seq_file *seq, void *cur)
2286 {
2287         struct tcp_seq_afinfo *afinfo;
2288         struct tcp_iter_state *st = seq->private;
2289         struct net *net = seq_file_net(seq);
2290         struct inet_listen_hashbucket *ilb;
2291         struct hlist_nulls_node *node;
2292         struct sock *sk = cur;
2293
2294         if (st->bpf_seq_afinfo)
2295                 afinfo = st->bpf_seq_afinfo;
2296         else
2297                 afinfo = PDE_DATA(file_inode(seq->file));
2298
2299         if (!sk) {
2300 get_head:
2301                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2302                 spin_lock(&ilb->lock);
2303                 sk = sk_nulls_head(&ilb->nulls_head);
2304                 st->offset = 0;
2305                 goto get_sk;
2306         }
2307         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2308         ++st->num;
2309         ++st->offset;
2310
2311         sk = sk_nulls_next(sk);
2312 get_sk:
2313         sk_nulls_for_each_from(sk, node) {
2314                 if (!net_eq(sock_net(sk), net))
2315                         continue;
2316                 if (afinfo->family == AF_UNSPEC ||
2317                     sk->sk_family == afinfo->family)
2318                         return sk;
2319         }
2320         spin_unlock(&ilb->lock);
2321         st->offset = 0;
2322         if (++st->bucket < INET_LHTABLE_SIZE)
2323                 goto get_head;
2324         return NULL;
2325 }
2326
2327 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2328 {
2329         struct tcp_iter_state *st = seq->private;
2330         void *rc;
2331
2332         st->bucket = 0;
2333         st->offset = 0;
2334         rc = listening_get_next(seq, NULL);
2335
2336         while (rc && *pos) {
2337                 rc = listening_get_next(seq, rc);
2338                 --*pos;
2339         }
2340         return rc;
2341 }
2342
2343 static inline bool empty_bucket(const struct tcp_iter_state *st)
2344 {
2345         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2346 }
2347
2348 /*
2349  * Get the first established socket, starting from the bucket given in st->bucket.
2350  * If st->bucket is zero, the very first socket in the hash is returned.
2351  */
2352 static void *established_get_first(struct seq_file *seq)
2353 {
2354         struct tcp_seq_afinfo *afinfo;
2355         struct tcp_iter_state *st = seq->private;
2356         struct net *net = seq_file_net(seq);
2357         void *rc = NULL;
2358
2359         if (st->bpf_seq_afinfo)
2360                 afinfo = st->bpf_seq_afinfo;
2361         else
2362                 afinfo = PDE_DATA(file_inode(seq->file));
2363
2364         st->offset = 0;
2365         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2366                 struct sock *sk;
2367                 struct hlist_nulls_node *node;
2368                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2369
2370                 /* Lockless fast path for the common case of empty buckets */
2371                 if (empty_bucket(st))
2372                         continue;
2373
2374                 spin_lock_bh(lock);
2375                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2376                         if ((afinfo->family != AF_UNSPEC &&
2377                              sk->sk_family != afinfo->family) ||
2378                             !net_eq(sock_net(sk), net)) {
2379                                 continue;
2380                         }
2381                         rc = sk;
2382                         goto out;
2383                 }
2384                 spin_unlock_bh(lock);
2385         }
2386 out:
2387         return rc;
2388 }
2389
2390 static void *established_get_next(struct seq_file *seq, void *cur)
2391 {
2392         struct tcp_seq_afinfo *afinfo;
2393         struct sock *sk = cur;
2394         struct hlist_nulls_node *node;
2395         struct tcp_iter_state *st = seq->private;
2396         struct net *net = seq_file_net(seq);
2397
2398         if (st->bpf_seq_afinfo)
2399                 afinfo = st->bpf_seq_afinfo;
2400         else
2401                 afinfo = PDE_DATA(file_inode(seq->file));
2402
2403         ++st->num;
2404         ++st->offset;
2405
2406         sk = sk_nulls_next(sk);
2407
2408         sk_nulls_for_each_from(sk, node) {
2409                 if ((afinfo->family == AF_UNSPEC ||
2410                      sk->sk_family == afinfo->family) &&
2411                     net_eq(sock_net(sk), net))
2412                         return sk;
2413         }
2414
2415         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2416         ++st->bucket;
2417         return established_get_first(seq);
2418 }
2419
2420 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2421 {
2422         struct tcp_iter_state *st = seq->private;
2423         void *rc;
2424
2425         st->bucket = 0;
2426         rc = established_get_first(seq);
2427
2428         while (rc && pos) {
2429                 rc = established_get_next(seq, rc);
2430                 --pos;
2431         }
2432         return rc;
2433 }
2434
2435 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2436 {
2437         void *rc;
2438         struct tcp_iter_state *st = seq->private;
2439
2440         st->state = TCP_SEQ_STATE_LISTENING;
2441         rc        = listening_get_idx(seq, &pos);
2442
2443         if (!rc) {
2444                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2445                 rc        = established_get_idx(seq, pos);
2446         }
2447
2448         return rc;
2449 }
2450
2451 static void *tcp_seek_last_pos(struct seq_file *seq)
2452 {
2453         struct tcp_iter_state *st = seq->private;
2454         int offset = st->offset;
2455         int orig_num = st->num;
2456         void *rc = NULL;
2457
2458         switch (st->state) {
2459         case TCP_SEQ_STATE_LISTENING:
2460                 if (st->bucket >= INET_LHTABLE_SIZE)
2461                         break;
2462                 st->state = TCP_SEQ_STATE_LISTENING;
2463                 rc = listening_get_next(seq, NULL);
2464                 while (offset-- && rc)
2465                         rc = listening_get_next(seq, rc);
2466                 if (rc)
2467                         break;
2468                 st->bucket = 0;
2469                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2470                 fallthrough;
2471         case TCP_SEQ_STATE_ESTABLISHED:
2472                 if (st->bucket > tcp_hashinfo.ehash_mask)
2473                         break;
2474                 rc = established_get_first(seq);
2475                 while (offset-- && rc)
2476                         rc = established_get_next(seq, rc);
2477         }
2478
2479         st->num = orig_num;
2480
2481         return rc;
2482 }
2483
2484 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2485 {
2486         struct tcp_iter_state *st = seq->private;
2487         void *rc;
2488
2489         if (*pos && *pos == st->last_pos) {
2490                 rc = tcp_seek_last_pos(seq);
2491                 if (rc)
2492                         goto out;
2493         }
2494
2495         st->state = TCP_SEQ_STATE_LISTENING;
2496         st->num = 0;
2497         st->bucket = 0;
2498         st->offset = 0;
2499         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2500
2501 out:
2502         st->last_pos = *pos;
2503         return rc;
2504 }
2505 EXPORT_SYMBOL(tcp_seq_start);
2506
2507 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2508 {
2509         struct tcp_iter_state *st = seq->private;
2510         void *rc = NULL;
2511
2512         if (v == SEQ_START_TOKEN) {
2513                 rc = tcp_get_idx(seq, 0);
2514                 goto out;
2515         }
2516
2517         switch (st->state) {
2518         case TCP_SEQ_STATE_LISTENING:
2519                 rc = listening_get_next(seq, v);
2520                 if (!rc) {
2521                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2522                         st->bucket = 0;
2523                         st->offset = 0;
2524                         rc        = established_get_first(seq);
2525                 }
2526                 break;
2527         case TCP_SEQ_STATE_ESTABLISHED:
2528                 rc = established_get_next(seq, v);
2529                 break;
2530         }
2531 out:
2532         ++*pos;
2533         st->last_pos = *pos;
2534         return rc;
2535 }
2536 EXPORT_SYMBOL(tcp_seq_next);
2537
2538 void tcp_seq_stop(struct seq_file *seq, void *v)
2539 {
2540         struct tcp_iter_state *st = seq->private;
2541
2542         switch (st->state) {
2543         case TCP_SEQ_STATE_LISTENING:
2544                 if (v != SEQ_START_TOKEN)
2545                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2546                 break;
2547         case TCP_SEQ_STATE_ESTABLISHED:
2548                 if (v)
2549                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2550                 break;
2551         }
2552 }
2553 EXPORT_SYMBOL(tcp_seq_stop);
2554
2555 static void get_openreq4(const struct request_sock *req,
2556                          struct seq_file *f, int i)
2557 {
2558         const struct inet_request_sock *ireq = inet_rsk(req);
2559         long delta = req->rsk_timer.expires - jiffies;
2560
2561         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2562                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2563                 i,
2564                 ireq->ir_loc_addr,
2565                 ireq->ir_num,
2566                 ireq->ir_rmt_addr,
2567                 ntohs(ireq->ir_rmt_port),
2568                 TCP_SYN_RECV,
2569                 0, 0, /* could print option size, but that is af dependent. */
2570                 1,    /* timers active (only the expire timer) */
2571                 jiffies_delta_to_clock_t(delta),
2572                 req->num_timeout,
2573                 from_kuid_munged(seq_user_ns(f),
2574                                  sock_i_uid(req->rsk_listener)),
2575                 0,  /* non standard timer */
2576                 0, /* open_requests have no inode */
2577                 0,
2578                 req);
2579 }
2580
2581 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2582 {
2583         int timer_active;
2584         unsigned long timer_expires;
2585         const struct tcp_sock *tp = tcp_sk(sk);
2586         const struct inet_connection_sock *icsk = inet_csk(sk);
2587         const struct inet_sock *inet = inet_sk(sk);
2588         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2589         __be32 dest = inet->inet_daddr;
2590         __be32 src = inet->inet_rcv_saddr;
2591         __u16 destp = ntohs(inet->inet_dport);
2592         __u16 srcp = ntohs(inet->inet_sport);
2593         int rx_queue;
2594         int state;
2595
2596         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2597             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2598             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2599                 timer_active    = 1;
2600                 timer_expires   = icsk->icsk_timeout;
2601         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2602                 timer_active    = 4;
2603                 timer_expires   = icsk->icsk_timeout;
2604         } else if (timer_pending(&sk->sk_timer)) {
2605                 timer_active    = 2;
2606                 timer_expires   = sk->sk_timer.expires;
2607         } else {
2608                 timer_active    = 0;
2609                 timer_expires = jiffies;
2610         }
2611
2612         state = inet_sk_state_load(sk);
2613         if (state == TCP_LISTEN)
2614                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2615         else
2616                 /* Because we don't lock the socket,
2617                  * we might find a transient negative value.
2618                  */
2619                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2620                                       READ_ONCE(tp->copied_seq), 0);
2621
2622         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2623                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2624                 i, src, srcp, dest, destp, state,
2625                 READ_ONCE(tp->write_seq) - tp->snd_una,
2626                 rx_queue,
2627                 timer_active,
2628                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2629                 icsk->icsk_retransmits,
2630                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2631                 icsk->icsk_probes_out,
2632                 sock_i_ino(sk),
2633                 refcount_read(&sk->sk_refcnt), sk,
2634                 jiffies_to_clock_t(icsk->icsk_rto),
2635                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2636                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2637                 tp->snd_cwnd,
2638                 state == TCP_LISTEN ?
2639                     fastopenq->max_qlen :
2640                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2641 }
2642
2643 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2644                                struct seq_file *f, int i)
2645 {
2646         long delta = tw->tw_timer.expires - jiffies;
2647         __be32 dest, src;
2648         __u16 destp, srcp;
2649
2650         dest  = tw->tw_daddr;
2651         src   = tw->tw_rcv_saddr;
2652         destp = ntohs(tw->tw_dport);
2653         srcp  = ntohs(tw->tw_sport);
2654
2655         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2656                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2657                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2658                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2659                 refcount_read(&tw->tw_refcnt), tw);
2660 }
2661
2662 #define TMPSZ 150
2663
2664 static int tcp4_seq_show(struct seq_file *seq, void *v)
2665 {
2666         struct tcp_iter_state *st;
2667         struct sock *sk = v;
2668
2669         seq_setwidth(seq, TMPSZ - 1);
2670         if (v == SEQ_START_TOKEN) {
2671                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2672                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2673                            "inode");
2674                 goto out;
2675         }
2676         st = seq->private;
2677
2678         if (sk->sk_state == TCP_TIME_WAIT)
2679                 get_timewait4_sock(v, seq, st->num);
2680         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2681                 get_openreq4(v, seq, st->num);
2682         else
2683                 get_tcp4_sock(v, seq, st->num);
2684 out:
2685         seq_pad(seq, '\n');
2686         return 0;
2687 }
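
/* (Editor's example.)  The seq_file above backs /proc/net/tcp.  With the
 * format used in get_tcp4_sock(), a socket listening on 127.0.0.1:8080
 * shows up roughly as (fields beyond the state are illustrative):
 *
 *	$ cat /proc/net/tcp
 *	  sl  local_address rem_address   st ...
 *	   0: 0100007F:1F90 00000000:0000 0A ...
 *
 * Addresses are __be32 values printed with %08X (hence byte-swapped on
 * little-endian hosts), ports are host-order hex, and 0A is TCP_LISTEN.
 */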
2688
2689 #ifdef CONFIG_BPF_SYSCALL
2690 struct bpf_iter__tcp {
2691         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2692         __bpf_md_ptr(struct sock_common *, sk_common);
2693         uid_t uid __aligned(8);
2694 };
2695
2696 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2697                              struct sock_common *sk_common, uid_t uid)
2698 {
2699         struct bpf_iter__tcp ctx;
2700
2701         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2702         ctx.meta = meta;
2703         ctx.sk_common = sk_common;
2704         ctx.uid = uid;
2705         return bpf_iter_run_prog(prog, &ctx);
2706 }
2707
2708 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2709 {
2710         struct bpf_iter_meta meta;
2711         struct bpf_prog *prog;
2712         struct sock *sk = v;
2713         uid_t uid;
2714
2715         if (v == SEQ_START_TOKEN)
2716                 return 0;
2717
2718         if (sk->sk_state == TCP_TIME_WAIT) {
2719                 uid = 0;
2720         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2721                 const struct request_sock *req = v;
2722
2723                 uid = from_kuid_munged(seq_user_ns(seq),
2724                                        sock_i_uid(req->rsk_listener));
2725         } else {
2726                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2727         }
2728
2729         meta.seq = seq;
2730         prog = bpf_iter_get_info(&meta, false);
2731         return tcp_prog_seq_show(prog, &meta, v, uid);
2732 }
2733
2734 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2735 {
2736         struct bpf_iter_meta meta;
2737         struct bpf_prog *prog;
2738
2739         if (!v) {
2740                 meta.seq = seq;
2741                 prog = bpf_iter_get_info(&meta, true);
2742                 if (prog)
2743                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
2744         }
2745
2746         tcp_seq_stop(seq, v);
2747 }
2748
2749 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2750         .show           = bpf_iter_tcp_seq_show,
2751         .start          = tcp_seq_start,
2752         .next           = tcp_seq_next,
2753         .stop           = bpf_iter_tcp_seq_stop,
2754 };
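
/* (Editor's sketch, heavily hedged.)  A BPF iterator attaches to this
 * seq_file through the "iter/tcp" section; the program and tooling below
 * follow the usual libbpf/selftest conventions and are illustrative only:
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family %d uid %u\n",
 *			       skc->skc_family, ctx->uid);
 *		return 0;
 *	}
 *
 * Once loaded, such an iterator is typically pinned and read like a file,
 * e.g. "bpftool iter pin prog.o /sys/fs/bpf/tcp && cat /sys/fs/bpf/tcp".
 */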
2755 #endif
2756
2757 static const struct seq_operations tcp4_seq_ops = {
2758         .show           = tcp4_seq_show,
2759         .start          = tcp_seq_start,
2760         .next           = tcp_seq_next,
2761         .stop           = tcp_seq_stop,
2762 };
2763
2764 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2765         .family         = AF_INET,
2766 };
2767
2768 static int __net_init tcp4_proc_init_net(struct net *net)
2769 {
2770         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2771                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2772                 return -ENOMEM;
2773         return 0;
2774 }
2775
2776 static void __net_exit tcp4_proc_exit_net(struct net *net)
2777 {
2778         remove_proc_entry("tcp", net->proc_net);
2779 }
2780
2781 static struct pernet_operations tcp4_net_ops = {
2782         .init = tcp4_proc_init_net,
2783         .exit = tcp4_proc_exit_net,
2784 };
2785
2786 int __init tcp4_proc_init(void)
2787 {
2788         return register_pernet_subsys(&tcp4_net_ops);
2789 }
2790
2791 void tcp4_proc_exit(void)
2792 {
2793         unregister_pernet_subsys(&tcp4_net_ops);
2794 }
2795 #endif /* CONFIG_PROC_FS */
2796
2797 /* @wake is one when sk_stream_write_space() calls us.
2798  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
2799  * This mimics the strategy used in sock_def_write_space().
2800  */
2801 bool tcp_stream_memory_free(const struct sock *sk, int wake)
2802 {
2803         const struct tcp_sock *tp = tcp_sk(sk);
2804         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
2805                             READ_ONCE(tp->snd_nxt);
2806
2807         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
2808 }
2809 EXPORT_SYMBOL(tcp_stream_memory_free);
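
/* (Editor's example.)  With the default limit of UINT_MAX (see
 * sysctl_tcp_notsent_lowat in tcp_sk_init() below) the test above is
 * effectively always true.  If an application lowers the limit, e.g.:
 *
 *	int lowat = 128 * 1024;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
 *
 * then with wake == 1 the socket is reported writable only while fewer
 * than 64 KB remain unsent, since (notsent_bytes << 1) must stay below
 * the 128 KB low-water mark.
 */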
2810
2811 struct proto tcp_prot = {
2812         .name                   = "TCP",
2813         .owner                  = THIS_MODULE,
2814         .close                  = tcp_close,
2815         .pre_connect            = tcp_v4_pre_connect,
2816         .connect                = tcp_v4_connect,
2817         .disconnect             = tcp_disconnect,
2818         .accept                 = inet_csk_accept,
2819         .ioctl                  = tcp_ioctl,
2820         .init                   = tcp_v4_init_sock,
2821         .destroy                = tcp_v4_destroy_sock,
2822         .shutdown               = tcp_shutdown,
2823         .setsockopt             = tcp_setsockopt,
2824         .getsockopt             = tcp_getsockopt,
2825         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
2826         .keepalive              = tcp_set_keepalive,
2827         .recvmsg                = tcp_recvmsg,
2828         .sendmsg                = tcp_sendmsg,
2829         .sendpage               = tcp_sendpage,
2830         .backlog_rcv            = tcp_v4_do_rcv,
2831         .release_cb             = tcp_release_cb,
2832         .hash                   = inet_hash,
2833         .unhash                 = inet_unhash,
2834         .get_port               = inet_csk_get_port,
2835 #ifdef CONFIG_BPF_SYSCALL
2836         .psock_update_sk_prot   = tcp_bpf_update_proto,
2837 #endif
2838         .enter_memory_pressure  = tcp_enter_memory_pressure,
2839         .leave_memory_pressure  = tcp_leave_memory_pressure,
2840         .stream_memory_free     = tcp_stream_memory_free,
2841         .sockets_allocated      = &tcp_sockets_allocated,
2842         .orphan_count           = &tcp_orphan_count,
2843         .memory_allocated       = &tcp_memory_allocated,
2844         .memory_pressure        = &tcp_memory_pressure,
2845         .sysctl_mem             = sysctl_tcp_mem,
2846         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2847         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2848         .max_header             = MAX_TCP_HEADER,
2849         .obj_size               = sizeof(struct tcp_sock),
2850         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2851         .twsk_prot              = &tcp_timewait_sock_ops,
2852         .rsk_prot               = &tcp_request_sock_ops,
2853         .h.hashinfo             = &tcp_hashinfo,
2854         .no_autobind            = true,
2855         .diag_destroy           = tcp_abort,
2856 };
2857 EXPORT_SYMBOL(tcp_prot);
2858
2859 static void __net_exit tcp_sk_exit(struct net *net)
2860 {
2861         int cpu;
2862
2863         if (net->ipv4.tcp_congestion_control)
2864                 bpf_module_put(net->ipv4.tcp_congestion_control,
2865                                net->ipv4.tcp_congestion_control->owner);
2866
2867         for_each_possible_cpu(cpu)
2868                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2869         free_percpu(net->ipv4.tcp_sk);
2870 }
2871
2872 static int __net_init tcp_sk_init(struct net *net)
2873 {
2874         int res, cpu, cnt;
2875
2876         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2877         if (!net->ipv4.tcp_sk)
2878                 return -ENOMEM;
2879
2880         for_each_possible_cpu(cpu) {
2881                 struct sock *sk;
2882
2883                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2884                                            IPPROTO_TCP, net);
2885                 if (res)
2886                         goto fail;
2887                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2888
2889                 /* Enforce IP_DF and IPID==0 for RSTs and ACKs sent
2890                  * in SYN-RECV and TIME-WAIT state.
2891                  */
2892                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2893
2894                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2895         }
2896
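        /*
         * Everything below sets the per-namespace defaults behind the
         * /proc/sys/net/ipv4/tcp_* sysctls (registered in
         * net/ipv4/sysctl_net_ipv4.c), so a new netns starts from these
         * values rather than from its parent's runtime settings; the
         * rmem/wmem arrays copied from init_net further down are the
         * exception.  For example, tcp_ecn == 2 means "answer ECN when
         * the peer requests it, but do not request it on outgoing
         * connections".
         */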
2897         net->ipv4.sysctl_tcp_ecn = 2;
2898         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2899
2900         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2901         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2902         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2903         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2904         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2905
2906         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2907         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2908         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2909
2910         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2911         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2912         net->ipv4.sysctl_tcp_syncookies = 1;
2913         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2914         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2915         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2916         net->ipv4.sysctl_tcp_orphan_retries = 0;
2917         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2918         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
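        /* tcp_tw_reuse == 2: allow TIME-WAIT reuse for loopback traffic only */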
2919         net->ipv4.sysctl_tcp_tw_reuse = 2;
2920         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2921
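        /*
         * Scale the TIME-WAIT and SYN-backlog limits with the size of the
         * established hash table: cnt is the number of ehash buckets, so
         * e.g. a 512k-bucket table allows up to 256k TIME-WAIT sockets and
         * gets a default SYN backlog of max(128, cnt / 128) == 4096.
         */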
2922         cnt = tcp_hashinfo.ehash_mask + 1;
2923         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2924         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2925
2926         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2927         net->ipv4.sysctl_tcp_sack = 1;
2928         net->ipv4.sysctl_tcp_window_scaling = 1;
2929         net->ipv4.sysctl_tcp_timestamps = 1;
2930         net->ipv4.sysctl_tcp_early_retrans = 3;
2931         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2932         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC 2861 behavior.  */
2933         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2934         net->ipv4.sysctl_tcp_max_reordering = 300;
2935         net->ipv4.sysctl_tcp_dsack = 1;
2936         net->ipv4.sysctl_tcp_app_win = 31;
2937         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2938         net->ipv4.sysctl_tcp_frto = 2;
2939         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2940         /* This limits the percentage of the congestion window which we
2941          * will allow a single TSO frame to consume.  Building TSO frames
2942          * which are too large can cause TCP streams to be bursty.
2943          */
2944         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2945         /* Default TSQ (TCP Small Queues) limit: 16 * 64 KB TSO segments, i.e. 1 MB */
2946         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2947         /* RFC 5961 challenge ACK rate limiting */
2948         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2949         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2950         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2951         net->ipv4.sysctl_tcp_autocorking = 1;
2952         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2953         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2954         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
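        /*
         * Unlike the scalar defaults above, the rmem/wmem triplets of a
         * child namespace are copied from init_net, so runtime tuning of
         * net.ipv4.tcp_rmem / tcp_wmem in the initial namespace carries
         * over to namespaces created afterwards; init_net itself keeps the
         * defaults established in tcp_init().
         */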
2955         if (net != &init_net) {
2956                 memcpy(net->ipv4.sysctl_tcp_rmem,
2957                        init_net.ipv4.sysctl_tcp_rmem,
2958                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2959                 memcpy(net->ipv4.sysctl_tcp_wmem,
2960                        init_net.ipv4.sysctl_tcp_wmem,
2961                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2962         }
2963         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2964         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2965         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
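        /*
         * Fast Open defaults: TFO_CLIENT_ENABLE corresponds to
         * net.ipv4.tcp_fastopen == 1, i.e. client-side TFO is available
         * while the server side still has to be enabled explicitly (sysctl
         * bit 0x2 or the TCP_FASTOPEN socket option on the listener).  The
         * blackhole timeout pauses active TFO for at least an hour after
         * suspected middlebox drops.
         */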
2966         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2967         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2968         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2969         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2970
2971         /* Inherit init_net's congestion control when possible; Reno is always built in and is the fallback */
2972         if (!net_eq(net, &init_net) &&
2973             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2974                                init_net.ipv4.tcp_congestion_control->owner))
2975                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2976         else
2977                 net->ipv4.tcp_congestion_control = &tcp_reno;
2978
2979         return 0;
2980 fail:
2981         tcp_sk_exit(net);
2982
2983         return res;
2984 }
2985
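/*
 * The batched exit runs once per list of dying namespaces: a single
 * inet_twsk_purge() walk evicts every IPv4 TIME-WAIT socket still owned
 * by one of them (cheaper than a purge per netns), then each namespace's
 * TCP Fast Open key context is torn down.
 */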
2986 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2987 {
2988         struct net *net;
2989
2990         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2991
2992         list_for_each_entry(net, net_exit_list, exit_list)
2993                 tcp_fastopen_ctx_destroy(net);
2994 }
2995
2996 static struct pernet_operations __net_initdata tcp_sk_ops = {
2997        .init       = tcp_sk_init,
2998        .exit       = tcp_sk_exit,
2999        .exit_batch = tcp_sk_exit_batch,
3000 };
3001
3002 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3003 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3004                      struct sock_common *sk_common, uid_t uid)
3005
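/*
 * Unlike the per-family /proc/net/tcp and /proc/net/tcp6 seq_files, the
 * BPF iterator's private afinfo is set to AF_UNSPEC so a single iterator
 * walks both address families.  GFP_USER | __GFP_NOWARN is presumably
 * used because the allocation is triggered from a userspace attach and a
 * failure is already reported to the caller as -ENOMEM.
 */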
3006 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3007 {
3008         struct tcp_iter_state *st = priv_data;
3009         struct tcp_seq_afinfo *afinfo;
3010         int ret;
3011
3012         afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
3013         if (!afinfo)
3014                 return -ENOMEM;
3015
3016         afinfo->family = AF_UNSPEC;
3017         st->bpf_seq_afinfo = afinfo;
3018         ret = bpf_iter_init_seq_net(priv_data, aux);
3019         if (ret)
3020                 kfree(afinfo);
3021         return ret;
3022 }
3023
3024 static void bpf_iter_fini_tcp(void *priv_data)
3025 {
3026         struct tcp_iter_state *st = priv_data;
3027
3028         kfree(st->bpf_seq_afinfo);
3029         bpf_iter_fini_seq_net(priv_data);
3030 }
3031
3032 static const struct bpf_iter_seq_info tcp_seq_info = {
3033         .seq_ops                = &bpf_iter_tcp_seq_ops,
3034         .init_seq_private       = bpf_iter_init_tcp,
3035         .fini_seq_private       = bpf_iter_fini_tcp,
3036         .seq_priv_size          = sizeof(struct tcp_iter_state),
3037 };
3038
3039 static struct bpf_iter_reg tcp_reg_info = {
3040         .target                 = "tcp",
3041         .ctx_arg_info_size      = 1,
3042         .ctx_arg_info           = {
3043                 { offsetof(struct bpf_iter__tcp, sk_common),
3044                   PTR_TO_BTF_ID_OR_NULL },
3045         },
3046         .seq_info               = &tcp_seq_info,
3047 };
3048
3049 static void __init bpf_iter_register(void)
3050 {
3051         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3052         if (bpf_iter_reg_target(&tcp_reg_info))
3053                 pr_warn("Warning: could not register bpf iterator tcp\n");
3054 }
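/*
 * A minimal, illustrative BPF iterator program for the "tcp" target
 * registered above, in the style of the in-tree selftest
 * bpf_iter_tcp4.c (the program name and output format here are
 * arbitrary):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family %u uid %u\n",
 *			       skc->skc_family, ctx->uid);
 *		return 0;
 *	}
 *
 * Reading the iterator's pinned file (or the fd returned by
 * bpf_iter_create()) then produces one line per TCP socket via the
 * seq_file operations above.
 */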
3055
3056 #endif
3057
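/*
 * Called from inet_init() during boot.  The panic() on failure is
 * deliberate: without the per-cpu control sockets created by
 * tcp_sk_init(), the stack could not even generate RSTs or TIME-WAIT
 * ACKs for the initial namespace.
 */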
3058 void __init tcp_v4_init(void)
3059 {
3060         if (register_pernet_subsys(&tcp_sk_ops))
3061                 panic("Failed to create the TCP control socket.\n");
3062
3063 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3064         bpf_iter_register();
3065 #endif
3066 }