Merge tag 'overflow-v5.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees...
[linux-2.6-microblaze.git] / net / ipv4 / tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96         return secure_tcp_seq(ip_hdr(skb)->daddr,
97                               ip_hdr(skb)->saddr,
98                               tcp_hdr(skb)->dest,
99                               tcp_hdr(skb)->source);
100 }
101
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109         const struct inet_timewait_sock *tw = inet_twsk(sktw);
110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111         struct tcp_sock *tp = tcp_sk(sk);
112         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113
114         if (reuse == 2) {
115                 /* Still does not detect *everything* that goes through
116                  * lo, since we require a loopback src or dst address
117                  * or direct binding to 'lo' interface.
118                  */
119                 bool loopback = false;
120                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121                         loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123                 if (tw->tw_family == AF_INET6) {
124                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128                                 loopback = true;
129                 } else
130 #endif
131                 {
132                         if (ipv4_is_loopback(tw->tw_daddr) ||
133                             ipv4_is_loopback(tw->tw_rcv_saddr))
134                                 loopback = true;
135                 }
136                 if (!loopback)
137                         reuse = 0;
138         }
139
140         /* With PAWS, it is safe from the viewpoint
141            of data integrity. Even without PAWS it is safe provided sequence
142            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143
144            Actually, the idea is close to VJ's one, only timestamp cache is
145            held not per host, but per port pair and TW bucket is used as state
146            holder.
147
148            If TW bucket has been already destroyed we fall back to VJ's scheme
149            and use initial timestamp retrieved from peer table.
150          */
151         if (tcptw->tw_ts_recent_stamp &&
152             (!twp || (reuse && time_after32(ktime_get_seconds(),
153                                             tcptw->tw_ts_recent_stamp)))) {
154                 /* In case of repair and re-using TIME-WAIT sockets we still
155                  * want to be sure that it is safe as above but honor the
156                  * sequence numbers and time stamps set as part of the repair
157                  * process.
158                  *
159                  * Without this check re-using a TIME-WAIT socket with TCP
160                  * repair would accumulate a -1 on the repair assigned
161                  * sequence number. The first time it is reused the sequence
162                  * is -1, the second time -2, etc. This fixes that issue
163                  * without appearing to create any others.
164                  */
165                 if (likely(!tp->repair)) {
166                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167
168                         if (!seq)
169                                 seq = 1;
170                         WRITE_ONCE(tp->write_seq, seq);
171                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
172                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173                 }
174                 sock_hold(sktw);
175                 return 1;
176         }
177
178         return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183                               int addr_len)
184 {
185         /* This check is replicated from tcp_v4_connect() and intended to
186          * prevent BPF program called below from accessing bytes that are out
187          * of the bound specified by user in addr_len.
188          */
189         if (addr_len < sizeof(struct sockaddr_in))
190                 return -EINVAL;
191
192         sock_owned_by_me(sk);
193
194         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201         struct inet_sock *inet = inet_sk(sk);
202         struct tcp_sock *tp = tcp_sk(sk);
203         __be16 orig_sport, orig_dport;
204         __be32 daddr, nexthop;
205         struct flowi4 *fl4;
206         struct rtable *rt;
207         int err;
208         struct ip_options_rcu *inet_opt;
209         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210
211         if (addr_len < sizeof(struct sockaddr_in))
212                 return -EINVAL;
213
214         if (usin->sin_family != AF_INET)
215                 return -EAFNOSUPPORT;
216
217         nexthop = daddr = usin->sin_addr.s_addr;
218         inet_opt = rcu_dereference_protected(inet->inet_opt,
219                                              lockdep_sock_is_held(sk));
220         if (inet_opt && inet_opt->opt.srr) {
221                 if (!daddr)
222                         return -EINVAL;
223                 nexthop = inet_opt->opt.faddr;
224         }
225
226         orig_sport = inet->inet_sport;
227         orig_dport = usin->sin_port;
228         fl4 = &inet->cork.fl.u.ip4;
229         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231                               IPPROTO_TCP,
232                               orig_sport, orig_dport, sk);
233         if (IS_ERR(rt)) {
234                 err = PTR_ERR(rt);
235                 if (err == -ENETUNREACH)
236                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237                 return err;
238         }
239
240         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241                 ip_rt_put(rt);
242                 return -ENETUNREACH;
243         }
244
245         if (!inet_opt || !inet_opt->opt.srr)
246                 daddr = fl4->daddr;
247
248         if (!inet->inet_saddr)
249                 inet->inet_saddr = fl4->saddr;
250         sk_rcv_saddr_set(sk, inet->inet_saddr);
251
252         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253                 /* Reset inherited state */
254                 tp->rx_opt.ts_recent       = 0;
255                 tp->rx_opt.ts_recent_stamp = 0;
256                 if (likely(!tp->repair))
257                         WRITE_ONCE(tp->write_seq, 0);
258         }
259
260         inet->inet_dport = usin->sin_port;
261         sk_daddr_set(sk, daddr);
262
263         inet_csk(sk)->icsk_ext_hdr_len = 0;
264         if (inet_opt)
265                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266
267         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268
269         /* Socket identity is still unknown (sport may be zero).
270          * However we set state to SYN-SENT and not releasing socket
271          * lock select source port, enter ourselves into the hash tables and
272          * complete initialization after this.
273          */
274         tcp_set_state(sk, TCP_SYN_SENT);
275         err = inet_hash_connect(tcp_death_row, sk);
276         if (err)
277                 goto failure;
278
279         sk_set_txhash(sk);
280
281         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282                                inet->inet_sport, inet->inet_dport, sk);
283         if (IS_ERR(rt)) {
284                 err = PTR_ERR(rt);
285                 rt = NULL;
286                 goto failure;
287         }
288         /* OK, now commit destination to socket.  */
289         sk->sk_gso_type = SKB_GSO_TCPV4;
290         sk_setup_caps(sk, &rt->dst);
291         rt = NULL;
292
293         if (likely(!tp->repair)) {
294                 if (!tp->write_seq)
295                         WRITE_ONCE(tp->write_seq,
296                                    secure_tcp_seq(inet->inet_saddr,
297                                                   inet->inet_daddr,
298                                                   inet->inet_sport,
299                                                   usin->sin_port));
300                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301                                                  inet->inet_saddr,
302                                                  inet->inet_daddr);
303         }
304
305         inet->inet_id = prandom_u32();
306
307         if (tcp_fastopen_defer_connect(sk, &err))
308                 return err;
309         if (err)
310                 goto failure;
311
312         err = tcp_connect(sk);
313
314         if (err)
315                 goto failure;
316
317         return 0;
318
319 failure:
320         /*
321          * This unhashes the socket and releases the local port,
322          * if necessary.
323          */
324         tcp_set_state(sk, TCP_CLOSE);
325         ip_rt_put(rt);
326         sk->sk_route_caps = 0;
327         inet->inet_dport = 0;
328         return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
331
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334  * It can be called through tcp_release_cb() if socket was owned by user
335  * at the time tcp_v4_err() was called to handle ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339         struct inet_sock *inet = inet_sk(sk);
340         struct dst_entry *dst;
341         u32 mtu;
342
343         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344                 return;
345         mtu = tcp_sk(sk)->mtu_info;
346         dst = inet_csk_update_pmtu(sk, mtu);
347         if (!dst)
348                 return;
349
350         /* Something is about to be wrong... Remember soft error
351          * for the case, if this connection will not able to recover.
352          */
353         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354                 sk->sk_err_soft = EMSGSIZE;
355
356         mtu = dst_mtu(dst);
357
358         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359             ip_sk_accept_pmtu(sk) &&
360             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361                 tcp_sync_mss(sk, mtu);
362
363                 /* Resend the TCP packet because it's
364                  * clear that the old packet has been
365                  * dropped. This is the new "fast" path mtu
366                  * discovery.
367                  */
368                 tcp_simple_retransmit(sk);
369         } /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375         struct dst_entry *dst = __sk_dst_check(sk, 0);
376
377         if (dst)
378                 dst->ops->redirect(dst, sk, skb);
379 }
380
381
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385         struct request_sock *req = inet_reqsk(sk);
386         struct net *net = sock_net(sk);
387
388         /* ICMPs are not backlogged, hence we cannot get
389          * an established socket here.
390          */
391         if (seq != tcp_rsk(req)->snt_isn) {
392                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393         } else if (abort) {
394                 /*
395                  * Still in SYN_RECV, just remove it silently.
396                  * There is no good way to pass the error to the newly
397                  * created socket, and POSIX does not want network
398                  * errors returned from accept().
399                  */
400                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401                 tcp_listendrop(req->rsk_listener);
402         }
403         reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410         struct inet_connection_sock *icsk = inet_csk(sk);
411         struct tcp_sock *tp = tcp_sk(sk);
412         struct sk_buff *skb;
413         s32 remaining;
414         u32 delta_us;
415
416         if (sock_owned_by_user(sk))
417                 return;
418
419         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420             !icsk->icsk_backoff)
421                 return;
422
423         skb = tcp_rtx_queue_head(sk);
424         if (WARN_ON_ONCE(!skb))
425                 return;
426
427         icsk->icsk_backoff--;
428         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430
431         tcp_mstamp_refresh(tp);
432         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434
435         if (remaining > 0) {
436                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437                                           remaining, TCP_RTO_MAX);
438         } else {
439                 /* RTO revert clocked out retransmission.
440                  * Will retransmit now.
441                  */
442                 tcp_retransmit_timer(sk);
443         }
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment
452  * header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else accesses the socket the ICMP is just dropped
457  * and for some paths there is no check at all.
458  * A more general error queue to queue errors for later handling
459  * is probably better.
460  *
461  */
462
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465         const struct iphdr *iph = (const struct iphdr *)skb->data;
466         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467         struct tcp_sock *tp;
468         struct inet_sock *inet;
469         const int type = icmp_hdr(skb)->type;
470         const int code = icmp_hdr(skb)->code;
471         struct sock *sk;
472         struct request_sock *fastopen;
473         u32 seq, snd_una;
474         int err;
475         struct net *net = dev_net(skb->dev);
476
477         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478                                        th->dest, iph->saddr, ntohs(th->source),
479                                        inet_iif(skb), 0);
480         if (!sk) {
481                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482                 return -ENOENT;
483         }
484         if (sk->sk_state == TCP_TIME_WAIT) {
485                 inet_twsk_put(inet_twsk(sk));
486                 return 0;
487         }
488         seq = ntohl(th->seq);
489         if (sk->sk_state == TCP_NEW_SYN_RECV) {
490                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491                                      type == ICMP_TIME_EXCEEDED ||
492                                      (type == ICMP_DEST_UNREACH &&
493                                       (code == ICMP_NET_UNREACH ||
494                                        code == ICMP_HOST_UNREACH)));
495                 return 0;
496         }
497
498         bh_lock_sock(sk);
499         /* If too many ICMPs get dropped on busy
500          * servers this needs to be solved differently.
501          * We do take care of PMTU discovery (RFC1191) special case :
502          * we can receive locally generated ICMP messages while socket is held.
503          */
504         if (sock_owned_by_user(sk)) {
505                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507         }
508         if (sk->sk_state == TCP_CLOSE)
509                 goto out;
510
511         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513                 goto out;
514         }
515
516         tp = tcp_sk(sk);
517         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
518         fastopen = rcu_dereference(tp->fastopen_rsk);
519         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520         if (sk->sk_state != TCP_LISTEN &&
521             !between(seq, snd_una, tp->snd_nxt)) {
522                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523                 goto out;
524         }
525
526         switch (type) {
527         case ICMP_REDIRECT:
528                 if (!sock_owned_by_user(sk))
529                         do_redirect(skb, sk);
530                 goto out;
531         case ICMP_SOURCE_QUENCH:
532                 /* Just silently ignore these. */
533                 goto out;
534         case ICMP_PARAMETERPROB:
535                 err = EPROTO;
536                 break;
537         case ICMP_DEST_UNREACH:
538                 if (code > NR_ICMP_UNREACH)
539                         goto out;
540
541                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542                         /* We are not interested in TCP_LISTEN and open_requests
543                          * (SYN-ACKs send out by Linux are always <576bytes so
544                          * they should go through unfragmented).
545                          */
546                         if (sk->sk_state == TCP_LISTEN)
547                                 goto out;
548
549                         tp->mtu_info = info;
550                         if (!sock_owned_by_user(sk)) {
551                                 tcp_v4_mtu_reduced(sk);
552                         } else {
553                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554                                         sock_hold(sk);
555                         }
556                         goto out;
557                 }
558
559                 err = icmp_err_convert[code].errno;
560                 /* check if this ICMP message allows revert of backoff.
561                  * (see RFC 6069)
562                  */
563                 if (!fastopen &&
564                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565                         tcp_ld_RTO_revert(sk, seq);
566                 break;
567         case ICMP_TIME_EXCEEDED:
568                 err = EHOSTUNREACH;
569                 break;
570         default:
571                 goto out;
572         }
573
574         switch (sk->sk_state) {
575         case TCP_SYN_SENT:
576         case TCP_SYN_RECV:
577                 /* Only in fast or simultaneous open. If a fast open socket is
578                  * already accepted it is treated as a connected one below.
579                  */
580                 if (fastopen && !fastopen->sk)
581                         break;
582
583                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584
585                 if (!sock_owned_by_user(sk)) {
586                         sk->sk_err = err;
587
588                         sk->sk_error_report(sk);
589
590                         tcp_done(sk);
591                 } else {
592                         sk->sk_err_soft = err;
593                 }
594                 goto out;
595         }
596
597         /* If we've already connected we will keep trying
598          * until we time out, or the user gives up.
599          *
600          * rfc1122 4.2.3.9 allows to consider as hard errors
601          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602          * but it is obsoleted by pmtu discovery).
603          *
604          * Note, that in modern internet, where routing is unreliable
605          * and in each dark corner broken firewalls sit, sending random
606          * errors ordered by their masters even this two messages finally lose
607          * their original sense (even Linux sends invalid PORT_UNREACHs)
608          *
609          * Now we are in compliance with RFCs.
610          *                                                      --ANK (980905)
611          */
612
613         inet = inet_sk(sk);
614         if (!sock_owned_by_user(sk) && inet->recverr) {
615                 sk->sk_err = err;
616                 sk->sk_error_report(sk);
617         } else  { /* Only an error on timeout */
618                 sk->sk_err_soft = err;
619         }
620
621 out:
622         bh_unlock_sock(sk);
623         sock_put(sk);
624         return 0;
625 }
626
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 {
629         struct tcphdr *th = tcp_hdr(skb);
630
631         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632         skb->csum_start = skb_transport_header(skb) - skb->head;
633         skb->csum_offset = offsetof(struct tcphdr, check);
634 }
635
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 {
639         const struct inet_sock *inet = inet_sk(sk);
640
641         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 }
643 EXPORT_SYMBOL(tcp_v4_send_check);
644
645 /*
646  *      This routine will send an RST to the other tcp.
647  *
648  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
649  *                    for reset.
650  *      Answer: if a packet caused RST, it is not for a socket
651  *              existing in our system, if it is matched to a socket,
652  *              it is just duplicate segment or bug in other side's TCP.
653  *              So that we build reply only basing on parameters
654  *              arrived with segment.
655  *      Exception: precedence violation. We do not implement it in any case.
656  */
657
658 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
659 {
660         const struct tcphdr *th = tcp_hdr(skb);
661         struct {
662                 struct tcphdr th;
663 #ifdef CONFIG_TCP_MD5SIG
664                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
665 #endif
666         } rep;
667         struct ip_reply_arg arg;
668 #ifdef CONFIG_TCP_MD5SIG
669         struct tcp_md5sig_key *key = NULL;
670         const __u8 *hash_location = NULL;
671         unsigned char newhash[16];
672         int genhash;
673         struct sock *sk1 = NULL;
674 #endif
675         u64 transmit_time = 0;
676         struct sock *ctl_sk;
677         struct net *net;
678
679         /* Never send a reset in response to a reset. */
680         if (th->rst)
681                 return;
682
683         /* If sk not NULL, it means we did a successful lookup and incoming
684          * route had to be correct. prequeue might have dropped our dst.
685          */
686         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
687                 return;
688
689         /* Swap the send and the receive. */
690         memset(&rep, 0, sizeof(rep));
691         rep.th.dest   = th->source;
692         rep.th.source = th->dest;
693         rep.th.doff   = sizeof(struct tcphdr) / 4;
694         rep.th.rst    = 1;
695
696         if (th->ack) {
697                 rep.th.seq = th->ack_seq;
698         } else {
699                 rep.th.ack = 1;
700                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
701                                        skb->len - (th->doff << 2));
702         }
703
704         memset(&arg, 0, sizeof(arg));
705         arg.iov[0].iov_base = (unsigned char *)&rep;
706         arg.iov[0].iov_len  = sizeof(rep.th);
707
708         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
709 #ifdef CONFIG_TCP_MD5SIG
710         rcu_read_lock();
711         hash_location = tcp_parse_md5sig_option(th);
712         if (sk && sk_fullsock(sk)) {
713                 const union tcp_md5_addr *addr;
714                 int l3index;
715
716                 /* sdif set, means packet ingressed via a device
717                  * in an L3 domain and inet_iif is set to it.
718                  */
719                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
720                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
721                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
722         } else if (hash_location) {
723                 const union tcp_md5_addr *addr;
724                 int sdif = tcp_v4_sdif(skb);
725                 int dif = inet_iif(skb);
726                 int l3index;
727
728                 /*
729                  * active side is lost. Try to find listening socket through
730                  * source port, and then find md5 key through listening socket.
731                  * we are not loose security here:
732                  * Incoming packet is checked with md5 hash with finding key,
733                  * no RST generated if md5 hash doesn't match.
734                  */
735                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
736                                              ip_hdr(skb)->saddr,
737                                              th->source, ip_hdr(skb)->daddr,
738                                              ntohs(th->source), dif, sdif);
739                 /* don't send rst if it can't find key */
740                 if (!sk1)
741                         goto out;
742
743                 /* sdif set, means packet ingressed via a device
744                  * in an L3 domain and dif is set to it.
745                  */
746                 l3index = sdif ? dif : 0;
747                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
748                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
749                 if (!key)
750                         goto out;
751
752
753                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
754                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
755                         goto out;
756
757         }
758
759         if (key) {
760                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
761                                    (TCPOPT_NOP << 16) |
762                                    (TCPOPT_MD5SIG << 8) |
763                                    TCPOLEN_MD5SIG);
764                 /* Update length and the length the header thinks exists */
765                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766                 rep.th.doff = arg.iov[0].iov_len / 4;
767
768                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
769                                      key, ip_hdr(skb)->saddr,
770                                      ip_hdr(skb)->daddr, &rep.th);
771         }
772 #endif
773         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
774                                       ip_hdr(skb)->saddr, /* XXX */
775                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
776         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
777         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
778
779         /* When socket is gone, all binding information is lost.
780          * routing might fail in this case. No choice here, if we choose to force
781          * input interface, we will misroute in case of asymmetric route.
782          */
783         if (sk) {
784                 arg.bound_dev_if = sk->sk_bound_dev_if;
785                 if (sk_fullsock(sk))
786                         trace_tcp_send_reset(sk, skb);
787         }
788
789         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
790                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
791
792         arg.tos = ip_hdr(skb)->tos;
793         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
794         local_bh_disable();
795         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
796         if (sk) {
797                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
798                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
799                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
800                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
801                 transmit_time = tcp_transmit_time(sk);
802         }
803         ip_send_unicast_reply(ctl_sk,
804                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
805                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
806                               &arg, arg.iov[0].iov_len,
807                               transmit_time);
808
809         ctl_sk->sk_mark = 0;
810         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
811         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
812         local_bh_enable();
813
814 #ifdef CONFIG_TCP_MD5SIG
815 out:
816         rcu_read_unlock();
817 #endif
818 }
819
820 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
821    outside socket context is ugly, certainly. What can I do?
822  */
823
824 static void tcp_v4_send_ack(const struct sock *sk,
825                             struct sk_buff *skb, u32 seq, u32 ack,
826                             u32 win, u32 tsval, u32 tsecr, int oif,
827                             struct tcp_md5sig_key *key,
828                             int reply_flags, u8 tos)
829 {
830         const struct tcphdr *th = tcp_hdr(skb);
831         struct {
832                 struct tcphdr th;
833                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
834 #ifdef CONFIG_TCP_MD5SIG
835                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
836 #endif
837                         ];
838         } rep;
839         struct net *net = sock_net(sk);
840         struct ip_reply_arg arg;
841         struct sock *ctl_sk;
842         u64 transmit_time;
843
844         memset(&rep.th, 0, sizeof(struct tcphdr));
845         memset(&arg, 0, sizeof(arg));
846
847         arg.iov[0].iov_base = (unsigned char *)&rep;
848         arg.iov[0].iov_len  = sizeof(rep.th);
849         if (tsecr) {
850                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
851                                    (TCPOPT_TIMESTAMP << 8) |
852                                    TCPOLEN_TIMESTAMP);
853                 rep.opt[1] = htonl(tsval);
854                 rep.opt[2] = htonl(tsecr);
855                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
856         }
857
858         /* Swap the send and the receive. */
859         rep.th.dest    = th->source;
860         rep.th.source  = th->dest;
861         rep.th.doff    = arg.iov[0].iov_len / 4;
862         rep.th.seq     = htonl(seq);
863         rep.th.ack_seq = htonl(ack);
864         rep.th.ack     = 1;
865         rep.th.window  = htons(win);
866
867 #ifdef CONFIG_TCP_MD5SIG
868         if (key) {
869                 int offset = (tsecr) ? 3 : 0;
870
871                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
872                                           (TCPOPT_NOP << 16) |
873                                           (TCPOPT_MD5SIG << 8) |
874                                           TCPOLEN_MD5SIG);
875                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
876                 rep.th.doff = arg.iov[0].iov_len/4;
877
878                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
879                                     key, ip_hdr(skb)->saddr,
880                                     ip_hdr(skb)->daddr, &rep.th);
881         }
882 #endif
883         arg.flags = reply_flags;
884         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
885                                       ip_hdr(skb)->saddr, /* XXX */
886                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
887         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
888         if (oif)
889                 arg.bound_dev_if = oif;
890         arg.tos = tos;
891         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
892         local_bh_disable();
893         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
894         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
895                            inet_twsk(sk)->tw_mark : sk->sk_mark;
896         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
897                            inet_twsk(sk)->tw_priority : sk->sk_priority;
898         transmit_time = tcp_transmit_time(sk);
899         ip_send_unicast_reply(ctl_sk,
900                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
901                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
902                               &arg, arg.iov[0].iov_len,
903                               transmit_time);
904
905         ctl_sk->sk_mark = 0;
906         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
907         local_bh_enable();
908 }
909
910 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
911 {
912         struct inet_timewait_sock *tw = inet_twsk(sk);
913         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
914
915         tcp_v4_send_ack(sk, skb,
916                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
917                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
918                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
919                         tcptw->tw_ts_recent,
920                         tw->tw_bound_dev_if,
921                         tcp_twsk_md5_key(tcptw),
922                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
923                         tw->tw_tos
924                         );
925
926         inet_twsk_put(tw);
927 }
928
929 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
930                                   struct request_sock *req)
931 {
932         const union tcp_md5_addr *addr;
933         int l3index;
934
935         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
936          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
937          */
938         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
939                                              tcp_sk(sk)->snd_nxt;
940
941         /* RFC 7323 2.3
942          * The window field (SEG.WND) of every outgoing segment, with the
943          * exception of <SYN> segments, MUST be right-shifted by
944          * Rcv.Wind.Shift bits:
945          */
946         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
947         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
948         tcp_v4_send_ack(sk, skb, seq,
949                         tcp_rsk(req)->rcv_nxt,
950                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
951                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
952                         req->ts_recent,
953                         0,
954                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
955                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
956                         ip_hdr(skb)->tos);
957 }
958
959 /*
960  *      Send a SYN-ACK after having received a SYN.
961  *      This still operates on a request_sock only, not on a big
962  *      socket.
963  */
964 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
965                               struct flowi *fl,
966                               struct request_sock *req,
967                               struct tcp_fastopen_cookie *foc,
968                               enum tcp_synack_type synack_type,
969                               struct sk_buff *syn_skb)
970 {
971         const struct inet_request_sock *ireq = inet_rsk(req);
972         struct flowi4 fl4;
973         int err = -1;
974         struct sk_buff *skb;
975         u8 tos;
976
977         /* First, grab a route. */
978         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
979                 return -1;
980
981         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
982
983         if (skb) {
984                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
985
986                 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
987                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
988                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
989                                 inet_sk(sk)->tos;
990
991                 if (!INET_ECN_is_capable(tos) &&
992                     tcp_bpf_ca_needs_ecn((struct sock *)req))
993                         tos |= INET_ECN_ECT_0;
994
995                 rcu_read_lock();
996                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
997                                             ireq->ir_rmt_addr,
998                                             rcu_dereference(ireq->ireq_opt),
999                                             tos);
1000                 rcu_read_unlock();
1001                 err = net_xmit_eval(err);
1002         }
1003
1004         return err;
1005 }
1006
1007 /*
1008  *      IPv4 request_sock destructor.
1009  */
1010 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1011 {
1012         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1013 }
1014
1015 #ifdef CONFIG_TCP_MD5SIG
1016 /*
1017  * RFC2385 MD5 checksumming requires a mapping of
1018  * IP address->MD5 Key.
1019  * We need to maintain these in the sk structure.
1020  */
1021
1022 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1023 EXPORT_SYMBOL(tcp_md5_needed);
1024
1025 /* Find the Key structure for an address.  */
1026 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1027                                            const union tcp_md5_addr *addr,
1028                                            int family)
1029 {
1030         const struct tcp_sock *tp = tcp_sk(sk);
1031         struct tcp_md5sig_key *key;
1032         const struct tcp_md5sig_info *md5sig;
1033         __be32 mask;
1034         struct tcp_md5sig_key *best_match = NULL;
1035         bool match;
1036
1037         /* caller either holds rcu_read_lock() or socket lock */
1038         md5sig = rcu_dereference_check(tp->md5sig_info,
1039                                        lockdep_sock_is_held(sk));
1040         if (!md5sig)
1041                 return NULL;
1042
1043         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1044                                  lockdep_sock_is_held(sk)) {
1045                 if (key->family != family)
1046                         continue;
1047                 if (key->l3index && key->l3index != l3index)
1048                         continue;
1049                 if (family == AF_INET) {
1050                         mask = inet_make_mask(key->prefixlen);
1051                         match = (key->addr.a4.s_addr & mask) ==
1052                                 (addr->a4.s_addr & mask);
1053 #if IS_ENABLED(CONFIG_IPV6)
1054                 } else if (family == AF_INET6) {
1055                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1056                                                   key->prefixlen);
1057 #endif
1058                 } else {
1059                         match = false;
1060                 }
1061
1062                 if (match && (!best_match ||
1063                               key->prefixlen > best_match->prefixlen))
1064                         best_match = key;
1065         }
1066         return best_match;
1067 }
1068 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1069
1070 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1071                                                       const union tcp_md5_addr *addr,
1072                                                       int family, u8 prefixlen,
1073                                                       int l3index)
1074 {
1075         const struct tcp_sock *tp = tcp_sk(sk);
1076         struct tcp_md5sig_key *key;
1077         unsigned int size = sizeof(struct in_addr);
1078         const struct tcp_md5sig_info *md5sig;
1079
1080         /* caller either holds rcu_read_lock() or socket lock */
1081         md5sig = rcu_dereference_check(tp->md5sig_info,
1082                                        lockdep_sock_is_held(sk));
1083         if (!md5sig)
1084                 return NULL;
1085 #if IS_ENABLED(CONFIG_IPV6)
1086         if (family == AF_INET6)
1087                 size = sizeof(struct in6_addr);
1088 #endif
1089         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1090                                  lockdep_sock_is_held(sk)) {
1091                 if (key->family != family)
1092                         continue;
1093                 if (key->l3index && key->l3index != l3index)
1094                         continue;
1095                 if (!memcmp(&key->addr, addr, size) &&
1096                     key->prefixlen == prefixlen)
1097                         return key;
1098         }
1099         return NULL;
1100 }
1101
1102 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1103                                          const struct sock *addr_sk)
1104 {
1105         const union tcp_md5_addr *addr;
1106         int l3index;
1107
1108         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1109                                                  addr_sk->sk_bound_dev_if);
1110         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1111         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1112 }
1113 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1114
1115 /* This can be called on a newly created socket, from other files */
1116 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1117                    int family, u8 prefixlen, int l3index,
1118                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1119 {
1120         /* Add Key to the list */
1121         struct tcp_md5sig_key *key;
1122         struct tcp_sock *tp = tcp_sk(sk);
1123         struct tcp_md5sig_info *md5sig;
1124
1125         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1126         if (key) {
1127                 /* Pre-existing entry - just update that one.
1128                  * Note that the key might be used concurrently.
1129                  * data_race() is telling kcsan that we do not care of
1130                  * key mismatches, since changing MD5 key on live flows
1131                  * can lead to packet drops.
1132                  */
1133                 data_race(memcpy(key->key, newkey, newkeylen));
1134
1135                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1136                  * Also note that a reader could catch new key->keylen value
1137                  * but old key->key[], this is the reason we use __GFP_ZERO
1138                  * at sock_kmalloc() time below these lines.
1139                  */
1140                 WRITE_ONCE(key->keylen, newkeylen);
1141
1142                 return 0;
1143         }
1144
1145         md5sig = rcu_dereference_protected(tp->md5sig_info,
1146                                            lockdep_sock_is_held(sk));
1147         if (!md5sig) {
1148                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1149                 if (!md5sig)
1150                         return -ENOMEM;
1151
1152                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1153                 INIT_HLIST_HEAD(&md5sig->head);
1154                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1155         }
1156
1157         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1158         if (!key)
1159                 return -ENOMEM;
1160         if (!tcp_alloc_md5sig_pool()) {
1161                 sock_kfree_s(sk, key, sizeof(*key));
1162                 return -ENOMEM;
1163         }
1164
1165         memcpy(key->key, newkey, newkeylen);
1166         key->keylen = newkeylen;
1167         key->family = family;
1168         key->prefixlen = prefixlen;
1169         key->l3index = l3index;
1170         memcpy(&key->addr, addr,
1171                (family == AF_INET6) ? sizeof(struct in6_addr) :
1172                                       sizeof(struct in_addr));
1173         hlist_add_head_rcu(&key->node, &md5sig->head);
1174         return 0;
1175 }
1176 EXPORT_SYMBOL(tcp_md5_do_add);
1177
1178 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1179                    u8 prefixlen, int l3index)
1180 {
1181         struct tcp_md5sig_key *key;
1182
1183         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1184         if (!key)
1185                 return -ENOENT;
1186         hlist_del_rcu(&key->node);
1187         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1188         kfree_rcu(key, rcu);
1189         return 0;
1190 }
1191 EXPORT_SYMBOL(tcp_md5_do_del);
1192
1193 static void tcp_clear_md5_list(struct sock *sk)
1194 {
1195         struct tcp_sock *tp = tcp_sk(sk);
1196         struct tcp_md5sig_key *key;
1197         struct hlist_node *n;
1198         struct tcp_md5sig_info *md5sig;
1199
1200         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1201
1202         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1203                 hlist_del_rcu(&key->node);
1204                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1205                 kfree_rcu(key, rcu);
1206         }
1207 }
1208
1209 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1210                                  sockptr_t optval, int optlen)
1211 {
1212         struct tcp_md5sig cmd;
1213         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1214         const union tcp_md5_addr *addr;
1215         u8 prefixlen = 32;
1216         int l3index = 0;
1217
1218         if (optlen < sizeof(cmd))
1219                 return -EINVAL;
1220
1221         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1222                 return -EFAULT;
1223
1224         if (sin->sin_family != AF_INET)
1225                 return -EINVAL;
1226
1227         if (optname == TCP_MD5SIG_EXT &&
1228             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1229                 prefixlen = cmd.tcpm_prefixlen;
1230                 if (prefixlen > 32)
1231                         return -EINVAL;
1232         }
1233
1234         if (optname == TCP_MD5SIG_EXT &&
1235             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1236                 struct net_device *dev;
1237
1238                 rcu_read_lock();
1239                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1240                 if (dev && netif_is_l3_master(dev))
1241                         l3index = dev->ifindex;
1242
1243                 rcu_read_unlock();
1244
1245                 /* ok to reference set/not set outside of rcu;
1246                  * right now device MUST be an L3 master
1247                  */
1248                 if (!dev || !l3index)
1249                         return -EINVAL;
1250         }
1251
1252         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1253
1254         if (!cmd.tcpm_keylen)
1255                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1256
1257         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1258                 return -EINVAL;
1259
1260         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1261                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1262 }
1263
1264 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1265                                    __be32 daddr, __be32 saddr,
1266                                    const struct tcphdr *th, int nbytes)
1267 {
1268         struct tcp4_pseudohdr *bp;
1269         struct scatterlist sg;
1270         struct tcphdr *_th;
1271
1272         bp = hp->scratch;
1273         bp->saddr = saddr;
1274         bp->daddr = daddr;
1275         bp->pad = 0;
1276         bp->protocol = IPPROTO_TCP;
1277         bp->len = cpu_to_be16(nbytes);
1278
1279         _th = (struct tcphdr *)(bp + 1);
1280         memcpy(_th, th, sizeof(*th));
1281         _th->check = 0;
1282
1283         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1284         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1285                                 sizeof(*bp) + sizeof(*th));
1286         return crypto_ahash_update(hp->md5_req);
1287 }
1288
1289 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1290                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1291 {
1292         struct tcp_md5sig_pool *hp;
1293         struct ahash_request *req;
1294
1295         hp = tcp_get_md5sig_pool();
1296         if (!hp)
1297                 goto clear_hash_noput;
1298         req = hp->md5_req;
1299
1300         if (crypto_ahash_init(req))
1301                 goto clear_hash;
1302         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1303                 goto clear_hash;
1304         if (tcp_md5_hash_key(hp, key))
1305                 goto clear_hash;
1306         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1307         if (crypto_ahash_final(req))
1308                 goto clear_hash;
1309
1310         tcp_put_md5sig_pool();
1311         return 0;
1312
1313 clear_hash:
1314         tcp_put_md5sig_pool();
1315 clear_hash_noput:
1316         memset(md5_hash, 0, 16);
1317         return 1;
1318 }
1319
1320 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1321                         const struct sock *sk,
1322                         const struct sk_buff *skb)
1323 {
1324         struct tcp_md5sig_pool *hp;
1325         struct ahash_request *req;
1326         const struct tcphdr *th = tcp_hdr(skb);
1327         __be32 saddr, daddr;
1328
1329         if (sk) { /* valid for establish/request sockets */
1330                 saddr = sk->sk_rcv_saddr;
1331                 daddr = sk->sk_daddr;
1332         } else {
1333                 const struct iphdr *iph = ip_hdr(skb);
1334                 saddr = iph->saddr;
1335                 daddr = iph->daddr;
1336         }
1337
1338         hp = tcp_get_md5sig_pool();
1339         if (!hp)
1340                 goto clear_hash_noput;
1341         req = hp->md5_req;
1342
1343         if (crypto_ahash_init(req))
1344                 goto clear_hash;
1345
1346         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1347                 goto clear_hash;
1348         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1349                 goto clear_hash;
1350         if (tcp_md5_hash_key(hp, key))
1351                 goto clear_hash;
1352         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1353         if (crypto_ahash_final(req))
1354                 goto clear_hash;
1355
1356         tcp_put_md5sig_pool();
1357         return 0;
1358
1359 clear_hash:
1360         tcp_put_md5sig_pool();
1361 clear_hash_noput:
1362         memset(md5_hash, 0, 16);
1363         return 1;
1364 }
1365 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1366
1367 #endif
1368
1369 /* Called with rcu_read_lock() */
1370 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1371                                     const struct sk_buff *skb,
1372                                     int dif, int sdif)
1373 {
1374 #ifdef CONFIG_TCP_MD5SIG
1375         /*
1376          * This gets called for each TCP segment that arrives
1377          * so we want to be efficient.
1378          * We have 3 drop cases:
1379          * o No MD5 hash and one expected.
1380          * o MD5 hash and we're not expecting one.
1381          * o MD5 hash and its wrong.
1382          */
1383         const __u8 *hash_location = NULL;
1384         struct tcp_md5sig_key *hash_expected;
1385         const struct iphdr *iph = ip_hdr(skb);
1386         const struct tcphdr *th = tcp_hdr(skb);
1387         const union tcp_md5_addr *addr;
1388         unsigned char newhash[16];
1389         int genhash, l3index;
1390
1391         /* If sdif is set, the packet ingressed via a device in an
1392          * L3 domain and dif is set to the l3mdev ifindex.
1393          */
1394         l3index = sdif ? dif : 0;
1395
1396         addr = (union tcp_md5_addr *)&iph->saddr;
1397         hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1398         hash_location = tcp_parse_md5sig_option(th);
1399
1400         /* We've parsed the options - do we have a hash? */
1401         if (!hash_expected && !hash_location)
1402                 return false;
1403
1404         if (hash_expected && !hash_location) {
1405                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1406                 return true;
1407         }
1408
1409         if (!hash_expected && hash_location) {
1410                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1411                 return true;
1412         }
1413
1414         /* Okay, we have both hash_expected and hash_location,
1415          * so we need to calculate the MD5 hash and compare.
1416          */
1417         genhash = tcp_v4_md5_hash_skb(newhash,
1418                                       hash_expected,
1419                                       NULL, skb);
1420
1421         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1422                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1423                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1424                                      &iph->saddr, ntohs(th->source),
1425                                      &iph->daddr, ntohs(th->dest),
1426                                      genhash ? " tcp_v4_calc_md5_hash failed"
1427                                      : "", l3index);
1428                 return true;
1429         }
1430         return false;
1431 #endif
1432         return false;
1433 }
1434
1435 static void tcp_v4_init_req(struct request_sock *req,
1436                             const struct sock *sk_listener,
1437                             struct sk_buff *skb)
1438 {
1439         struct inet_request_sock *ireq = inet_rsk(req);
1440         struct net *net = sock_net(sk_listener);
1441
1442         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1443         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1444         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1445 }
1446
1447 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1448                                           struct sk_buff *skb,
1449                                           struct flowi *fl,
1450                                           struct request_sock *req)
1451 {
1452         tcp_v4_init_req(req, sk, skb);
1453
1454         if (security_inet_conn_request(sk, skb, req))
1455                 return NULL;
1456
1457         return inet_csk_route_req(sk, &fl->u.ip4, req);
1458 }
1459
1460 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1461         .family         =       PF_INET,
1462         .obj_size       =       sizeof(struct tcp_request_sock),
1463         .rtx_syn_ack    =       tcp_rtx_synack,
1464         .send_ack       =       tcp_v4_reqsk_send_ack,
1465         .destructor     =       tcp_v4_reqsk_destructor,
1466         .send_reset     =       tcp_v4_send_reset,
1467         .syn_ack_timeout =      tcp_syn_ack_timeout,
1468 };
1469
1470 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1471         .mss_clamp      =       TCP_MSS_DEFAULT,
1472 #ifdef CONFIG_TCP_MD5SIG
1473         .req_md5_lookup =       tcp_v4_md5_lookup,
1474         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1475 #endif
1476 #ifdef CONFIG_SYN_COOKIES
1477         .cookie_init_seq =      cookie_v4_init_sequence,
1478 #endif
1479         .route_req      =       tcp_v4_route_req,
1480         .init_seq       =       tcp_v4_init_seq,
1481         .init_ts_off    =       tcp_v4_init_ts_off,
1482         .send_synack    =       tcp_v4_send_synack,
1483 };
1484
1485 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1486 {
1487         /* Never answer SYNs sent to broadcast or multicast addresses */
1488         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1489                 goto drop;
1490
1491         return tcp_conn_request(&tcp_request_sock_ops,
1492                                 &tcp_request_sock_ipv4_ops, sk, skb);
1493
1494 drop:
1495         tcp_listendrop(sk);
1496         return 0;
1497 }
1498 EXPORT_SYMBOL(tcp_v4_conn_request);
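/*
 * [Editor's illustrative sketch -- not part of this file's build.]
 * tcp_conn_request() consults the per-namespace tcp_syncookies setting
 * (initialised to 1 in tcp_sk_init() below).  A small userspace sketch
 * for reading such an ipv4 sysctl follows; the procfs path layout is
 * the standard one and the helper name is made up for illustration.
 */
#if 0	/* usage sketch only -- never compiled here */
#include <stdio.h>

/* Read an integer sysctl such as net.ipv4.tcp_syncookies (0, 1 or 2). */
static int read_ipv4_sysctl(const char *name, long *val)
{
	char path[128];
	FILE *f;
	int ok;

	snprintf(path, sizeof(path), "/proc/sys/net/ipv4/%s", name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	ok = (fscanf(f, "%ld", val) == 1) ? 0 : -1;
	fclose(f);
	return ok;
}

int main(void)
{
	long v;

	if (read_ipv4_sysctl("tcp_syncookies", &v) == 0)
		printf("tcp_syncookies = %ld\n", v);
	return 0;
}
#endif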
1499
1500
1501 /*
1502  * The three way handshake has completed - we got a valid synack -
1503  * now create the new socket.
1504  */
1505 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1506                                   struct request_sock *req,
1507                                   struct dst_entry *dst,
1508                                   struct request_sock *req_unhash,
1509                                   bool *own_req)
1510 {
1511         struct inet_request_sock *ireq;
1512         bool found_dup_sk = false;
1513         struct inet_sock *newinet;
1514         struct tcp_sock *newtp;
1515         struct sock *newsk;
1516 #ifdef CONFIG_TCP_MD5SIG
1517         const union tcp_md5_addr *addr;
1518         struct tcp_md5sig_key *key;
1519         int l3index;
1520 #endif
1521         struct ip_options_rcu *inet_opt;
1522
1523         if (sk_acceptq_is_full(sk))
1524                 goto exit_overflow;
1525
1526         newsk = tcp_create_openreq_child(sk, req, skb);
1527         if (!newsk)
1528                 goto exit_nonewsk;
1529
1530         newsk->sk_gso_type = SKB_GSO_TCPV4;
1531         inet_sk_rx_dst_set(newsk, skb);
1532
1533         newtp                 = tcp_sk(newsk);
1534         newinet               = inet_sk(newsk);
1535         ireq                  = inet_rsk(req);
1536         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1537         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1538         newsk->sk_bound_dev_if = ireq->ir_iif;
1539         newinet->inet_saddr   = ireq->ir_loc_addr;
1540         inet_opt              = rcu_dereference(ireq->ireq_opt);
1541         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1542         newinet->mc_index     = inet_iif(skb);
1543         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1544         newinet->rcv_tos      = ip_hdr(skb)->tos;
1545         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1546         if (inet_opt)
1547                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1548         newinet->inet_id = prandom_u32();
1549
1550         /* Set ToS of the new socket based upon the value of incoming SYN.
1551          * ECT bits are set later in tcp_init_transfer().
1552          */
1553         if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1554                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1555
1556         if (!dst) {
1557                 dst = inet_csk_route_child_sock(sk, newsk, req);
1558                 if (!dst)
1559                         goto put_and_exit;
1560         } else {
1561                 /* syncookie case : see end of cookie_v4_check() */
1562         }
1563         sk_setup_caps(newsk, dst);
1564
1565         tcp_ca_openreq_child(newsk, dst);
1566
1567         tcp_sync_mss(newsk, dst_mtu(dst));
1568         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1569
1570         tcp_initialize_rcv_mss(newsk);
1571
1572 #ifdef CONFIG_TCP_MD5SIG
1573         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1574         /* Copy over the MD5 key from the original socket */
1575         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1576         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1577         if (key) {
1578                 /*
1579                  * We're using one, so create a matching key
1580                  * on the newsk structure. If we fail to get
1581                  * memory, then we end up not copying the key
1582                  * across. Shucks.
1583                  */
1584                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1585                                key->key, key->keylen, GFP_ATOMIC);
1586                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1587         }
1588 #endif
1589
1590         if (__inet_inherit_port(sk, newsk) < 0)
1591                 goto put_and_exit;
1592         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1593                                        &found_dup_sk);
1594         if (likely(*own_req)) {
1595                 tcp_move_syn(newtp, req);
1596                 ireq->ireq_opt = NULL;
1597         } else {
1598                 newinet->inet_opt = NULL;
1599
1600                 if (!req_unhash && found_dup_sk) {
1601                         /* This code path should only be executed in the
1602                          * syncookie case
1603                          */
1604                         bh_unlock_sock(newsk);
1605                         sock_put(newsk);
1606                         newsk = NULL;
1607                 }
1608         }
1609         return newsk;
1610
1611 exit_overflow:
1612         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1613 exit_nonewsk:
1614         dst_release(dst);
1615 exit:
1616         tcp_listendrop(sk);
1617         return NULL;
1618 put_and_exit:
1619         newinet->inet_opt = NULL;
1620         inet_csk_prepare_forced_close(newsk);
1621         tcp_done(newsk);
1622         goto exit;
1623 }
1624 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
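/*
 * [Editor's illustrative sketch -- not part of this file's build.]
 * Every accept() on a listener returns a child socket produced by
 * tcp_v4_syn_recv_sock(); when the accept queue is full the request is
 * dropped via exit_overflow above and counted in
 * LINUX_MIB_LISTENOVERFLOWS.  The minimal passive-open sketch below
 * uses a placeholder port and an arbitrary backlog of 128.
 */
#if 0	/* usage sketch only -- never compiled here */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in addr;
	int fd, cfd;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0)
		return 1;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8080);	/* placeholder port */

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 128) < 0) {	/* backlog bounds the accept queue */
		perror("listen");
		return 1;
	}

	/* Each accept() hands back a child created on the SYN-ACK path. */
	cfd = accept(fd, NULL, NULL);
	if (cfd >= 0)
		close(cfd);
	close(fd);
	return 0;
}
#endif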
1625
1626 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1627 {
1628 #ifdef CONFIG_SYN_COOKIES
1629         const struct tcphdr *th = tcp_hdr(skb);
1630
1631         if (!th->syn)
1632                 sk = cookie_v4_check(sk, skb);
1633 #endif
1634         return sk;
1635 }
1636
1637 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1638                          struct tcphdr *th, u32 *cookie)
1639 {
1640         u16 mss = 0;
1641 #ifdef CONFIG_SYN_COOKIES
1642         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1643                                     &tcp_request_sock_ipv4_ops, sk, th);
1644         if (mss) {
1645                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1646                 tcp_synq_overflow(sk);
1647         }
1648 #endif
1649         return mss;
1650 }
1651
1652 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1653                                                            u32));
1654 /* The socket must have its spinlock held when we get
1655  * here, unless it is a TCP_LISTEN socket.
1656  *
1657  * We have a potential double-lock case here, so even when
1658  * doing backlog processing we use the BH locking scheme.
1659  * This is because we cannot sleep with the original spinlock
1660  * held.
1661  */
1662 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1663 {
1664         struct sock *rsk;
1665
1666         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1667                 struct dst_entry *dst = sk->sk_rx_dst;
1668
1669                 sock_rps_save_rxhash(sk, skb);
1670                 sk_mark_napi_id(sk, skb);
1671                 if (dst) {
1672                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1673                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1674                                              dst, 0)) {
1675                                 dst_release(dst);
1676                                 sk->sk_rx_dst = NULL;
1677                         }
1678                 }
1679                 tcp_rcv_established(sk, skb);
1680                 return 0;
1681         }
1682
1683         if (tcp_checksum_complete(skb))
1684                 goto csum_err;
1685
1686         if (sk->sk_state == TCP_LISTEN) {
1687                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1688
1689                 if (!nsk)
1690                         goto discard;
1691                 if (nsk != sk) {
1692                         if (tcp_child_process(sk, nsk, skb)) {
1693                                 rsk = nsk;
1694                                 goto reset;
1695                         }
1696                         return 0;
1697                 }
1698         } else
1699                 sock_rps_save_rxhash(sk, skb);
1700
1701         if (tcp_rcv_state_process(sk, skb)) {
1702                 rsk = sk;
1703                 goto reset;
1704         }
1705         return 0;
1706
1707 reset:
1708         tcp_v4_send_reset(rsk, skb);
1709 discard:
1710         kfree_skb(skb);
1711         /* Be careful here. If this function gets more complicated and
1712          * gcc suffers from register pressure on the x86, sk (in %ebx)
1713          * might be destroyed here. This current version compiles correctly,
1714          * but you have been warned.
1715          */
1716         return 0;
1717
1718 csum_err:
1719         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1720         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1721         goto discard;
1722 }
1723 EXPORT_SYMBOL(tcp_v4_do_rcv);
1724
1725 int tcp_v4_early_demux(struct sk_buff *skb)
1726 {
1727         const struct iphdr *iph;
1728         const struct tcphdr *th;
1729         struct sock *sk;
1730
1731         if (skb->pkt_type != PACKET_HOST)
1732                 return 0;
1733
1734         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1735                 return 0;
1736
1737         iph = ip_hdr(skb);
1738         th = tcp_hdr(skb);
1739
1740         if (th->doff < sizeof(struct tcphdr) / 4)
1741                 return 0;
1742
1743         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1744                                        iph->saddr, th->source,
1745                                        iph->daddr, ntohs(th->dest),
1746                                        skb->skb_iif, inet_sdif(skb));
1747         if (sk) {
1748                 skb->sk = sk;
1749                 skb->destructor = sock_edemux;
1750                 if (sk_fullsock(sk)) {
1751                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1752
1753                         if (dst)
1754                                 dst = dst_check(dst, 0);
1755                         if (dst &&
1756                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1757                                 skb_dst_set_noref(skb, dst);
1758                 }
1759         }
1760         return 0;
1761 }
1762
1763 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1764 {
1765         u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1766         u32 tail_gso_size, tail_gso_segs;
1767         struct skb_shared_info *shinfo;
1768         const struct tcphdr *th;
1769         struct tcphdr *thtail;
1770         struct sk_buff *tail;
1771         unsigned int hdrlen;
1772         bool fragstolen;
1773         u32 gso_segs;
1774         u32 gso_size;
1775         int delta;
1776
1777         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1778          * we can fix skb->truesize to its real value to avoid future drops.
1779          * This is valid because skb is not yet charged to the socket.
1780          * It has been noticed that pure SACK packets were sometimes dropped
1781          * (if cooked by drivers without the copybreak feature).
1782          */
1783         skb_condense(skb);
1784
1785         skb_dst_drop(skb);
1786
1787         if (unlikely(tcp_checksum_complete(skb))) {
1788                 bh_unlock_sock(sk);
1789                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1790                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1791                 return true;
1792         }
1793
1794         /* Attempt coalescing to last skb in backlog, even if we are
1795          * above the limits.
1796          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1797          */
1798         th = (const struct tcphdr *)skb->data;
1799         hdrlen = th->doff * 4;
1800
1801         tail = sk->sk_backlog.tail;
1802         if (!tail)
1803                 goto no_coalesce;
1804         thtail = (struct tcphdr *)tail->data;
1805
1806         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1807             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1808             ((TCP_SKB_CB(tail)->tcp_flags |
1809               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1810             !((TCP_SKB_CB(tail)->tcp_flags &
1811               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1812             ((TCP_SKB_CB(tail)->tcp_flags ^
1813               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1814 #ifdef CONFIG_TLS_DEVICE
1815             tail->decrypted != skb->decrypted ||
1816 #endif
1817             thtail->doff != th->doff ||
1818             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1819                 goto no_coalesce;
1820
1821         __skb_pull(skb, hdrlen);
1822
1823         shinfo = skb_shinfo(skb);
1824         gso_size = shinfo->gso_size ?: skb->len;
1825         gso_segs = shinfo->gso_segs ?: 1;
1826
1827         shinfo = skb_shinfo(tail);
1828         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1829         tail_gso_segs = shinfo->gso_segs ?: 1;
1830
1831         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1832                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1833
1834                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1835                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1836                         thtail->window = th->window;
1837                 }
1838
1839                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1840                  * thtail->fin, so that the fast path in tcp_rcv_established()
1841                  * is not entered if we append a packet with a FIN.
1842                  * SYN, RST, URG are not present.
1843                  * ACK is set on both packets.
1844                  * PSH : we do not really care in TCP stack,
1845                  *       at least for 'GRO' packets.
1846                  */
1847                 thtail->fin |= th->fin;
1848                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1849
1850                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1851                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1852                         tail->tstamp = skb->tstamp;
1853                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1854                 }
1855
1856                 /* Not as strict as GRO. We only need to carry the max mss value */
1857                 shinfo->gso_size = max(gso_size, tail_gso_size);
1858                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1859
1860                 sk->sk_backlog.len += delta;
1861                 __NET_INC_STATS(sock_net(sk),
1862                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1863                 kfree_skb_partial(skb, fragstolen);
1864                 return false;
1865         }
1866         __skb_push(skb, hdrlen);
1867
1868 no_coalesce:
1869         /* Only the socket owner can try to collapse/prune rx queues
1870          * to reduce memory overhead, so add a little headroom here.
1871          * Only a few socket backlogs are likely to be non-empty at once.
1872          */
1873         limit += 64*1024;
1874
1875         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1876                 bh_unlock_sock(sk);
1877                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1878                 return true;
1879         }
1880         return false;
1881 }
1882 EXPORT_SYMBOL(tcp_add_backlog);
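/*
 * [Editor's illustrative sketch -- not part of this file's build.]
 * The drop threshold used above is roughly sk_rcvbuf + sk_sndbuf plus
 * the 64KB headroom added at no_coalesce.  The sketch below estimates
 * that limit from userspace; it assumes getsockopt() reports the same
 * (already doubled) values the kernel keeps in sk_rcvbuf/sk_sndbuf.
 */
#if 0	/* usage sketch only -- never compiled here */
#include <stdio.h>
#include <sys/socket.h>

/* Approximate: limit = sk_rcvbuf + sk_sndbuf + 64KB. */
static long backlog_limit(int fd)
{
	int rcv = 0, snd = 0;
	socklen_t len = sizeof(int);

	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcv, &len) < 0)
		return -1;
	len = sizeof(int);
	if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &snd, &len) < 0)
		return -1;
	return (long)rcv + snd + 64 * 1024;
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd >= 0)
		printf("approx. backlog limit: %ld bytes\n", backlog_limit(fd));
	return 0;
}
#endif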
1883
1884 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1885 {
1886         struct tcphdr *th = (struct tcphdr *)skb->data;
1887
1888         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1889 }
1890 EXPORT_SYMBOL(tcp_filter);
1891
1892 static void tcp_v4_restore_cb(struct sk_buff *skb)
1893 {
1894         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1895                 sizeof(struct inet_skb_parm));
1896 }
1897
1898 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1899                            const struct tcphdr *th)
1900 {
1901         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1902          * barrier() makes sure the compiler won't play aliasing games.
1903          */
1904         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1905                 sizeof(struct inet_skb_parm));
1906         barrier();
1907
1908         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1909         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1910                                     skb->len - th->doff * 4);
1911         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1912         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1913         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1914         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1915         TCP_SKB_CB(skb)->sacked  = 0;
1916         TCP_SKB_CB(skb)->has_rxtstamp =
1917                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1918 }
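/*
 * [Editor's illustrative sketch -- not part of this file's build.]
 * end_seq above is seq plus one unit each for SYN and FIN plus the
 * payload length (skb->len minus the header length th->doff * 4).  The
 * tiny host-order model below restates that arithmetic with a few
 * sanity checks.
 */
#if 0	/* usage sketch only -- never compiled here */
#include <assert.h>
#include <stdint.h>

/* SYN and FIN each consume one unit of sequence space, data consumes
 * its payload length.
 */
static uint32_t tcp_end_seq(uint32_t seq, int syn, int fin,
			    uint32_t skb_len, uint32_t doff_words)
{
	return seq + syn + fin + (skb_len - doff_words * 4);
}

int main(void)
{
	/* Pure ACK, 20-byte header, no payload: end_seq == seq. */
	assert(tcp_end_seq(1000, 0, 0, 20, 5) == 1000);
	/* SYN with no data still advances the sequence space by one. */
	assert(tcp_end_seq(1000, 1, 0, 20, 5) == 1001);
	/* 100 bytes of payload behind a 20-byte header. */
	assert(tcp_end_seq(1000, 0, 0, 120, 5) == 1100);
	return 0;
}
#endif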
1919
1920 /*
1921  *      From tcp_input.c
1922  */
1923
1924 int tcp_v4_rcv(struct sk_buff *skb)
1925 {
1926         struct net *net = dev_net(skb->dev);
1927         struct sk_buff *skb_to_free;
1928         int sdif = inet_sdif(skb);
1929         int dif = inet_iif(skb);
1930         const struct iphdr *iph;
1931         const struct tcphdr *th;
1932         bool refcounted;
1933         struct sock *sk;
1934         int ret;
1935
1936         if (skb->pkt_type != PACKET_HOST)
1937                 goto discard_it;
1938
1939         /* Count it even if it's bad */
1940         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1941
1942         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1943                 goto discard_it;
1944
1945         th = (const struct tcphdr *)skb->data;
1946
1947         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1948                 goto bad_packet;
1949         if (!pskb_may_pull(skb, th->doff * 4))
1950                 goto discard_it;
1951
1952         /* An explanation is required here, I think.
1953          * Packet length and doff are validated by header prediction,
1954          * provided the th->doff == 0 case has been eliminated.
1955          * So, we defer the checks. */
1956
1957         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1958                 goto csum_error;
1959
1960         th = (const struct tcphdr *)skb->data;
1961         iph = ip_hdr(skb);
1962 lookup:
1963         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1964                                th->dest, sdif, &refcounted);
1965         if (!sk)
1966                 goto no_tcp_socket;
1967
1968 process:
1969         if (sk->sk_state == TCP_TIME_WAIT)
1970                 goto do_time_wait;
1971
1972         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1973                 struct request_sock *req = inet_reqsk(sk);
1974                 bool req_stolen = false;
1975                 struct sock *nsk;
1976
1977                 sk = req->rsk_listener;
1978                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1979                         sk_drops_add(sk, skb);
1980                         reqsk_put(req);
1981                         goto discard_it;
1982                 }
1983                 if (tcp_checksum_complete(skb)) {
1984                         reqsk_put(req);
1985                         goto csum_error;
1986                 }
1987                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1988                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1989                         goto lookup;
1990                 }
1991                 /* We own a reference on the listener, increase it again
1992                  * as we might lose it too soon.
1993                  */
1994                 sock_hold(sk);
1995                 refcounted = true;
1996                 nsk = NULL;
1997                 if (!tcp_filter(sk, skb)) {
1998                         th = (const struct tcphdr *)skb->data;
1999                         iph = ip_hdr(skb);
2000                         tcp_v4_fill_cb(skb, iph, th);
2001                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2002                 }
2003                 if (!nsk) {
2004                         reqsk_put(req);
2005                         if (req_stolen) {
2006                                 /* Another cpu got exclusive access to req
2007                                  * and created a full blown socket.
2008                                  * Try to feed this packet to this socket
2009                                  * instead of discarding it.
2010                                  */
2011                                 tcp_v4_restore_cb(skb);
2012                                 sock_put(sk);
2013                                 goto lookup;
2014                         }
2015                         goto discard_and_relse;
2016                 }
2017                 if (nsk == sk) {
2018                         reqsk_put(req);
2019                         tcp_v4_restore_cb(skb);
2020                 } else if (tcp_child_process(sk, nsk, skb)) {
2021                         tcp_v4_send_reset(nsk, skb);
2022                         goto discard_and_relse;
2023                 } else {
2024                         sock_put(sk);
2025                         return 0;
2026                 }
2027         }
2028         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2029                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2030                 goto discard_and_relse;
2031         }
2032
2033         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2034                 goto discard_and_relse;
2035
2036         if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2037                 goto discard_and_relse;
2038
2039         nf_reset_ct(skb);
2040
2041         if (tcp_filter(sk, skb))
2042                 goto discard_and_relse;
2043         th = (const struct tcphdr *)skb->data;
2044         iph = ip_hdr(skb);
2045         tcp_v4_fill_cb(skb, iph, th);
2046
2047         skb->dev = NULL;
2048
2049         if (sk->sk_state == TCP_LISTEN) {
2050                 ret = tcp_v4_do_rcv(sk, skb);
2051                 goto put_and_return;
2052         }
2053
2054         sk_incoming_cpu_update(sk);
2055
2056         bh_lock_sock_nested(sk);
2057         tcp_segs_in(tcp_sk(sk), skb);
2058         ret = 0;
2059         if (!sock_owned_by_user(sk)) {
2060                 skb_to_free = sk->sk_rx_skb_cache;
2061                 sk->sk_rx_skb_cache = NULL;
2062                 ret = tcp_v4_do_rcv(sk, skb);
2063         } else {
2064                 if (tcp_add_backlog(sk, skb))
2065                         goto discard_and_relse;
2066                 skb_to_free = NULL;
2067         }
2068         bh_unlock_sock(sk);
2069         if (skb_to_free)
2070                 __kfree_skb(skb_to_free);
2071
2072 put_and_return:
2073         if (refcounted)
2074                 sock_put(sk);
2075
2076         return ret;
2077
2078 no_tcp_socket:
2079         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2080                 goto discard_it;
2081
2082         tcp_v4_fill_cb(skb, iph, th);
2083
2084         if (tcp_checksum_complete(skb)) {
2085 csum_error:
2086                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2087 bad_packet:
2088                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2089         } else {
2090                 tcp_v4_send_reset(NULL, skb);
2091         }
2092
2093 discard_it:
2094         /* Discard frame. */
2095         kfree_skb(skb);
2096         return 0;
2097
2098 discard_and_relse:
2099         sk_drops_add(sk, skb);
2100         if (refcounted)
2101                 sock_put(sk);
2102         goto discard_it;
2103
2104 do_time_wait:
2105         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2106                 inet_twsk_put(inet_twsk(sk));
2107                 goto discard_it;
2108         }
2109
2110         tcp_v4_fill_cb(skb, iph, th);
2111
2112         if (tcp_checksum_complete(skb)) {
2113                 inet_twsk_put(inet_twsk(sk));
2114                 goto csum_error;
2115         }
2116         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2117         case TCP_TW_SYN: {
2118                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2119                                                         &tcp_hashinfo, skb,
2120                                                         __tcp_hdrlen(th),
2121                                                         iph->saddr, th->source,
2122                                                         iph->daddr, th->dest,
2123                                                         inet_iif(skb),
2124                                                         sdif);
2125                 if (sk2) {
2126                         inet_twsk_deschedule_put(inet_twsk(sk));
2127                         sk = sk2;
2128                         tcp_v4_restore_cb(skb);
2129                         refcounted = false;
2130                         goto process;
2131                 }
2132         }
2133                 /* to ACK */
2134                 fallthrough;
2135         case TCP_TW_ACK:
2136                 tcp_v4_timewait_ack(sk, skb);
2137                 break;
2138         case TCP_TW_RST:
2139                 tcp_v4_send_reset(sk, skb);
2140                 inet_twsk_deschedule_put(inet_twsk(sk));
2141                 goto discard_it;
2142         case TCP_TW_SUCCESS:;
2143         }
2144         goto discard_it;
2145 }
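/*
 * [Editor's illustrative sketch -- not part of this file's build.]
 * The TCP_MIB_* counters bumped in tcp_v4_rcv() (InSegs, InErrs,
 * InCsumErrors, ...) are exported on the "Tcp:" lines of
 * /proc/net/snmp as a name line followed by a value line.  The sketch
 * below simply prints those two lines for inspection.
 */
#if 0	/* usage sketch only -- never compiled here */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[1024];
	FILE *f = fopen("/proc/net/snmp", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (strncmp(line, "Tcp:", 4) == 0)
			fputs(line, stdout);	/* header line, then values */
	fclose(f);
	return 0;
}
#endif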
2146
2147 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2148         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2149         .twsk_unique    = tcp_twsk_unique,
2150         .twsk_destructor= tcp_twsk_destructor,
2151 };
2152
2153 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2154 {
2155         struct dst_entry *dst = skb_dst(skb);
2156
2157         if (dst && dst_hold_safe(dst)) {
2158                 sk->sk_rx_dst = dst;
2159                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2160         }
2161 }
2162 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2163
2164 const struct inet_connection_sock_af_ops ipv4_specific = {
2165         .queue_xmit        = ip_queue_xmit,
2166         .send_check        = tcp_v4_send_check,
2167         .rebuild_header    = inet_sk_rebuild_header,
2168         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2169         .conn_request      = tcp_v4_conn_request,
2170         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2171         .net_header_len    = sizeof(struct iphdr),
2172         .setsockopt        = ip_setsockopt,
2173         .getsockopt        = ip_getsockopt,
2174         .addr2sockaddr     = inet_csk_addr2sockaddr,
2175         .sockaddr_len      = sizeof(struct sockaddr_in),
2176         .mtu_reduced       = tcp_v4_mtu_reduced,
2177 };
2178 EXPORT_SYMBOL(ipv4_specific);
2179
2180 #ifdef CONFIG_TCP_MD5SIG
2181 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2182         .md5_lookup             = tcp_v4_md5_lookup,
2183         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2184         .md5_parse              = tcp_v4_parse_md5_keys,
2185 };
2186 #endif
2187
2188 /* NOTE: A lot of things are set to zero explicitly by the call to
2189  *       sk_alloc(), so they need not be done here.
2190  */
2191 static int tcp_v4_init_sock(struct sock *sk)
2192 {
2193         struct inet_connection_sock *icsk = inet_csk(sk);
2194
2195         tcp_init_sock(sk);
2196
2197         icsk->icsk_af_ops = &ipv4_specific;
2198
2199 #ifdef CONFIG_TCP_MD5SIG
2200         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2201 #endif
2202
2203         return 0;
2204 }
2205
2206 void tcp_v4_destroy_sock(struct sock *sk)
2207 {
2208         struct tcp_sock *tp = tcp_sk(sk);
2209
2210         trace_tcp_destroy_sock(sk);
2211
2212         tcp_clear_xmit_timers(sk);
2213
2214         tcp_cleanup_congestion_control(sk);
2215
2216         tcp_cleanup_ulp(sk);
2217
2218         /* Clean up the write buffer. */
2219         tcp_write_queue_purge(sk);
2220
2221         /* Check if we want to disable active TFO */
2222         tcp_fastopen_active_disable_ofo_check(sk);
2223
2224         /* Cleans up our, hopefully empty, out_of_order_queue. */
2225         skb_rbtree_purge(&tp->out_of_order_queue);
2226
2227 #ifdef CONFIG_TCP_MD5SIG
2228         /* Clean up the MD5 key list, if any */
2229         if (tp->md5sig_info) {
2230                 tcp_clear_md5_list(sk);
2231                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2232                 tp->md5sig_info = NULL;
2233         }
2234 #endif
2235
2236         /* Clean up a referenced TCP bind bucket. */
2237         if (inet_csk(sk)->icsk_bind_hash)
2238                 inet_put_port(sk);
2239
2240         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2241
2242         /* If socket is aborted during connect operation */
2243         tcp_free_fastopen_req(tp);
2244         tcp_fastopen_destroy_cipher(sk);
2245         tcp_saved_syn_free(tp);
2246
2247         sk_sockets_allocated_dec(sk);
2248 }
2249 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2250
2251 #ifdef CONFIG_PROC_FS
2252 /* Proc filesystem TCP sock list dumping. */
2253
2254 /*
2255  * Get the next listener socket following cur.  If cur is NULL, get the first socket
2256  * starting from bucket given in st->bucket; when st->bucket is zero the
2257  * very first socket in the hash table is returned.
2258  */
2259 static void *listening_get_next(struct seq_file *seq, void *cur)
2260 {
2261         struct tcp_seq_afinfo *afinfo;
2262         struct tcp_iter_state *st = seq->private;
2263         struct net *net = seq_file_net(seq);
2264         struct inet_listen_hashbucket *ilb;
2265         struct hlist_nulls_node *node;
2266         struct sock *sk = cur;
2267
2268         if (st->bpf_seq_afinfo)
2269                 afinfo = st->bpf_seq_afinfo;
2270         else
2271                 afinfo = PDE_DATA(file_inode(seq->file));
2272
2273         if (!sk) {
2274 get_head:
2275                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2276                 spin_lock(&ilb->lock);
2277                 sk = sk_nulls_head(&ilb->nulls_head);
2278                 st->offset = 0;
2279                 goto get_sk;
2280         }
2281         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2282         ++st->num;
2283         ++st->offset;
2284
2285         sk = sk_nulls_next(sk);
2286 get_sk:
2287         sk_nulls_for_each_from(sk, node) {
2288                 if (!net_eq(sock_net(sk), net))
2289                         continue;
2290                 if (afinfo->family == AF_UNSPEC ||
2291                     sk->sk_family == afinfo->family)
2292                         return sk;
2293         }
2294         spin_unlock(&ilb->lock);
2295         st->offset = 0;
2296         if (++st->bucket < INET_LHTABLE_SIZE)
2297                 goto get_head;
2298         return NULL;
2299 }
2300
2301 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2302 {
2303         struct tcp_iter_state *st = seq->private;
2304         void *rc;
2305
2306         st->bucket = 0;
2307         st->offset = 0;
2308         rc = listening_get_next(seq, NULL);
2309
2310         while (rc && *pos) {
2311                 rc = listening_get_next(seq, rc);
2312                 --*pos;
2313         }
2314         return rc;
2315 }
2316
2317 static inline bool empty_bucket(const struct tcp_iter_state *st)
2318 {
2319         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2320 }
2321
2322 /*
2323  * Get first established socket starting from bucket given in st->bucket.
2324  * If st->bucket is zero, the very first socket in the hash is returned.
2325  */
2326 static void *established_get_first(struct seq_file *seq)
2327 {
2328         struct tcp_seq_afinfo *afinfo;
2329         struct tcp_iter_state *st = seq->private;
2330         struct net *net = seq_file_net(seq);
2331         void *rc = NULL;
2332
2333         if (st->bpf_seq_afinfo)
2334                 afinfo = st->bpf_seq_afinfo;
2335         else
2336                 afinfo = PDE_DATA(file_inode(seq->file));
2337
2338         st->offset = 0;
2339         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2340                 struct sock *sk;
2341                 struct hlist_nulls_node *node;
2342                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2343
2344                 /* Lockless fast path for the common case of empty buckets */
2345                 if (empty_bucket(st))
2346                         continue;
2347
2348                 spin_lock_bh(lock);
2349                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2350                         if ((afinfo->family != AF_UNSPEC &&
2351                              sk->sk_family != afinfo->family) ||
2352                             !net_eq(sock_net(sk), net)) {
2353                                 continue;
2354                         }
2355                         rc = sk;
2356                         goto out;
2357                 }
2358                 spin_unlock_bh(lock);
2359         }
2360 out:
2361         return rc;
2362 }
2363
2364 static void *established_get_next(struct seq_file *seq, void *cur)
2365 {
2366         struct tcp_seq_afinfo *afinfo;
2367         struct sock *sk = cur;
2368         struct hlist_nulls_node *node;
2369         struct tcp_iter_state *st = seq->private;
2370         struct net *net = seq_file_net(seq);
2371
2372         if (st->bpf_seq_afinfo)
2373                 afinfo = st->bpf_seq_afinfo;
2374         else
2375                 afinfo = PDE_DATA(file_inode(seq->file));
2376
2377         ++st->num;
2378         ++st->offset;
2379
2380         sk = sk_nulls_next(sk);
2381
2382         sk_nulls_for_each_from(sk, node) {
2383                 if ((afinfo->family == AF_UNSPEC ||
2384                      sk->sk_family == afinfo->family) &&
2385                     net_eq(sock_net(sk), net))
2386                         return sk;
2387         }
2388
2389         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2390         ++st->bucket;
2391         return established_get_first(seq);
2392 }
2393
2394 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2395 {
2396         struct tcp_iter_state *st = seq->private;
2397         void *rc;
2398
2399         st->bucket = 0;
2400         rc = established_get_first(seq);
2401
2402         while (rc && pos) {
2403                 rc = established_get_next(seq, rc);
2404                 --pos;
2405         }
2406         return rc;
2407 }
2408
2409 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2410 {
2411         void *rc;
2412         struct tcp_iter_state *st = seq->private;
2413
2414         st->state = TCP_SEQ_STATE_LISTENING;
2415         rc        = listening_get_idx(seq, &pos);
2416
2417         if (!rc) {
2418                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2419                 rc        = established_get_idx(seq, pos);
2420         }
2421
2422         return rc;
2423 }
2424
2425 static void *tcp_seek_last_pos(struct seq_file *seq)
2426 {
2427         struct tcp_iter_state *st = seq->private;
2428         int offset = st->offset;
2429         int orig_num = st->num;
2430         void *rc = NULL;
2431
2432         switch (st->state) {
2433         case TCP_SEQ_STATE_LISTENING:
2434                 if (st->bucket >= INET_LHTABLE_SIZE)
2435                         break;
2436                 st->state = TCP_SEQ_STATE_LISTENING;
2437                 rc = listening_get_next(seq, NULL);
2438                 while (offset-- && rc)
2439                         rc = listening_get_next(seq, rc);
2440                 if (rc)
2441                         break;
2442                 st->bucket = 0;
2443                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2444                 fallthrough;
2445         case TCP_SEQ_STATE_ESTABLISHED:
2446                 if (st->bucket > tcp_hashinfo.ehash_mask)
2447                         break;
2448                 rc = established_get_first(seq);
2449                 while (offset-- && rc)
2450                         rc = established_get_next(seq, rc);
2451         }
2452
2453         st->num = orig_num;
2454
2455         return rc;
2456 }
2457
2458 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2459 {
2460         struct tcp_iter_state *st = seq->private;
2461         void *rc;
2462
2463         if (*pos && *pos == st->last_pos) {
2464                 rc = tcp_seek_last_pos(seq);
2465                 if (rc)
2466                         goto out;
2467         }
2468
2469         st->state = TCP_SEQ_STATE_LISTENING;
2470         st->num = 0;
2471         st->bucket = 0;
2472         st->offset = 0;
2473         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2474
2475 out:
2476         st->last_pos = *pos;
2477         return rc;
2478 }
2479 EXPORT_SYMBOL(tcp_seq_start);
2480
2481 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2482 {
2483         struct tcp_iter_state *st = seq->private;
2484         void *rc = NULL;
2485
2486         if (v == SEQ_START_TOKEN) {
2487                 rc = tcp_get_idx(seq, 0);
2488                 goto out;
2489         }
2490
2491         switch (st->state) {
2492         case TCP_SEQ_STATE_LISTENING:
2493                 rc = listening_get_next(seq, v);
2494                 if (!rc) {
2495                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2496                         st->bucket = 0;
2497                         st->offset = 0;
2498                         rc        = established_get_first(seq);
2499                 }
2500                 break;
2501         case TCP_SEQ_STATE_ESTABLISHED:
2502                 rc = established_get_next(seq, v);
2503                 break;
2504         }
2505 out:
2506         ++*pos;
2507         st->last_pos = *pos;
2508         return rc;
2509 }
2510 EXPORT_SYMBOL(tcp_seq_next);
2511
2512 void tcp_seq_stop(struct seq_file *seq, void *v)
2513 {
2514         struct tcp_iter_state *st = seq->private;
2515
2516         switch (st->state) {
2517         case TCP_SEQ_STATE_LISTENING:
2518                 if (v != SEQ_START_TOKEN)
2519                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2520                 break;
2521         case TCP_SEQ_STATE_ESTABLISHED:
2522                 if (v)
2523                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2524                 break;
2525         }
2526 }
2527 EXPORT_SYMBOL(tcp_seq_stop);
2528
2529 static void get_openreq4(const struct request_sock *req,
2530                          struct seq_file *f, int i)
2531 {
2532         const struct inet_request_sock *ireq = inet_rsk(req);
2533         long delta = req->rsk_timer.expires - jiffies;
2534
2535         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2536                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2537                 i,
2538                 ireq->ir_loc_addr,
2539                 ireq->ir_num,
2540                 ireq->ir_rmt_addr,
2541                 ntohs(ireq->ir_rmt_port),
2542                 TCP_SYN_RECV,
2543                 0, 0, /* could print option size, but that is af dependent. */
2544                 1,    /* timers active (only the expire timer) */
2545                 jiffies_delta_to_clock_t(delta),
2546                 req->num_timeout,
2547                 from_kuid_munged(seq_user_ns(f),
2548                                  sock_i_uid(req->rsk_listener)),
2549                 0,  /* non standard timer */
2550                 0, /* open_requests have no inode */
2551                 0,
2552                 req);
2553 }
2554
2555 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2556 {
2557         int timer_active;
2558         unsigned long timer_expires;
2559         const struct tcp_sock *tp = tcp_sk(sk);
2560         const struct inet_connection_sock *icsk = inet_csk(sk);
2561         const struct inet_sock *inet = inet_sk(sk);
2562         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2563         __be32 dest = inet->inet_daddr;
2564         __be32 src = inet->inet_rcv_saddr;
2565         __u16 destp = ntohs(inet->inet_dport);
2566         __u16 srcp = ntohs(inet->inet_sport);
2567         int rx_queue;
2568         int state;
2569
2570         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2571             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2572             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2573                 timer_active    = 1;
2574                 timer_expires   = icsk->icsk_timeout;
2575         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2576                 timer_active    = 4;
2577                 timer_expires   = icsk->icsk_timeout;
2578         } else if (timer_pending(&sk->sk_timer)) {
2579                 timer_active    = 2;
2580                 timer_expires   = sk->sk_timer.expires;
2581         } else {
2582                 timer_active    = 0;
2583                 timer_expires = jiffies;
2584         }
2585
2586         state = inet_sk_state_load(sk);
2587         if (state == TCP_LISTEN)
2588                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2589         else
2590                 /* Because we don't lock the socket,
2591                  * we might find a transient negative value.
2592                  */
2593                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2594                                       READ_ONCE(tp->copied_seq), 0);
2595
2596         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2597                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2598                 i, src, srcp, dest, destp, state,
2599                 READ_ONCE(tp->write_seq) - tp->snd_una,
2600                 rx_queue,
2601                 timer_active,
2602                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2603                 icsk->icsk_retransmits,
2604                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2605                 icsk->icsk_probes_out,
2606                 sock_i_ino(sk),
2607                 refcount_read(&sk->sk_refcnt), sk,
2608                 jiffies_to_clock_t(icsk->icsk_rto),
2609                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2610                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2611                 tp->snd_cwnd,
2612                 state == TCP_LISTEN ?
2613                     fastopenq->max_qlen :
2614                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2615 }
2616
2617 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2618                                struct seq_file *f, int i)
2619 {
2620         long delta = tw->tw_timer.expires - jiffies;
2621         __be32 dest, src;
2622         __u16 destp, srcp;
2623
2624         dest  = tw->tw_daddr;
2625         src   = tw->tw_rcv_saddr;
2626         destp = ntohs(tw->tw_dport);
2627         srcp  = ntohs(tw->tw_sport);
2628
2629         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2630                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2631                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2632                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2633                 refcount_read(&tw->tw_refcnt), tw);
2634 }
2635
2636 #define TMPSZ 150
2637
2638 static int tcp4_seq_show(struct seq_file *seq, void *v)
2639 {
2640         struct tcp_iter_state *st;
2641         struct sock *sk = v;
2642
2643         seq_setwidth(seq, TMPSZ - 1);
2644         if (v == SEQ_START_TOKEN) {
2645                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2646                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2647                            "inode");
2648                 goto out;
2649         }
2650         st = seq->private;
2651
2652         if (sk->sk_state == TCP_TIME_WAIT)
2653                 get_timewait4_sock(v, seq, st->num);
2654         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2655                 get_openreq4(v, seq, st->num);
2656         else
2657                 get_tcp4_sock(v, seq, st->num);
2658 out:
2659         seq_pad(seq, '\n');
2660         return 0;
2661 }
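/*
 * [Editor's illustrative sketch -- not part of this file's build.]
 * get_tcp4_sock() above emits each socket as fixed-width hex fields in
 * /proc/net/tcp.  The parser below recovers addresses and ports from
 * the first few columns; it assumes a 32-bit unsigned int so the
 * scanned word can be handed to inet_ntop() directly.
 */
#if 0	/* usage sketch only -- never compiled here */
#include <stdio.h>
#include <arpa/inet.h>
#include <netinet/in.h>

int main(void)
{
	char line[512], lbuf[INET_ADDRSTRLEN], rbuf[INET_ADDRSTRLEN];
	unsigned int laddr, lport, raddr, rport, state;
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	if (!fgets(line, sizeof(line), f)) {	/* skip the header line */
		fclose(f);
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, " %*d: %8X:%4X %8X:%4X %2X",
			   &laddr, &lport, &raddr, &rport, &state) != 5)
			continue;
		/* The scanned value stores the original network-order bytes
		 * in memory, so inet_ntop() yields the dotted-quad form.
		 */
		inet_ntop(AF_INET, &laddr, lbuf, sizeof(lbuf));
		inet_ntop(AF_INET, &raddr, rbuf, sizeof(rbuf));
		printf("%s:%u -> %s:%u state 0x%02X\n",
		       lbuf, lport, rbuf, rport, state);
	}
	fclose(f);
	return 0;
}
#endif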
2662
2663 #ifdef CONFIG_BPF_SYSCALL
2664 struct bpf_iter__tcp {
2665         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2666         __bpf_md_ptr(struct sock_common *, sk_common);
2667         uid_t uid __aligned(8);
2668 };
2669
2670 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2671                              struct sock_common *sk_common, uid_t uid)
2672 {
2673         struct bpf_iter__tcp ctx;
2674
2675         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2676         ctx.meta = meta;
2677         ctx.sk_common = sk_common;
2678         ctx.uid = uid;
2679         return bpf_iter_run_prog(prog, &ctx);
2680 }
2681
2682 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2683 {
2684         struct bpf_iter_meta meta;
2685         struct bpf_prog *prog;
2686         struct sock *sk = v;
2687         uid_t uid;
2688
2689         if (v == SEQ_START_TOKEN)
2690                 return 0;
2691
2692         if (sk->sk_state == TCP_TIME_WAIT) {
2693                 uid = 0;
2694         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2695                 const struct request_sock *req = v;
2696
2697                 uid = from_kuid_munged(seq_user_ns(seq),
2698                                        sock_i_uid(req->rsk_listener));
2699         } else {
2700                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2701         }
2702
2703         meta.seq = seq;
2704         prog = bpf_iter_get_info(&meta, false);
2705         return tcp_prog_seq_show(prog, &meta, v, uid);
2706 }
2707
2708 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2709 {
2710         struct bpf_iter_meta meta;
2711         struct bpf_prog *prog;
2712
2713         if (!v) {
2714                 meta.seq = seq;
2715                 prog = bpf_iter_get_info(&meta, true);
2716                 if (prog)
2717                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
2718         }
2719
2720         tcp_seq_stop(seq, v);
2721 }
2722
2723 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2724         .show           = bpf_iter_tcp_seq_show,
2725         .start          = tcp_seq_start,
2726         .next           = tcp_seq_next,
2727         .stop           = bpf_iter_tcp_seq_stop,
2728 };
2729 #endif
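/*
 * [Editor's illustrative sketch -- not part of this file's build.]
 * bpf_iter_tcp_seq_show() hands each socket to a program attached to
 * the "tcp" iterator with the bpf_iter__tcp context defined above.  A
 * minimal BPF-side sketch follows, modelled on the kernel selftests; it
 * assumes a generated vmlinux.h and a libbpf recent enough to provide
 * BPF_SEQ_PRINTF in <bpf/bpf_tracing.h>.  Object and pin names are
 * placeholders (e.g. "bpftool iter pin ./tcp_iter.o /sys/fs/bpf/tcp_iter",
 * then read the pinned file to run the iterator).
 */
#if 0	/* usage sketch only -- never compiled here */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/tcp")
int dump_tcp(struct bpf_iter__tcp *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct sock_common *skc = ctx->sk_common;

	if (!skc)
		return 0;
	/* Each socket visited by the seq_show callback above lands here. */
	BPF_SEQ_PRINTF(seq, "family=%d state=%d uid=%u\n",
		       skc->skc_family, skc->skc_state, ctx->uid);
	return 0;
}
#endif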
2730
2731 static const struct seq_operations tcp4_seq_ops = {
2732         .show           = tcp4_seq_show,
2733         .start          = tcp_seq_start,
2734         .next           = tcp_seq_next,
2735         .stop           = tcp_seq_stop,
2736 };
2737
2738 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2739         .family         = AF_INET,
2740 };
2741
2742 static int __net_init tcp4_proc_init_net(struct net *net)
2743 {
2744         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2745                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2746                 return -ENOMEM;
2747         return 0;
2748 }
2749
2750 static void __net_exit tcp4_proc_exit_net(struct net *net)
2751 {
2752         remove_proc_entry("tcp", net->proc_net);
2753 }
2754
2755 static struct pernet_operations tcp4_net_ops = {
2756         .init = tcp4_proc_init_net,
2757         .exit = tcp4_proc_exit_net,
2758 };
2759
2760 int __init tcp4_proc_init(void)
2761 {
2762         return register_pernet_subsys(&tcp4_net_ops);
2763 }
2764
2765 void tcp4_proc_exit(void)
2766 {
2767         unregister_pernet_subsys(&tcp4_net_ops);
2768 }
2769 #endif /* CONFIG_PROC_FS */
2770
2771 /* @wake is one when sk_stream_write_space() calls us.
2772  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
2773  * This mimics the strategy used in sock_def_write_space().
2774  */
2775 bool tcp_stream_memory_free(const struct sock *sk, int wake)
2776 {
2777         const struct tcp_sock *tp = tcp_sk(sk);
2778         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
2779                             READ_ONCE(tp->snd_nxt);
2780
2781         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
2782 }
2783 EXPORT_SYMBOL(tcp_stream_memory_free);
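/*
 * [Editor's illustrative sketch -- not part of this file's build.]
 * tcp_stream_memory_free() is what makes TCP_NOTSENT_LOWAT (or the
 * tcp_notsent_lowat sysctl) gate POLLOUT/EPOLLOUT wakeups.  The sketch
 * below caps unsent data at an arbitrary 128KB; it assumes the
 * toolchain's <netinet/tcp.h> defines TCP_NOTSENT_LOWAT.
 */
#if 0	/* usage sketch only -- never compiled here */
#include <stdio.h>
#include <poll.h>
#include <netinet/in.h>
#include <netinet/tcp.h>	/* TCP_NOTSENT_LOWAT */
#include <sys/socket.h>

static int set_notsent_lowat(int fd, int bytes)
{
	return setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
			  &bytes, sizeof(bytes));
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct pollfd pfd = { .fd = fd, .events = POLLOUT };

	if (fd < 0 || set_notsent_lowat(fd, 128 * 1024) < 0) {
		perror("TCP_NOTSENT_LOWAT");
		return 1;
	}
	/* After connect(), poll() reports POLLOUT only while unsent data
	 * stays under the configured threshold.
	 */
	poll(&pfd, 1, 0);
	return 0;
}
#endif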
2784
2785 struct proto tcp_prot = {
2786         .name                   = "TCP",
2787         .owner                  = THIS_MODULE,
2788         .close                  = tcp_close,
2789         .pre_connect            = tcp_v4_pre_connect,
2790         .connect                = tcp_v4_connect,
2791         .disconnect             = tcp_disconnect,
2792         .accept                 = inet_csk_accept,
2793         .ioctl                  = tcp_ioctl,
2794         .init                   = tcp_v4_init_sock,
2795         .destroy                = tcp_v4_destroy_sock,
2796         .shutdown               = tcp_shutdown,
2797         .setsockopt             = tcp_setsockopt,
2798         .getsockopt             = tcp_getsockopt,
2799         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
2800         .keepalive              = tcp_set_keepalive,
2801         .recvmsg                = tcp_recvmsg,
2802         .sendmsg                = tcp_sendmsg,
2803         .sendpage               = tcp_sendpage,
2804         .backlog_rcv            = tcp_v4_do_rcv,
2805         .release_cb             = tcp_release_cb,
2806         .hash                   = inet_hash,
2807         .unhash                 = inet_unhash,
2808         .get_port               = inet_csk_get_port,
2809         .enter_memory_pressure  = tcp_enter_memory_pressure,
2810         .leave_memory_pressure  = tcp_leave_memory_pressure,
2811         .stream_memory_free     = tcp_stream_memory_free,
2812         .sockets_allocated      = &tcp_sockets_allocated,
2813         .orphan_count           = &tcp_orphan_count,
2814         .memory_allocated       = &tcp_memory_allocated,
2815         .memory_pressure        = &tcp_memory_pressure,
2816         .sysctl_mem             = sysctl_tcp_mem,
2817         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2818         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2819         .max_header             = MAX_TCP_HEADER,
2820         .obj_size               = sizeof(struct tcp_sock),
2821         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2822         .twsk_prot              = &tcp_timewait_sock_ops,
2823         .rsk_prot               = &tcp_request_sock_ops,
2824         .h.hashinfo             = &tcp_hashinfo,
2825         .no_autobind            = true,
2826         .diag_destroy           = tcp_abort,
2827 };
2828 EXPORT_SYMBOL(tcp_prot);
2829
2830 static void __net_exit tcp_sk_exit(struct net *net)
2831 {
2832         int cpu;
2833
2834         if (net->ipv4.tcp_congestion_control)
2835                 bpf_module_put(net->ipv4.tcp_congestion_control,
2836                                net->ipv4.tcp_congestion_control->owner);
2837
2838         for_each_possible_cpu(cpu)
2839                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2840         free_percpu(net->ipv4.tcp_sk);
2841 }
2842
2843 static int __net_init tcp_sk_init(struct net *net)
2844 {
2845         int res, cpu, cnt;
2846
2847         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2848         if (!net->ipv4.tcp_sk)
2849                 return -ENOMEM;
2850
2851         for_each_possible_cpu(cpu) {
2852                 struct sock *sk;
2853
2854                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2855                                            IPPROTO_TCP, net);
2856                 if (res)
2857                         goto fail;
2858                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2859
2860                 /* Please enforce IP_DF and IPID==0 for RST and
2861                  * ACK sent in SYN-RECV and TIME-WAIT state.
2862                  */
2863                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2864
2865                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2866         }
2867
2868         net->ipv4.sysctl_tcp_ecn = 2;
2869         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2870
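             /* Path MTU probing (RFC 4821) parameters: where the MSS search
              * starts, the lower bounds enforced on the sender MSS, and the
              * convergence threshold / reprobe interval for the search.
              */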
2871         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2872         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2873         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2874         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2875         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2876
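             /* Keepalive defaults: first probe after 2 hours of idle, then up
              * to 9 probes 75 seconds apart.  Each value is a per-namespace
              * sysctl, e.g. (illustrative values):
              *   sysctl -w net.ipv4.tcp_keepalive_time=600
              *   sysctl -w net.ipv4.tcp_keepalive_intvl=30
              *   sysctl -w net.ipv4.tcp_keepalive_probes=5
              */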
2877         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2878         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2879         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2880
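             /* Connection setup/teardown defaults.  SYN cookies are enabled
              * (used when CONFIG_SYN_COOKIES is built and the listen backlog
              * overflows), tw_reuse == 2 limits TIME-WAIT reuse to loopback
              * traffic, and orphan_retries == 0 means "use the built-in
              * default" rather than "never retry".
              */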
2881         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2882         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2883         net->ipv4.sysctl_tcp_syncookies = 1;
2884         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2885         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2886         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2887         net->ipv4.sysctl_tcp_orphan_retries = 0;
2888         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2889         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2890         net->ipv4.sysctl_tcp_tw_reuse = 2;
2891         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2892
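             /* Scale TIME-WAIT and SYN-backlog limits from the size of the
              * established hash table: e.g. an ehash of 524288 buckets gives
              * max_tw_buckets = 262144 and, just below, max_syn_backlog = 4096.
              */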
2893         cnt = tcp_hashinfo.ehash_mask + 1;
2894         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2895         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2896
2897         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2898         net->ipv4.sysctl_tcp_sack = 1;
2899         net->ipv4.sysctl_tcp_window_scaling = 1;
2900         net->ipv4.sysctl_tcp_timestamps = 1;
2901         net->ipv4.sysctl_tcp_early_retrans = 3;
2902         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2903         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2904         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2905         net->ipv4.sysctl_tcp_max_reordering = 300;
2906         net->ipv4.sysctl_tcp_dsack = 1;
2907         net->ipv4.sysctl_tcp_app_win = 31;
2908         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2909         net->ipv4.sysctl_tcp_frto = 2;
2910         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2911         /* This limits the percentage of the congestion window which we
2912          * will allow a single TSO frame to consume.  Building TSO frames
2913          * which are too large can cause TCP streams to be bursty.
2914          */
2915         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2916         /* Default TSQ limit of 16 TSO segments */
2917         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2918         /* rfc5961 challenge ack rate limiting */
2919         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2920         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2921         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2922         net->ipv4.sysctl_tcp_autocorking = 1;
2923         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
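             /* Pacing rate is derived from the congestion-control rate,
              * roughly ratio/100 * cwnd * mss / srtt: 200% in slow start so
              * cwnd can still double every RTT, 120% in congestion avoidance.
              */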
2924         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2925         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
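             /* New namespaces inherit whatever tcp_{r,w}mem init_net is using
              * at creation time; init_net's own values are filled in by
              * tcp_init().
              */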
2926         if (net != &init_net) {
2927                 memcpy(net->ipv4.sysctl_tcp_rmem,
2928                        init_net.ipv4.sysctl_tcp_rmem,
2929                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2930                 memcpy(net->ipv4.sysctl_tcp_wmem,
2931                        init_net.ipv4.sysctl_tcp_wmem,
2932                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2933         }
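             /* SACK compression: delay a pure SACK by up to 1 ms plus 100 us
              * of slack, coalescing at most 44 of them into one ACK.
              */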
2934         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2935         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2936         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
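             /* TCP Fast Open defaults to client-side only; serving TFO also
              * needs the 0x2 bit of net.ipv4.tcp_fastopen.  After a suspected
              * middlebox drop, active TFO is paused, initially for an hour.
              */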
2937         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2938         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2939         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2940         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2941
2942         /* Inherit init_net's congestion control if possible; Reno is always built in */
2943         if (!net_eq(net, &init_net) &&
2944             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2945                                init_net.ipv4.tcp_congestion_control->owner))
2946                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2947         else
2948                 net->ipv4.tcp_congestion_control = &tcp_reno;
2949
2950         return 0;
2951 fail:
2952         tcp_sk_exit(net);
2953
2954         return res;
2955 }
2956
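     /* Batched namespace teardown: kill any TIME-WAIT sockets that still
      * belong to the exiting namespaces and free their TCP Fast Open key
      * contexts.
      */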
2957 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2958 {
2959         struct net *net;
2960
2961         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2962
2963         list_for_each_entry(net, net_exit_list, exit_list)
2964                 tcp_fastopen_ctx_destroy(net);
2965 }
2966
2967 static struct pernet_operations __net_initdata tcp_sk_ops = {
2968         .init           = tcp_sk_init,
2969         .exit           = tcp_sk_exit,
2970         .exit_batch     = tcp_sk_exit_batch,
2971 };
2972
2973 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
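     /* BPF iterator for TCP sockets.  DEFINE_BPF_ITER_FUNC() declares the
      * context an "iter/tcp" program sees: bpf_iter_meta plus the sock_common
      * and owning uid of each socket walked by bpf_iter_tcp_seq_ops.
      */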
2974 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2975                      struct sock_common *sk_common, uid_t uid)
2976
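     /* Per-iterator private state.  family is left AF_UNSPEC so one iterator
      * instance walks both IPv4 and IPv6 TCP sockets.
      */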
2977 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2978 {
2979         struct tcp_iter_state *st = priv_data;
2980         struct tcp_seq_afinfo *afinfo;
2981         int ret;
2982
2983         afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2984         if (!afinfo)
2985                 return -ENOMEM;
2986
2987         afinfo->family = AF_UNSPEC;
2988         st->bpf_seq_afinfo = afinfo;
2989         ret = bpf_iter_init_seq_net(priv_data, aux);
2990         if (ret)
2991                 kfree(afinfo);
2992         return ret;
2993 }
2994
2995 static void bpf_iter_fini_tcp(void *priv_data)
2996 {
2997         struct tcp_iter_state *st = priv_data;
2998
2999         kfree(st->bpf_seq_afinfo);
3000         bpf_iter_fini_seq_net(priv_data);
3001 }
3002
3003 static const struct bpf_iter_seq_info tcp_seq_info = {
3004         .seq_ops                = &bpf_iter_tcp_seq_ops,
3005         .init_seq_private       = bpf_iter_init_tcp,
3006         .fini_seq_private       = bpf_iter_fini_tcp,
3007         .seq_priv_size          = sizeof(struct tcp_iter_state),
3008 };
3009
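     /* Registration info for the "tcp" iterator target: programs take one
      * context argument, sk_common, typed as a BTF pointer that may be NULL
      * (it is NULL on the final invocation marking the end of iteration).
      */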
3010 static struct bpf_iter_reg tcp_reg_info = {
3011         .target                 = "tcp",
3012         .ctx_arg_info_size      = 1,
3013         .ctx_arg_info           = {
3014                 { offsetof(struct bpf_iter__tcp, sk_common),
3015                   PTR_TO_BTF_ID_OR_NULL },
3016         },
3017         .seq_info               = &tcp_seq_info,
3018 };
3019
3020 static void __init bpf_iter_register(void)
3021 {
3022         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3023         if (bpf_iter_reg_target(&tcp_reg_info))
3024                 pr_warn("Warning: could not register bpf iterator tcp\n");
3025 }
3026
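     /* With the target registered, an iter/tcp program can be pinned and read
      * like a file, e.g. (prog.o being a placeholder object containing such a
      * program):
      *   bpftool iter pin prog.o /sys/fs/bpf/tcp_dump
      *   cat /sys/fs/bpf/tcp_dump
      */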
3027 #endif
3028
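     /* Called from inet_init() (net/ipv4/af_inet.c) during boot, after
      * proto_register() has set up tcp_prot.
      */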
3029 void __init tcp_v4_init(void)
3030 {
3031         if (register_pernet_subsys(&tcp_sk_ops))
3032                 panic("Failed to create the TCP control socket.\n");
3033
3034 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3035         bpf_iter_register();
3036 #endif
3037 }