Merge branch 'opp/linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm
[linux-2.6-microblaze.git] / net / ipv4 / tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96         return secure_tcp_seq(ip_hdr(skb)->daddr,
97                               ip_hdr(skb)->saddr,
98                               tcp_hdr(skb)->dest,
99                               tcp_hdr(skb)->source);
100 }
101
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109         const struct inet_timewait_sock *tw = inet_twsk(sktw);
110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111         struct tcp_sock *tp = tcp_sk(sk);
112         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113
114         if (reuse == 2) {
115                 /* Still does not detect *everything* that goes through
116                  * lo, since we require a loopback src or dst address
117                  * or direct binding to 'lo' interface.
118                  */
119                 bool loopback = false;
120                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121                         loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123                 if (tw->tw_family == AF_INET6) {
124                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128                                 loopback = true;
129                 } else
130 #endif
131                 {
132                         if (ipv4_is_loopback(tw->tw_daddr) ||
133                             ipv4_is_loopback(tw->tw_rcv_saddr))
134                                 loopback = true;
135                 }
136                 if (!loopback)
137                         reuse = 0;
138         }
139
140         /* With PAWS, it is safe from the viewpoint
141            of data integrity. Even without PAWS it is safe provided sequence
142            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143
144            Actually, the idea is close to VJ's one, only timestamp cache is
145            held not per host, but per port pair and TW bucket is used as state
146            holder.
147
148            If TW bucket has been already destroyed we fall back to VJ's scheme
149            and use initial timestamp retrieved from peer table.
150          */
151         if (tcptw->tw_ts_recent_stamp &&
152             (!twp || (reuse && time_after32(ktime_get_seconds(),
153                                             tcptw->tw_ts_recent_stamp)))) {
154                 /* In case of repair and re-using TIME-WAIT sockets we still
155                  * want to be sure that it is safe as above but honor the
156                  * sequence numbers and time stamps set as part of the repair
157                  * process.
158                  *
159                  * Without this check re-using a TIME-WAIT socket with TCP
160                  * repair would accumulate a -1 on the repair assigned
161                  * sequence number. The first time it is reused the sequence
162                  * is -1, the second time -2, etc. This fixes that issue
163                  * without appearing to create any others.
164                  */
165                 if (likely(!tp->repair)) {
166                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167
168                         if (!seq)
169                                 seq = 1;
170                         WRITE_ONCE(tp->write_seq, seq);
171                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
172                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173                 }
174                 sock_hold(sktw);
175                 return 1;
176         }
177
178         return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183                               int addr_len)
184 {
185         /* This check is replicated from tcp_v4_connect() and intended to
186          * prevent BPF program called below from accessing bytes that are out
187          * of the bound specified by user in addr_len.
188          */
189         if (addr_len < sizeof(struct sockaddr_in))
190                 return -EINVAL;
191
192         sock_owned_by_me(sk);
193
194         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201         struct inet_sock *inet = inet_sk(sk);
202         struct tcp_sock *tp = tcp_sk(sk);
203         __be16 orig_sport, orig_dport;
204         __be32 daddr, nexthop;
205         struct flowi4 *fl4;
206         struct rtable *rt;
207         int err;
208         struct ip_options_rcu *inet_opt;
209         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210
211         if (addr_len < sizeof(struct sockaddr_in))
212                 return -EINVAL;
213
214         if (usin->sin_family != AF_INET)
215                 return -EAFNOSUPPORT;
216
217         nexthop = daddr = usin->sin_addr.s_addr;
218         inet_opt = rcu_dereference_protected(inet->inet_opt,
219                                              lockdep_sock_is_held(sk));
220         if (inet_opt && inet_opt->opt.srr) {
221                 if (!daddr)
222                         return -EINVAL;
223                 nexthop = inet_opt->opt.faddr;
224         }
225
226         orig_sport = inet->inet_sport;
227         orig_dport = usin->sin_port;
228         fl4 = &inet->cork.fl.u.ip4;
229         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231                               IPPROTO_TCP,
232                               orig_sport, orig_dport, sk);
233         if (IS_ERR(rt)) {
234                 err = PTR_ERR(rt);
235                 if (err == -ENETUNREACH)
236                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237                 return err;
238         }
239
240         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241                 ip_rt_put(rt);
242                 return -ENETUNREACH;
243         }
244
245         if (!inet_opt || !inet_opt->opt.srr)
246                 daddr = fl4->daddr;
247
248         if (!inet->inet_saddr)
249                 inet->inet_saddr = fl4->saddr;
250         sk_rcv_saddr_set(sk, inet->inet_saddr);
251
252         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253                 /* Reset inherited state */
254                 tp->rx_opt.ts_recent       = 0;
255                 tp->rx_opt.ts_recent_stamp = 0;
256                 if (likely(!tp->repair))
257                         WRITE_ONCE(tp->write_seq, 0);
258         }
259
260         inet->inet_dport = usin->sin_port;
261         sk_daddr_set(sk, daddr);
262
263         inet_csk(sk)->icsk_ext_hdr_len = 0;
264         if (inet_opt)
265                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266
267         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268
269         /* Socket identity is still unknown (sport may be zero).
270          * However we set state to SYN-SENT and not releasing socket
271          * lock select source port, enter ourselves into the hash tables and
272          * complete initialization after this.
273          */
274         tcp_set_state(sk, TCP_SYN_SENT);
275         err = inet_hash_connect(tcp_death_row, sk);
276         if (err)
277                 goto failure;
278
279         sk_set_txhash(sk);
280
281         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282                                inet->inet_sport, inet->inet_dport, sk);
283         if (IS_ERR(rt)) {
284                 err = PTR_ERR(rt);
285                 rt = NULL;
286                 goto failure;
287         }
288         /* OK, now commit destination to socket.  */
289         sk->sk_gso_type = SKB_GSO_TCPV4;
290         sk_setup_caps(sk, &rt->dst);
291         rt = NULL;
292
293         if (likely(!tp->repair)) {
294                 if (!tp->write_seq)
295                         WRITE_ONCE(tp->write_seq,
296                                    secure_tcp_seq(inet->inet_saddr,
297                                                   inet->inet_daddr,
298                                                   inet->inet_sport,
299                                                   usin->sin_port));
300                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301                                                  inet->inet_saddr,
302                                                  inet->inet_daddr);
303         }
304
305         inet->inet_id = prandom_u32();
306
307         if (tcp_fastopen_defer_connect(sk, &err))
308                 return err;
309         if (err)
310                 goto failure;
311
312         err = tcp_connect(sk);
313
314         if (err)
315                 goto failure;
316
317         return 0;
318
319 failure:
320         /*
321          * This unhashes the socket and releases the local port,
322          * if necessary.
323          */
324         tcp_set_state(sk, TCP_CLOSE);
325         ip_rt_put(rt);
326         sk->sk_route_caps = 0;
327         inet->inet_dport = 0;
328         return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
331
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334  * It can be called through tcp_release_cb() if socket was owned by user
335  * at the time tcp_v4_err() was called to handle ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339         struct inet_sock *inet = inet_sk(sk);
340         struct dst_entry *dst;
341         u32 mtu;
342
343         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344                 return;
345         mtu = tcp_sk(sk)->mtu_info;
346         dst = inet_csk_update_pmtu(sk, mtu);
347         if (!dst)
348                 return;
349
350         /* Something is about to be wrong... Remember soft error
351          * for the case, if this connection will not able to recover.
352          */
353         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354                 sk->sk_err_soft = EMSGSIZE;
355
356         mtu = dst_mtu(dst);
357
358         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359             ip_sk_accept_pmtu(sk) &&
360             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361                 tcp_sync_mss(sk, mtu);
362
363                 /* Resend the TCP packet because it's
364                  * clear that the old packet has been
365                  * dropped. This is the new "fast" path mtu
366                  * discovery.
367                  */
368                 tcp_simple_retransmit(sk);
369         } /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375         struct dst_entry *dst = __sk_dst_check(sk, 0);
376
377         if (dst)
378                 dst->ops->redirect(dst, sk, skb);
379 }
380
381
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385         struct request_sock *req = inet_reqsk(sk);
386         struct net *net = sock_net(sk);
387
388         /* ICMPs are not backlogged, hence we cannot get
389          * an established socket here.
390          */
391         if (seq != tcp_rsk(req)->snt_isn) {
392                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393         } else if (abort) {
394                 /*
395                  * Still in SYN_RECV, just remove it silently.
396                  * There is no good way to pass the error to the newly
397                  * created socket, and POSIX does not want network
398                  * errors returned from accept().
399                  */
400                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401                 tcp_listendrop(req->rsk_listener);
402         }
403         reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410         struct inet_connection_sock *icsk = inet_csk(sk);
411         struct tcp_sock *tp = tcp_sk(sk);
412         struct sk_buff *skb;
413         s32 remaining;
414         u32 delta_us;
415
416         if (sock_owned_by_user(sk))
417                 return;
418
419         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420             !icsk->icsk_backoff)
421                 return;
422
423         skb = tcp_rtx_queue_head(sk);
424         if (WARN_ON_ONCE(!skb))
425                 return;
426
427         icsk->icsk_backoff--;
428         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430
431         tcp_mstamp_refresh(tp);
432         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434
435         if (remaining > 0) {
436                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437                                           remaining, TCP_RTO_MAX);
438         } else {
439                 /* RTO revert clocked out retransmission.
440                  * Will retransmit now.
441                  */
442                 tcp_retransmit_timer(sk);
443         }
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment
452  * header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else accesses the socket the ICMP is just dropped
457  * and for some paths there is no check at all.
458  * A more general error queue to queue errors for later handling
459  * is probably better.
460  *
461  */
462
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465         const struct iphdr *iph = (const struct iphdr *)skb->data;
466         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467         struct tcp_sock *tp;
468         struct inet_sock *inet;
469         const int type = icmp_hdr(skb)->type;
470         const int code = icmp_hdr(skb)->code;
471         struct sock *sk;
472         struct request_sock *fastopen;
473         u32 seq, snd_una;
474         int err;
475         struct net *net = dev_net(skb->dev);
476
477         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478                                        th->dest, iph->saddr, ntohs(th->source),
479                                        inet_iif(skb), 0);
480         if (!sk) {
481                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482                 return -ENOENT;
483         }
484         if (sk->sk_state == TCP_TIME_WAIT) {
485                 inet_twsk_put(inet_twsk(sk));
486                 return 0;
487         }
488         seq = ntohl(th->seq);
489         if (sk->sk_state == TCP_NEW_SYN_RECV) {
490                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491                                      type == ICMP_TIME_EXCEEDED ||
492                                      (type == ICMP_DEST_UNREACH &&
493                                       (code == ICMP_NET_UNREACH ||
494                                        code == ICMP_HOST_UNREACH)));
495                 return 0;
496         }
497
498         bh_lock_sock(sk);
499         /* If too many ICMPs get dropped on busy
500          * servers this needs to be solved differently.
501          * We do take care of PMTU discovery (RFC1191) special case :
502          * we can receive locally generated ICMP messages while socket is held.
503          */
504         if (sock_owned_by_user(sk)) {
505                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507         }
508         if (sk->sk_state == TCP_CLOSE)
509                 goto out;
510
511         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513                 goto out;
514         }
515
516         tp = tcp_sk(sk);
517         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
518         fastopen = rcu_dereference(tp->fastopen_rsk);
519         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520         if (sk->sk_state != TCP_LISTEN &&
521             !between(seq, snd_una, tp->snd_nxt)) {
522                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523                 goto out;
524         }
525
526         switch (type) {
527         case ICMP_REDIRECT:
528                 if (!sock_owned_by_user(sk))
529                         do_redirect(skb, sk);
530                 goto out;
531         case ICMP_SOURCE_QUENCH:
532                 /* Just silently ignore these. */
533                 goto out;
534         case ICMP_PARAMETERPROB:
535                 err = EPROTO;
536                 break;
537         case ICMP_DEST_UNREACH:
538                 if (code > NR_ICMP_UNREACH)
539                         goto out;
540
541                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542                         /* We are not interested in TCP_LISTEN and open_requests
543                          * (SYN-ACKs send out by Linux are always <576bytes so
544                          * they should go through unfragmented).
545                          */
546                         if (sk->sk_state == TCP_LISTEN)
547                                 goto out;
548
549                         tp->mtu_info = info;
550                         if (!sock_owned_by_user(sk)) {
551                                 tcp_v4_mtu_reduced(sk);
552                         } else {
553                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554                                         sock_hold(sk);
555                         }
556                         goto out;
557                 }
558
559                 err = icmp_err_convert[code].errno;
560                 /* check if this ICMP message allows revert of backoff.
561                  * (see RFC 6069)
562                  */
563                 if (!fastopen &&
564                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565                         tcp_ld_RTO_revert(sk, seq);
566                 break;
567         case ICMP_TIME_EXCEEDED:
568                 err = EHOSTUNREACH;
569                 break;
570         default:
571                 goto out;
572         }
573
574         switch (sk->sk_state) {
575         case TCP_SYN_SENT:
576         case TCP_SYN_RECV:
577                 /* Only in fast or simultaneous open. If a fast open socket is
578                  * already accepted it is treated as a connected one below.
579                  */
580                 if (fastopen && !fastopen->sk)
581                         break;
582
583                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584
585                 if (!sock_owned_by_user(sk)) {
586                         sk->sk_err = err;
587
588                         sk->sk_error_report(sk);
589
590                         tcp_done(sk);
591                 } else {
592                         sk->sk_err_soft = err;
593                 }
594                 goto out;
595         }
596
597         /* If we've already connected we will keep trying
598          * until we time out, or the user gives up.
599          *
600          * rfc1122 4.2.3.9 allows to consider as hard errors
601          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602          * but it is obsoleted by pmtu discovery).
603          *
604          * Note, that in modern internet, where routing is unreliable
605          * and in each dark corner broken firewalls sit, sending random
606          * errors ordered by their masters even this two messages finally lose
607          * their original sense (even Linux sends invalid PORT_UNREACHs)
608          *
609          * Now we are in compliance with RFCs.
610          *                                                      --ANK (980905)
611          */
612
613         inet = inet_sk(sk);
614         if (!sock_owned_by_user(sk) && inet->recverr) {
615                 sk->sk_err = err;
616                 sk->sk_error_report(sk);
617         } else  { /* Only an error on timeout */
618                 sk->sk_err_soft = err;
619         }
620
621 out:
622         bh_unlock_sock(sk);
623         sock_put(sk);
624         return 0;
625 }
626
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 {
629         struct tcphdr *th = tcp_hdr(skb);
630
631         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632         skb->csum_start = skb_transport_header(skb) - skb->head;
633         skb->csum_offset = offsetof(struct tcphdr, check);
634 }
635
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 {
639         const struct inet_sock *inet = inet_sk(sk);
640
641         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 }
643 EXPORT_SYMBOL(tcp_v4_send_check);
644
645 /*
646  *      This routine will send an RST to the other tcp.
647  *
648  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
649  *                    for reset.
650  *      Answer: if a packet caused RST, it is not for a socket
651  *              existing in our system, if it is matched to a socket,
652  *              it is just duplicate segment or bug in other side's TCP.
653  *              So that we build reply only basing on parameters
654  *              arrived with segment.
655  *      Exception: precedence violation. We do not implement it in any case.
656  */
657
658 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
659 {
660         const struct tcphdr *th = tcp_hdr(skb);
661         struct {
662                 struct tcphdr th;
663 #ifdef CONFIG_TCP_MD5SIG
664                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
665 #endif
666         } rep;
667         struct ip_reply_arg arg;
668 #ifdef CONFIG_TCP_MD5SIG
669         struct tcp_md5sig_key *key = NULL;
670         const __u8 *hash_location = NULL;
671         unsigned char newhash[16];
672         int genhash;
673         struct sock *sk1 = NULL;
674 #endif
675         u64 transmit_time = 0;
676         struct sock *ctl_sk;
677         struct net *net;
678
679         /* Never send a reset in response to a reset. */
680         if (th->rst)
681                 return;
682
683         /* If sk not NULL, it means we did a successful lookup and incoming
684          * route had to be correct. prequeue might have dropped our dst.
685          */
686         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
687                 return;
688
689         /* Swap the send and the receive. */
690         memset(&rep, 0, sizeof(rep));
691         rep.th.dest   = th->source;
692         rep.th.source = th->dest;
693         rep.th.doff   = sizeof(struct tcphdr) / 4;
694         rep.th.rst    = 1;
695
696         if (th->ack) {
697                 rep.th.seq = th->ack_seq;
698         } else {
699                 rep.th.ack = 1;
700                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
701                                        skb->len - (th->doff << 2));
702         }
703
704         memset(&arg, 0, sizeof(arg));
705         arg.iov[0].iov_base = (unsigned char *)&rep;
706         arg.iov[0].iov_len  = sizeof(rep.th);
707
708         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
709 #ifdef CONFIG_TCP_MD5SIG
710         rcu_read_lock();
711         hash_location = tcp_parse_md5sig_option(th);
712         if (sk && sk_fullsock(sk)) {
713                 const union tcp_md5_addr *addr;
714                 int l3index;
715
716                 /* sdif set, means packet ingressed via a device
717                  * in an L3 domain and inet_iif is set to it.
718                  */
719                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
720                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
721                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
722         } else if (hash_location) {
723                 const union tcp_md5_addr *addr;
724                 int sdif = tcp_v4_sdif(skb);
725                 int dif = inet_iif(skb);
726                 int l3index;
727
728                 /*
729                  * active side is lost. Try to find listening socket through
730                  * source port, and then find md5 key through listening socket.
731                  * we are not loose security here:
732                  * Incoming packet is checked with md5 hash with finding key,
733                  * no RST generated if md5 hash doesn't match.
734                  */
735                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
736                                              ip_hdr(skb)->saddr,
737                                              th->source, ip_hdr(skb)->daddr,
738                                              ntohs(th->source), dif, sdif);
739                 /* don't send rst if it can't find key */
740                 if (!sk1)
741                         goto out;
742
743                 /* sdif set, means packet ingressed via a device
744                  * in an L3 domain and dif is set to it.
745                  */
746                 l3index = sdif ? dif : 0;
747                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
748                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
749                 if (!key)
750                         goto out;
751
752
753                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
754                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
755                         goto out;
756
757         }
758
759         if (key) {
760                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
761                                    (TCPOPT_NOP << 16) |
762                                    (TCPOPT_MD5SIG << 8) |
763                                    TCPOLEN_MD5SIG);
764                 /* Update length and the length the header thinks exists */
765                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766                 rep.th.doff = arg.iov[0].iov_len / 4;
767
768                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
769                                      key, ip_hdr(skb)->saddr,
770                                      ip_hdr(skb)->daddr, &rep.th);
771         }
772 #endif
773         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
774                                       ip_hdr(skb)->saddr, /* XXX */
775                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
776         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
777         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
778
779         /* When socket is gone, all binding information is lost.
780          * routing might fail in this case. No choice here, if we choose to force
781          * input interface, we will misroute in case of asymmetric route.
782          */
783         if (sk) {
784                 arg.bound_dev_if = sk->sk_bound_dev_if;
785                 if (sk_fullsock(sk))
786                         trace_tcp_send_reset(sk, skb);
787         }
788
789         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
790                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
791
792         arg.tos = ip_hdr(skb)->tos;
793         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
794         local_bh_disable();
795         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
796         if (sk) {
797                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
798                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
799                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
800                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
801                 transmit_time = tcp_transmit_time(sk);
802         }
803         ip_send_unicast_reply(ctl_sk,
804                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
805                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
806                               &arg, arg.iov[0].iov_len,
807                               transmit_time);
808
809         ctl_sk->sk_mark = 0;
810         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
811         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
812         local_bh_enable();
813
814 #ifdef CONFIG_TCP_MD5SIG
815 out:
816         rcu_read_unlock();
817 #endif
818 }
819
820 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
821    outside socket context is ugly, certainly. What can I do?
822  */
823
824 static void tcp_v4_send_ack(const struct sock *sk,
825                             struct sk_buff *skb, u32 seq, u32 ack,
826                             u32 win, u32 tsval, u32 tsecr, int oif,
827                             struct tcp_md5sig_key *key,
828                             int reply_flags, u8 tos)
829 {
830         const struct tcphdr *th = tcp_hdr(skb);
831         struct {
832                 struct tcphdr th;
833                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
834 #ifdef CONFIG_TCP_MD5SIG
835                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
836 #endif
837                         ];
838         } rep;
839         struct net *net = sock_net(sk);
840         struct ip_reply_arg arg;
841         struct sock *ctl_sk;
842         u64 transmit_time;
843
844         memset(&rep.th, 0, sizeof(struct tcphdr));
845         memset(&arg, 0, sizeof(arg));
846
847         arg.iov[0].iov_base = (unsigned char *)&rep;
848         arg.iov[0].iov_len  = sizeof(rep.th);
849         if (tsecr) {
850                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
851                                    (TCPOPT_TIMESTAMP << 8) |
852                                    TCPOLEN_TIMESTAMP);
853                 rep.opt[1] = htonl(tsval);
854                 rep.opt[2] = htonl(tsecr);
855                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
856         }
857
858         /* Swap the send and the receive. */
859         rep.th.dest    = th->source;
860         rep.th.source  = th->dest;
861         rep.th.doff    = arg.iov[0].iov_len / 4;
862         rep.th.seq     = htonl(seq);
863         rep.th.ack_seq = htonl(ack);
864         rep.th.ack     = 1;
865         rep.th.window  = htons(win);
866
867 #ifdef CONFIG_TCP_MD5SIG
868         if (key) {
869                 int offset = (tsecr) ? 3 : 0;
870
871                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
872                                           (TCPOPT_NOP << 16) |
873                                           (TCPOPT_MD5SIG << 8) |
874                                           TCPOLEN_MD5SIG);
875                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
876                 rep.th.doff = arg.iov[0].iov_len/4;
877
878                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
879                                     key, ip_hdr(skb)->saddr,
880                                     ip_hdr(skb)->daddr, &rep.th);
881         }
882 #endif
883         arg.flags = reply_flags;
884         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
885                                       ip_hdr(skb)->saddr, /* XXX */
886                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
887         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
888         if (oif)
889                 arg.bound_dev_if = oif;
890         arg.tos = tos;
891         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
892         local_bh_disable();
893         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
894         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
895                            inet_twsk(sk)->tw_mark : sk->sk_mark;
896         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
897                            inet_twsk(sk)->tw_priority : sk->sk_priority;
898         transmit_time = tcp_transmit_time(sk);
899         ip_send_unicast_reply(ctl_sk,
900                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
901                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
902                               &arg, arg.iov[0].iov_len,
903                               transmit_time);
904
905         ctl_sk->sk_mark = 0;
906         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
907         local_bh_enable();
908 }
909
910 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
911 {
912         struct inet_timewait_sock *tw = inet_twsk(sk);
913         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
914
915         tcp_v4_send_ack(sk, skb,
916                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
917                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
918                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
919                         tcptw->tw_ts_recent,
920                         tw->tw_bound_dev_if,
921                         tcp_twsk_md5_key(tcptw),
922                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
923                         tw->tw_tos
924                         );
925
926         inet_twsk_put(tw);
927 }
928
929 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
930                                   struct request_sock *req)
931 {
932         const union tcp_md5_addr *addr;
933         int l3index;
934
935         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
936          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
937          */
938         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
939                                              tcp_sk(sk)->snd_nxt;
940
941         /* RFC 7323 2.3
942          * The window field (SEG.WND) of every outgoing segment, with the
943          * exception of <SYN> segments, MUST be right-shifted by
944          * Rcv.Wind.Shift bits:
945          */
946         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
947         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
948         tcp_v4_send_ack(sk, skb, seq,
949                         tcp_rsk(req)->rcv_nxt,
950                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
951                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
952                         req->ts_recent,
953                         0,
954                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
955                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
956                         ip_hdr(skb)->tos);
957 }
958
959 /*
960  *      Send a SYN-ACK after having received a SYN.
961  *      This still operates on a request_sock only, not on a big
962  *      socket.
963  */
964 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
965                               struct flowi *fl,
966                               struct request_sock *req,
967                               struct tcp_fastopen_cookie *foc,
968                               enum tcp_synack_type synack_type,
969                               struct sk_buff *syn_skb)
970 {
971         const struct inet_request_sock *ireq = inet_rsk(req);
972         struct flowi4 fl4;
973         int err = -1;
974         struct sk_buff *skb;
975         u8 tos;
976
977         /* First, grab a route. */
978         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
979                 return -1;
980
981         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
982
983         if (skb) {
984                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
985
986                 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
987                                 tcp_rsk(req)->syn_tos & ~INET_ECN_MASK :
988                                 inet_sk(sk)->tos;
989
990                 if (!INET_ECN_is_capable(tos) &&
991                     tcp_bpf_ca_needs_ecn((struct sock *)req))
992                         tos |= INET_ECN_ECT_0;
993
994                 rcu_read_lock();
995                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
996                                             ireq->ir_rmt_addr,
997                                             rcu_dereference(ireq->ireq_opt),
998                                             tos);
999                 rcu_read_unlock();
1000                 err = net_xmit_eval(err);
1001         }
1002
1003         return err;
1004 }
1005
1006 /*
1007  *      IPv4 request_sock destructor.
1008  */
1009 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1010 {
1011         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1012 }
1013
1014 #ifdef CONFIG_TCP_MD5SIG
1015 /*
1016  * RFC2385 MD5 checksumming requires a mapping of
1017  * IP address->MD5 Key.
1018  * We need to maintain these in the sk structure.
1019  */
1020
1021 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1022 EXPORT_SYMBOL(tcp_md5_needed);
1023
1024 /* Find the Key structure for an address.  */
1025 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1026                                            const union tcp_md5_addr *addr,
1027                                            int family)
1028 {
1029         const struct tcp_sock *tp = tcp_sk(sk);
1030         struct tcp_md5sig_key *key;
1031         const struct tcp_md5sig_info *md5sig;
1032         __be32 mask;
1033         struct tcp_md5sig_key *best_match = NULL;
1034         bool match;
1035
1036         /* caller either holds rcu_read_lock() or socket lock */
1037         md5sig = rcu_dereference_check(tp->md5sig_info,
1038                                        lockdep_sock_is_held(sk));
1039         if (!md5sig)
1040                 return NULL;
1041
1042         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1043                                  lockdep_sock_is_held(sk)) {
1044                 if (key->family != family)
1045                         continue;
1046                 if (key->l3index && key->l3index != l3index)
1047                         continue;
1048                 if (family == AF_INET) {
1049                         mask = inet_make_mask(key->prefixlen);
1050                         match = (key->addr.a4.s_addr & mask) ==
1051                                 (addr->a4.s_addr & mask);
1052 #if IS_ENABLED(CONFIG_IPV6)
1053                 } else if (family == AF_INET6) {
1054                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1055                                                   key->prefixlen);
1056 #endif
1057                 } else {
1058                         match = false;
1059                 }
1060
1061                 if (match && (!best_match ||
1062                               key->prefixlen > best_match->prefixlen))
1063                         best_match = key;
1064         }
1065         return best_match;
1066 }
1067 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1068
1069 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1070                                                       const union tcp_md5_addr *addr,
1071                                                       int family, u8 prefixlen,
1072                                                       int l3index)
1073 {
1074         const struct tcp_sock *tp = tcp_sk(sk);
1075         struct tcp_md5sig_key *key;
1076         unsigned int size = sizeof(struct in_addr);
1077         const struct tcp_md5sig_info *md5sig;
1078
1079         /* caller either holds rcu_read_lock() or socket lock */
1080         md5sig = rcu_dereference_check(tp->md5sig_info,
1081                                        lockdep_sock_is_held(sk));
1082         if (!md5sig)
1083                 return NULL;
1084 #if IS_ENABLED(CONFIG_IPV6)
1085         if (family == AF_INET6)
1086                 size = sizeof(struct in6_addr);
1087 #endif
1088         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1089                                  lockdep_sock_is_held(sk)) {
1090                 if (key->family != family)
1091                         continue;
1092                 if (key->l3index && key->l3index != l3index)
1093                         continue;
1094                 if (!memcmp(&key->addr, addr, size) &&
1095                     key->prefixlen == prefixlen)
1096                         return key;
1097         }
1098         return NULL;
1099 }
1100
1101 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1102                                          const struct sock *addr_sk)
1103 {
1104         const union tcp_md5_addr *addr;
1105         int l3index;
1106
1107         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1108                                                  addr_sk->sk_bound_dev_if);
1109         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1110         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1111 }
1112 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1113
1114 /* This can be called on a newly created socket, from other files */
1115 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1116                    int family, u8 prefixlen, int l3index,
1117                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1118 {
1119         /* Add Key to the list */
1120         struct tcp_md5sig_key *key;
1121         struct tcp_sock *tp = tcp_sk(sk);
1122         struct tcp_md5sig_info *md5sig;
1123
1124         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1125         if (key) {
1126                 /* Pre-existing entry - just update that one.
1127                  * Note that the key might be used concurrently.
1128                  * data_race() is telling kcsan that we do not care of
1129                  * key mismatches, since changing MD5 key on live flows
1130                  * can lead to packet drops.
1131                  */
1132                 data_race(memcpy(key->key, newkey, newkeylen));
1133
1134                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1135                  * Also note that a reader could catch new key->keylen value
1136                  * but old key->key[], this is the reason we use __GFP_ZERO
1137                  * at sock_kmalloc() time below these lines.
1138                  */
1139                 WRITE_ONCE(key->keylen, newkeylen);
1140
1141                 return 0;
1142         }
1143
1144         md5sig = rcu_dereference_protected(tp->md5sig_info,
1145                                            lockdep_sock_is_held(sk));
1146         if (!md5sig) {
1147                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1148                 if (!md5sig)
1149                         return -ENOMEM;
1150
1151                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1152                 INIT_HLIST_HEAD(&md5sig->head);
1153                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1154         }
1155
1156         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1157         if (!key)
1158                 return -ENOMEM;
1159         if (!tcp_alloc_md5sig_pool()) {
1160                 sock_kfree_s(sk, key, sizeof(*key));
1161                 return -ENOMEM;
1162         }
1163
1164         memcpy(key->key, newkey, newkeylen);
1165         key->keylen = newkeylen;
1166         key->family = family;
1167         key->prefixlen = prefixlen;
1168         key->l3index = l3index;
1169         memcpy(&key->addr, addr,
1170                (family == AF_INET6) ? sizeof(struct in6_addr) :
1171                                       sizeof(struct in_addr));
1172         hlist_add_head_rcu(&key->node, &md5sig->head);
1173         return 0;
1174 }
1175 EXPORT_SYMBOL(tcp_md5_do_add);
1176
1177 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1178                    u8 prefixlen, int l3index)
1179 {
1180         struct tcp_md5sig_key *key;
1181
1182         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1183         if (!key)
1184                 return -ENOENT;
1185         hlist_del_rcu(&key->node);
1186         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1187         kfree_rcu(key, rcu);
1188         return 0;
1189 }
1190 EXPORT_SYMBOL(tcp_md5_do_del);
1191
1192 static void tcp_clear_md5_list(struct sock *sk)
1193 {
1194         struct tcp_sock *tp = tcp_sk(sk);
1195         struct tcp_md5sig_key *key;
1196         struct hlist_node *n;
1197         struct tcp_md5sig_info *md5sig;
1198
1199         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1200
1201         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1202                 hlist_del_rcu(&key->node);
1203                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1204                 kfree_rcu(key, rcu);
1205         }
1206 }
1207
1208 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1209                                  sockptr_t optval, int optlen)
1210 {
1211         struct tcp_md5sig cmd;
1212         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1213         const union tcp_md5_addr *addr;
1214         u8 prefixlen = 32;
1215         int l3index = 0;
1216
1217         if (optlen < sizeof(cmd))
1218                 return -EINVAL;
1219
1220         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1221                 return -EFAULT;
1222
1223         if (sin->sin_family != AF_INET)
1224                 return -EINVAL;
1225
1226         if (optname == TCP_MD5SIG_EXT &&
1227             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1228                 prefixlen = cmd.tcpm_prefixlen;
1229                 if (prefixlen > 32)
1230                         return -EINVAL;
1231         }
1232
1233         if (optname == TCP_MD5SIG_EXT &&
1234             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1235                 struct net_device *dev;
1236
1237                 rcu_read_lock();
1238                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1239                 if (dev && netif_is_l3_master(dev))
1240                         l3index = dev->ifindex;
1241
1242                 rcu_read_unlock();
1243
1244                 /* ok to reference set/not set outside of rcu;
1245                  * right now device MUST be an L3 master
1246                  */
1247                 if (!dev || !l3index)
1248                         return -EINVAL;
1249         }
1250
1251         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1252
1253         if (!cmd.tcpm_keylen)
1254                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1255
1256         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1257                 return -EINVAL;
1258
1259         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1260                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1261 }
1262
1263 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1264                                    __be32 daddr, __be32 saddr,
1265                                    const struct tcphdr *th, int nbytes)
1266 {
1267         struct tcp4_pseudohdr *bp;
1268         struct scatterlist sg;
1269         struct tcphdr *_th;
1270
1271         bp = hp->scratch;
1272         bp->saddr = saddr;
1273         bp->daddr = daddr;
1274         bp->pad = 0;
1275         bp->protocol = IPPROTO_TCP;
1276         bp->len = cpu_to_be16(nbytes);
1277
1278         _th = (struct tcphdr *)(bp + 1);
1279         memcpy(_th, th, sizeof(*th));
1280         _th->check = 0;
1281
1282         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1283         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1284                                 sizeof(*bp) + sizeof(*th));
1285         return crypto_ahash_update(hp->md5_req);
1286 }
1287
1288 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1289                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1290 {
1291         struct tcp_md5sig_pool *hp;
1292         struct ahash_request *req;
1293
1294         hp = tcp_get_md5sig_pool();
1295         if (!hp)
1296                 goto clear_hash_noput;
1297         req = hp->md5_req;
1298
1299         if (crypto_ahash_init(req))
1300                 goto clear_hash;
1301         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1302                 goto clear_hash;
1303         if (tcp_md5_hash_key(hp, key))
1304                 goto clear_hash;
1305         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1306         if (crypto_ahash_final(req))
1307                 goto clear_hash;
1308
1309         tcp_put_md5sig_pool();
1310         return 0;
1311
1312 clear_hash:
1313         tcp_put_md5sig_pool();
1314 clear_hash_noput:
1315         memset(md5_hash, 0, 16);
1316         return 1;
1317 }
1318
1319 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1320                         const struct sock *sk,
1321                         const struct sk_buff *skb)
1322 {
1323         struct tcp_md5sig_pool *hp;
1324         struct ahash_request *req;
1325         const struct tcphdr *th = tcp_hdr(skb);
1326         __be32 saddr, daddr;
1327
1328         if (sk) { /* valid for establish/request sockets */
1329                 saddr = sk->sk_rcv_saddr;
1330                 daddr = sk->sk_daddr;
1331         } else {
1332                 const struct iphdr *iph = ip_hdr(skb);
1333                 saddr = iph->saddr;
1334                 daddr = iph->daddr;
1335         }
1336
1337         hp = tcp_get_md5sig_pool();
1338         if (!hp)
1339                 goto clear_hash_noput;
1340         req = hp->md5_req;
1341
1342         if (crypto_ahash_init(req))
1343                 goto clear_hash;
1344
1345         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1346                 goto clear_hash;
1347         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1348                 goto clear_hash;
1349         if (tcp_md5_hash_key(hp, key))
1350                 goto clear_hash;
1351         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1352         if (crypto_ahash_final(req))
1353                 goto clear_hash;
1354
1355         tcp_put_md5sig_pool();
1356         return 0;
1357
1358 clear_hash:
1359         tcp_put_md5sig_pool();
1360 clear_hash_noput:
1361         memset(md5_hash, 0, 16);
1362         return 1;
1363 }
1364 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1365
1366 #endif
1367
1368 /* Called with rcu_read_lock() */
1369 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1370                                     const struct sk_buff *skb,
1371                                     int dif, int sdif)
1372 {
1373 #ifdef CONFIG_TCP_MD5SIG
1374         /*
1375          * This gets called for each TCP segment that arrives
1376          * so we want to be efficient.
1377          * We have 3 drop cases:
1378          * o No MD5 hash and one expected.
1379          * o MD5 hash and we're not expecting one.
1380          * o MD5 hash and its wrong.
1381          */
1382         const __u8 *hash_location = NULL;
1383         struct tcp_md5sig_key *hash_expected;
1384         const struct iphdr *iph = ip_hdr(skb);
1385         const struct tcphdr *th = tcp_hdr(skb);
1386         const union tcp_md5_addr *addr;
1387         unsigned char newhash[16];
1388         int genhash, l3index;
1389
1390         /* sdif set, means packet ingressed via a device
1391          * in an L3 domain and dif is set to the l3mdev
1392          */
1393         l3index = sdif ? dif : 0;
1394
1395         addr = (union tcp_md5_addr *)&iph->saddr;
1396         hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1397         hash_location = tcp_parse_md5sig_option(th);
1398
1399         /* We've parsed the options - do we have a hash? */
1400         if (!hash_expected && !hash_location)
1401                 return false;
1402
1403         if (hash_expected && !hash_location) {
1404                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1405                 return true;
1406         }
1407
1408         if (!hash_expected && hash_location) {
1409                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1410                 return true;
1411         }
1412
1413         /* Okay, we have both hash_expected and hash_location,
1414          * so we need to calculate the checksum.
1415          */
1416         genhash = tcp_v4_md5_hash_skb(newhash,
1417                                       hash_expected,
1418                                       NULL, skb);
1419
1420         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1421                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1422                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1423                                      &iph->saddr, ntohs(th->source),
1424                                      &iph->daddr, ntohs(th->dest),
1425                                      genhash ? " tcp_v4_calc_md5_hash failed"
1426                                      : "", l3index);
1427                 return true;
1428         }
1429         return false;
1430 #endif
1431         return false;
1432 }
1433
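/* Fill the IPv4-specific fields of a freshly allocated request sock from the
 * incoming SYN: our local address is the SYN's destination, the peer address
 * is its source, and any IP options are saved for the future child socket.
 */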
1434 static void tcp_v4_init_req(struct request_sock *req,
1435                             const struct sock *sk_listener,
1436                             struct sk_buff *skb)
1437 {
1438         struct inet_request_sock *ireq = inet_rsk(req);
1439         struct net *net = sock_net(sk_listener);
1440
1441         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1442         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1443         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1444 }
1445
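/* Route the SYN-ACK for this request sock; a thin IPv4 wrapper around
 * inet_csk_route_req().
 */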
1446 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1447                                           struct flowi *fl,
1448                                           const struct request_sock *req)
1449 {
1450         return inet_csk_route_req(sk, &fl->u.ip4, req);
1451 }
1452
1453 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1454         .family         =       PF_INET,
1455         .obj_size       =       sizeof(struct tcp_request_sock),
1456         .rtx_syn_ack    =       tcp_rtx_synack,
1457         .send_ack       =       tcp_v4_reqsk_send_ack,
1458         .destructor     =       tcp_v4_reqsk_destructor,
1459         .send_reset     =       tcp_v4_send_reset,
1460         .syn_ack_timeout =      tcp_syn_ack_timeout,
1461 };
1462
1463 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1464         .mss_clamp      =       TCP_MSS_DEFAULT,
1465 #ifdef CONFIG_TCP_MD5SIG
1466         .req_md5_lookup =       tcp_v4_md5_lookup,
1467         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1468 #endif
1469         .init_req       =       tcp_v4_init_req,
1470 #ifdef CONFIG_SYN_COOKIES
1471         .cookie_init_seq =      cookie_v4_init_sequence,
1472 #endif
1473         .route_req      =       tcp_v4_route_req,
1474         .init_seq       =       tcp_v4_init_seq,
1475         .init_ts_off    =       tcp_v4_init_ts_off,
1476         .send_synack    =       tcp_v4_send_synack,
1477 };
1478
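/* Listener entry point for incoming SYNs: drop SYNs addressed to broadcast or
 * multicast destinations, then hand off to the address-family independent
 * tcp_conn_request().
 */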
1479 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1480 {
1481         /* Never answer to SYNs sent to broadcast or multicast */
1482         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1483                 goto drop;
1484
1485         return tcp_conn_request(&tcp_request_sock_ops,
1486                                 &tcp_request_sock_ipv4_ops, sk, skb);
1487
1488 drop:
1489         tcp_listendrop(sk);
1490         return 0;
1491 }
1492 EXPORT_SYMBOL(tcp_v4_conn_request);
1493
1494
1495 /*
1496  * The three way handshake has completed - we got a valid synack -
1497  * now create the new socket.
1498  */
1499 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1500                                   struct request_sock *req,
1501                                   struct dst_entry *dst,
1502                                   struct request_sock *req_unhash,
1503                                   bool *own_req)
1504 {
1505         struct inet_request_sock *ireq;
1506         bool found_dup_sk = false;
1507         struct inet_sock *newinet;
1508         struct tcp_sock *newtp;
1509         struct sock *newsk;
1510 #ifdef CONFIG_TCP_MD5SIG
1511         const union tcp_md5_addr *addr;
1512         struct tcp_md5sig_key *key;
1513         int l3index;
1514 #endif
1515         struct ip_options_rcu *inet_opt;
1516
1517         if (sk_acceptq_is_full(sk))
1518                 goto exit_overflow;
1519
1520         newsk = tcp_create_openreq_child(sk, req, skb);
1521         if (!newsk)
1522                 goto exit_nonewsk;
1523
1524         newsk->sk_gso_type = SKB_GSO_TCPV4;
1525         inet_sk_rx_dst_set(newsk, skb);
1526
1527         newtp                 = tcp_sk(newsk);
1528         newinet               = inet_sk(newsk);
1529         ireq                  = inet_rsk(req);
1530         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1531         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1532         newsk->sk_bound_dev_if = ireq->ir_iif;
1533         newinet->inet_saddr   = ireq->ir_loc_addr;
1534         inet_opt              = rcu_dereference(ireq->ireq_opt);
1535         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1536         newinet->mc_index     = inet_iif(skb);
1537         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1538         newinet->rcv_tos      = ip_hdr(skb)->tos;
1539         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1540         if (inet_opt)
1541                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1542         newinet->inet_id = prandom_u32();
1543
1544         /* Set ToS of the new socket based upon the value of incoming SYN. */
1545         if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1546                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1547
1548         if (!dst) {
1549                 dst = inet_csk_route_child_sock(sk, newsk, req);
1550                 if (!dst)
1551                         goto put_and_exit;
1552         } else {
1553                 /* syncookie case : see end of cookie_v4_check() */
1554         }
1555         sk_setup_caps(newsk, dst);
1556
1557         tcp_ca_openreq_child(newsk, dst);
1558
1559         tcp_sync_mss(newsk, dst_mtu(dst));
1560         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1561
1562         tcp_initialize_rcv_mss(newsk);
1563
1564 #ifdef CONFIG_TCP_MD5SIG
1565         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1566         /* Copy over the MD5 key from the original socket */
1567         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1568         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1569         if (key) {
1570                 /*
1571                  * We're using one, so create a matching key
1572                  * on the newsk structure. If we fail to get
1573                  * memory, then we end up not copying the key
1574                  * across. Shucks.
1575                  */
1576                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1577                                key->key, key->keylen, GFP_ATOMIC);
1578                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1579         }
1580 #endif
1581
1582         if (__inet_inherit_port(sk, newsk) < 0)
1583                 goto put_and_exit;
1584         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1585                                        &found_dup_sk);
1586         if (likely(*own_req)) {
1587                 tcp_move_syn(newtp, req);
1588                 ireq->ireq_opt = NULL;
1589         } else {
1590                 if (!req_unhash && found_dup_sk) {
1591                         /* This code path should only be executed in the
1592                          * syncookie case
1593                          */
1594                         bh_unlock_sock(newsk);
1595                         sock_put(newsk);
1596                         newsk = NULL;
1597                 } else {
1598                         newinet->inet_opt = NULL;
1599                 }
1600         }
1601         return newsk;
1602
1603 exit_overflow:
1604         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1605 exit_nonewsk:
1606         dst_release(dst);
1607 exit:
1608         tcp_listendrop(sk);
1609         return NULL;
1610 put_and_exit:
1611         newinet->inet_opt = NULL;
1612         inet_csk_prepare_forced_close(newsk);
1613         tcp_done(newsk);
1614         goto exit;
1615 }
1616 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1617
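/* With CONFIG_SYN_COOKIES, a non-SYN segment reaching a listener may carry a
 * valid syncookie in its ACK; cookie_v4_check() then rebuilds the connection
 * state.  May return a new child socket, the unchanged listener, or NULL to
 * drop the segment.
 */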
1618 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1619 {
1620 #ifdef CONFIG_SYN_COOKIES
1621         const struct tcphdr *th = tcp_hdr(skb);
1622
1623         if (!th->syn)
1624                 sk = cookie_v4_check(sk, skb);
1625 #endif
1626         return sk;
1627 }
1628
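/* Compute the clamped MSS and the syncookie ISN that a SYN-ACK would carry,
 * without allocating a request sock.  Meant for callers (e.g. BPF syncookie
 * helpers) that answer SYNs outside the normal listener path; returns 0 when
 * a syncookie cannot be generated.
 */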
1629 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1630                          struct tcphdr *th, u32 *cookie)
1631 {
1632         u16 mss = 0;
1633 #ifdef CONFIG_SYN_COOKIES
1634         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1635                                     &tcp_request_sock_ipv4_ops, sk, th);
1636         if (mss) {
1637                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1638                 tcp_synq_overflow(sk);
1639         }
1640 #endif
1641         return mss;
1642 }
1643
1644 /* The socket must have its spinlock held when we get
1645  * here, unless it is a TCP_LISTEN socket.
1646  *
1647  * We have a potential double-lock case here, so even when
1648  * doing backlog processing we use the BH locking scheme.
1649  * This is because we cannot sleep with the original spinlock
1650  * held.
1651  */
1652 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1653 {
1654         struct sock *rsk;
1655
1656         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1657                 struct dst_entry *dst = sk->sk_rx_dst;
1658
1659                 sock_rps_save_rxhash(sk, skb);
1660                 sk_mark_napi_id(sk, skb);
1661                 if (dst) {
1662                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1663                             !dst->ops->check(dst, 0)) {
1664                                 dst_release(dst);
1665                                 sk->sk_rx_dst = NULL;
1666                         }
1667                 }
1668                 tcp_rcv_established(sk, skb);
1669                 return 0;
1670         }
1671
1672         if (tcp_checksum_complete(skb))
1673                 goto csum_err;
1674
1675         if (sk->sk_state == TCP_LISTEN) {
1676                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1677
1678                 if (!nsk)
1679                         goto discard;
1680                 if (nsk != sk) {
1681                         if (tcp_child_process(sk, nsk, skb)) {
1682                                 rsk = nsk;
1683                                 goto reset;
1684                         }
1685                         return 0;
1686                 }
1687         } else
1688                 sock_rps_save_rxhash(sk, skb);
1689
1690         if (tcp_rcv_state_process(sk, skb)) {
1691                 rsk = sk;
1692                 goto reset;
1693         }
1694         return 0;
1695
1696 reset:
1697         tcp_v4_send_reset(rsk, skb);
1698 discard:
1699         kfree_skb(skb);
1700         /* Be careful here. If this function gets more complicated and
1701          * gcc suffers from register pressure on the x86, sk (in %ebx)
1702          * might be destroyed here. This current version compiles correctly,
1703          * but you have been warned.
1704          */
1705         return 0;
1706
1707 csum_err:
1708         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1709         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1710         goto discard;
1711 }
1712 EXPORT_SYMBOL(tcp_v4_do_rcv);
1713
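/* Early demux hook: before the routing decision, try to match the packet to
 * an established socket so its cached rx dst can be attached to the skb and
 * the route lookup skipped.
 */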
1714 int tcp_v4_early_demux(struct sk_buff *skb)
1715 {
1716         const struct iphdr *iph;
1717         const struct tcphdr *th;
1718         struct sock *sk;
1719
1720         if (skb->pkt_type != PACKET_HOST)
1721                 return 0;
1722
1723         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1724                 return 0;
1725
1726         iph = ip_hdr(skb);
1727         th = tcp_hdr(skb);
1728
1729         if (th->doff < sizeof(struct tcphdr) / 4)
1730                 return 0;
1731
1732         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1733                                        iph->saddr, th->source,
1734                                        iph->daddr, ntohs(th->dest),
1735                                        skb->skb_iif, inet_sdif(skb));
1736         if (sk) {
1737                 skb->sk = sk;
1738                 skb->destructor = sock_edemux;
1739                 if (sk_fullsock(sk)) {
1740                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1741
1742                         if (dst)
1743                                 dst = dst_check(dst, 0);
1744                         if (dst &&
1745                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1746                                 skb_dst_set_noref(skb, dst);
1747                 }
1748         }
1749         return 0;
1750 }
1751
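/* Queue a segment on the backlog of a socket currently owned by user context.
 * The segment is coalesced with the backlog tail when the TCP headers allow
 * it, which keeps memory usage bounded.  Returns true if the skb was dropped;
 * in that case the socket has already been unlocked.
 */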
1752 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1753 {
1754         u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1755         struct skb_shared_info *shinfo;
1756         const struct tcphdr *th;
1757         struct tcphdr *thtail;
1758         struct sk_buff *tail;
1759         unsigned int hdrlen;
1760         bool fragstolen;
1761         u32 gso_segs;
1762         int delta;
1763
1764         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1765          * we can fix skb->truesize to its real value to avoid future drops.
1766          * This is valid because skb is not yet charged to the socket.
1767          * It has been noticed that pure SACK packets were sometimes dropped
1768          * (if cooked by drivers without copybreak feature).
1769          */
1770         skb_condense(skb);
1771
1772         skb_dst_drop(skb);
1773
1774         if (unlikely(tcp_checksum_complete(skb))) {
1775                 bh_unlock_sock(sk);
1776                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1777                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1778                 return true;
1779         }
1780
1781         /* Attempt coalescing to last skb in backlog, even if we are
1782          * above the limits.
1783          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1784          */
1785         th = (const struct tcphdr *)skb->data;
1786         hdrlen = th->doff * 4;
1787         shinfo = skb_shinfo(skb);
1788
1789         if (!shinfo->gso_size)
1790                 shinfo->gso_size = skb->len - hdrlen;
1791
1792         if (!shinfo->gso_segs)
1793                 shinfo->gso_segs = 1;
1794
1795         tail = sk->sk_backlog.tail;
1796         if (!tail)
1797                 goto no_coalesce;
1798         thtail = (struct tcphdr *)tail->data;
1799
1800         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1801             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1802             ((TCP_SKB_CB(tail)->tcp_flags |
1803               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1804             !((TCP_SKB_CB(tail)->tcp_flags &
1805               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1806             ((TCP_SKB_CB(tail)->tcp_flags ^
1807               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1808 #ifdef CONFIG_TLS_DEVICE
1809             tail->decrypted != skb->decrypted ||
1810 #endif
1811             thtail->doff != th->doff ||
1812             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1813                 goto no_coalesce;
1814
1815         __skb_pull(skb, hdrlen);
1816         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1817                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1818
1819                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1820                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1821                         thtail->window = th->window;
1822                 }
1823
1824                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1825                  * thtail->fin, so that the fast path in tcp_rcv_established()
1826                  * is not entered if we append a packet with a FIN.
1827                  * SYN, RST, URG are not present.
1828                  * ACK is set on both packets.
1829                  * PSH: we do not really care about it in the TCP stack,
1830                  *       at least for 'GRO' packets.
1831                  */
1832                 thtail->fin |= th->fin;
1833                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1834
1835                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1836                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1837                         tail->tstamp = skb->tstamp;
1838                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1839                 }
1840
1841                 /* Not as strict as GRO. We only need to carry the max mss value */
1842                 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1843                                                  skb_shinfo(tail)->gso_size);
1844
1845                 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1846                 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1847
1848                 sk->sk_backlog.len += delta;
1849                 __NET_INC_STATS(sock_net(sk),
1850                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1851                 kfree_skb_partial(skb, fragstolen);
1852                 return false;
1853         }
1854         __skb_push(skb, hdrlen);
1855
1856 no_coalesce:
1857         /* Only socket owner can try to collapse/prune rx queues
1858          * to reduce memory overhead, so add a little headroom here.
1859          * Few socket backlogs are likely to be non-empty at the same time.
1860          */
1861         limit += 64*1024;
1862
1863         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1864                 bh_unlock_sock(sk);
1865                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1866                 return true;
1867         }
1868         return false;
1869 }
1870 EXPORT_SYMBOL(tcp_add_backlog);
1871
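/* Run the attached socket filter over the segment.  The filter may trim the
 * payload, but sk_filter_trim_cap() never trims below the TCP header
 * (th->doff * 4 bytes).
 */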
1872 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1873 {
1874         struct tcphdr *th = (struct tcphdr *)skb->data;
1875
1876         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1877 }
1878 EXPORT_SYMBOL(tcp_filter);
1879
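/* Undo tcp_v4_fill_cb(): move the IP control block back to its usual place in
 * skb->cb before the skb is handed to another socket lookup.
 */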
1880 static void tcp_v4_restore_cb(struct sk_buff *skb)
1881 {
1882         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1883                 sizeof(struct inet_skb_parm));
1884 }
1885
1886 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1887                            const struct tcphdr *th)
1888 {
1889         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1890          * barrier() makes sure the compiler won't play fool^Waliasing games.
1891          */
1892         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1893                 sizeof(struct inet_skb_parm));
1894         barrier();
1895
1896         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1897         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1898                                     skb->len - th->doff * 4);
1899         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1900         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1901         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1902         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1903         TCP_SKB_CB(skb)->sacked  = 0;
1904         TCP_SKB_CB(skb)->has_rxtstamp =
1905                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1906 }
1907
1908 /*
1909  *      From tcp_input.c
1910  */
1911
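/* Main IPv4 TCP receive routine: validate header length and checksum, look up
 * the owning socket (request, timewait or full socket) and either process the
 * segment immediately or append it to the socket backlog.
 */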
1912 int tcp_v4_rcv(struct sk_buff *skb)
1913 {
1914         struct net *net = dev_net(skb->dev);
1915         struct sk_buff *skb_to_free;
1916         int sdif = inet_sdif(skb);
1917         int dif = inet_iif(skb);
1918         const struct iphdr *iph;
1919         const struct tcphdr *th;
1920         bool refcounted;
1921         struct sock *sk;
1922         int ret;
1923
1924         if (skb->pkt_type != PACKET_HOST)
1925                 goto discard_it;
1926
1927         /* Count it even if it's bad */
1928         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1929
1930         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1931                 goto discard_it;
1932
1933         th = (const struct tcphdr *)skb->data;
1934
1935         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1936                 goto bad_packet;
1937         if (!pskb_may_pull(skb, th->doff * 4))
1938                 goto discard_it;
1939
1940         /* An explanation is required here, I think.
1941          * Packet length and doff are validated by header prediction,
1942          * provided the case of th->doff == 0 is eliminated.
1943          * So, we defer the checks. */
1944
1945         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1946                 goto csum_error;
1947
1948         th = (const struct tcphdr *)skb->data;
1949         iph = ip_hdr(skb);
1950 lookup:
1951         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1952                                th->dest, sdif, &refcounted);
1953         if (!sk)
1954                 goto no_tcp_socket;
1955
1956 process:
1957         if (sk->sk_state == TCP_TIME_WAIT)
1958                 goto do_time_wait;
1959
1960         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1961                 struct request_sock *req = inet_reqsk(sk);
1962                 bool req_stolen = false;
1963                 struct sock *nsk;
1964
1965                 sk = req->rsk_listener;
1966                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1967                         sk_drops_add(sk, skb);
1968                         reqsk_put(req);
1969                         goto discard_it;
1970                 }
1971                 if (tcp_checksum_complete(skb)) {
1972                         reqsk_put(req);
1973                         goto csum_error;
1974                 }
1975                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1976                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1977                         goto lookup;
1978                 }
1979                 /* We own a reference on the listener, increase it again
1980                  * as we might lose it too soon.
1981                  */
1982                 sock_hold(sk);
1983                 refcounted = true;
1984                 nsk = NULL;
1985                 if (!tcp_filter(sk, skb)) {
1986                         th = (const struct tcphdr *)skb->data;
1987                         iph = ip_hdr(skb);
1988                         tcp_v4_fill_cb(skb, iph, th);
1989                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1990                 }
1991                 if (!nsk) {
1992                         reqsk_put(req);
1993                         if (req_stolen) {
1994                                 /* Another cpu got exclusive access to req
1995                                  * and created a full blown socket.
1996                                  * Try to feed this packet to this socket
1997                                  * instead of discarding it.
1998                                  */
1999                                 tcp_v4_restore_cb(skb);
2000                                 sock_put(sk);
2001                                 goto lookup;
2002                         }
2003                         goto discard_and_relse;
2004                 }
2005                 if (nsk == sk) {
2006                         reqsk_put(req);
2007                         tcp_v4_restore_cb(skb);
2008                 } else if (tcp_child_process(sk, nsk, skb)) {
2009                         tcp_v4_send_reset(nsk, skb);
2010                         goto discard_and_relse;
2011                 } else {
2012                         sock_put(sk);
2013                         return 0;
2014                 }
2015         }
2016         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2017                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2018                 goto discard_and_relse;
2019         }
2020
2021         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2022                 goto discard_and_relse;
2023
2024         if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2025                 goto discard_and_relse;
2026
2027         nf_reset_ct(skb);
2028
2029         if (tcp_filter(sk, skb))
2030                 goto discard_and_relse;
2031         th = (const struct tcphdr *)skb->data;
2032         iph = ip_hdr(skb);
2033         tcp_v4_fill_cb(skb, iph, th);
2034
2035         skb->dev = NULL;
2036
2037         if (sk->sk_state == TCP_LISTEN) {
2038                 ret = tcp_v4_do_rcv(sk, skb);
2039                 goto put_and_return;
2040         }
2041
2042         sk_incoming_cpu_update(sk);
2043
2044         bh_lock_sock_nested(sk);
2045         tcp_segs_in(tcp_sk(sk), skb);
2046         ret = 0;
2047         if (!sock_owned_by_user(sk)) {
2048                 skb_to_free = sk->sk_rx_skb_cache;
2049                 sk->sk_rx_skb_cache = NULL;
2050                 ret = tcp_v4_do_rcv(sk, skb);
2051         } else {
2052                 if (tcp_add_backlog(sk, skb))
2053                         goto discard_and_relse;
2054                 skb_to_free = NULL;
2055         }
2056         bh_unlock_sock(sk);
2057         if (skb_to_free)
2058                 __kfree_skb(skb_to_free);
2059
2060 put_and_return:
2061         if (refcounted)
2062                 sock_put(sk);
2063
2064         return ret;
2065
2066 no_tcp_socket:
2067         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2068                 goto discard_it;
2069
2070         tcp_v4_fill_cb(skb, iph, th);
2071
2072         if (tcp_checksum_complete(skb)) {
2073 csum_error:
2074                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2075 bad_packet:
2076                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2077         } else {
2078                 tcp_v4_send_reset(NULL, skb);
2079         }
2080
2081 discard_it:
2082         /* Discard frame. */
2083         kfree_skb(skb);
2084         return 0;
2085
2086 discard_and_relse:
2087         sk_drops_add(sk, skb);
2088         if (refcounted)
2089                 sock_put(sk);
2090         goto discard_it;
2091
2092 do_time_wait:
2093         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2094                 inet_twsk_put(inet_twsk(sk));
2095                 goto discard_it;
2096         }
2097
2098         tcp_v4_fill_cb(skb, iph, th);
2099
2100         if (tcp_checksum_complete(skb)) {
2101                 inet_twsk_put(inet_twsk(sk));
2102                 goto csum_error;
2103         }
2104         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2105         case TCP_TW_SYN: {
2106                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2107                                                         &tcp_hashinfo, skb,
2108                                                         __tcp_hdrlen(th),
2109                                                         iph->saddr, th->source,
2110                                                         iph->daddr, th->dest,
2111                                                         inet_iif(skb),
2112                                                         sdif);
2113                 if (sk2) {
2114                         inet_twsk_deschedule_put(inet_twsk(sk));
2115                         sk = sk2;
2116                         tcp_v4_restore_cb(skb);
2117                         refcounted = false;
2118                         goto process;
2119                 }
2120         }
2121                 /* to ACK */
2122                 fallthrough;
2123         case TCP_TW_ACK:
2124                 tcp_v4_timewait_ack(sk, skb);
2125                 break;
2126         case TCP_TW_RST:
2127                 tcp_v4_send_reset(sk, skb);
2128                 inet_twsk_deschedule_put(inet_twsk(sk));
2129                 goto discard_it;
2130         case TCP_TW_SUCCESS:;
2131         }
2132         goto discard_it;
2133 }
2134
2135 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2136         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2137         .twsk_unique    = tcp_twsk_unique,
2138         .twsk_destructor = tcp_twsk_destructor,
2139 };
2140
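/* Cache the skb's input route and incoming ifindex on the socket so the
 * established fast path can reuse them instead of redoing a route lookup.
 */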
2141 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2142 {
2143         struct dst_entry *dst = skb_dst(skb);
2144
2145         if (dst && dst_hold_safe(dst)) {
2146                 sk->sk_rx_dst = dst;
2147                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2148         }
2149 }
2150 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2151
2152 const struct inet_connection_sock_af_ops ipv4_specific = {
2153         .queue_xmit        = ip_queue_xmit,
2154         .send_check        = tcp_v4_send_check,
2155         .rebuild_header    = inet_sk_rebuild_header,
2156         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2157         .conn_request      = tcp_v4_conn_request,
2158         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2159         .net_header_len    = sizeof(struct iphdr),
2160         .setsockopt        = ip_setsockopt,
2161         .getsockopt        = ip_getsockopt,
2162         .addr2sockaddr     = inet_csk_addr2sockaddr,
2163         .sockaddr_len      = sizeof(struct sockaddr_in),
2164         .mtu_reduced       = tcp_v4_mtu_reduced,
2165 };
2166 EXPORT_SYMBOL(ipv4_specific);
2167
2168 #ifdef CONFIG_TCP_MD5SIG
2169 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2170         .md5_lookup             = tcp_v4_md5_lookup,
2171         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2172         .md5_parse              = tcp_v4_parse_md5_keys,
2173 };
2174 #endif
2175
2176 /* NOTE: A lot of things are set to zero explicitly by the call to
2177  *       sk_alloc(), so they need not be done here.
2178  */
2179 static int tcp_v4_init_sock(struct sock *sk)
2180 {
2181         struct inet_connection_sock *icsk = inet_csk(sk);
2182
2183         tcp_init_sock(sk);
2184
2185         icsk->icsk_af_ops = &ipv4_specific;
2186
2187 #ifdef CONFIG_TCP_MD5SIG
2188         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2189 #endif
2190
2191         return 0;
2192 }
2193
2194 void tcp_v4_destroy_sock(struct sock *sk)
2195 {
2196         struct tcp_sock *tp = tcp_sk(sk);
2197
2198         trace_tcp_destroy_sock(sk);
2199
2200         tcp_clear_xmit_timers(sk);
2201
2202         tcp_cleanup_congestion_control(sk);
2203
2204         tcp_cleanup_ulp(sk);
2205
2206         /* Clean up the write buffer. */
2207         tcp_write_queue_purge(sk);
2208
2209         /* Check if we want to disable active TFO */
2210         tcp_fastopen_active_disable_ofo_check(sk);
2211
2212         /* Cleans up our, hopefully empty, out_of_order_queue. */
2213         skb_rbtree_purge(&tp->out_of_order_queue);
2214
2215 #ifdef CONFIG_TCP_MD5SIG
2216         /* Clean up the MD5 key list, if any */
2217         if (tp->md5sig_info) {
2218                 tcp_clear_md5_list(sk);
2219                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2220                 tp->md5sig_info = NULL;
2221         }
2222 #endif
2223
2224         /* Clean up a referenced TCP bind bucket. */
2225         if (inet_csk(sk)->icsk_bind_hash)
2226                 inet_put_port(sk);
2227
2228         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2229
2230         /* If socket is aborted during connect operation */
2231         tcp_free_fastopen_req(tp);
2232         tcp_fastopen_destroy_cipher(sk);
2233         tcp_saved_syn_free(tp);
2234
2235         sk_sockets_allocated_dec(sk);
2236 }
2237 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2238
2239 #ifdef CONFIG_PROC_FS
2240 /* Proc filesystem TCP sock list dumping. */
2241
2242 /*
2243  * Get the next listener socket following cur.  If cur is NULL, get the first socket
2244  * starting from bucket given in st->bucket; when st->bucket is zero the
2245  * very first socket in the hash table is returned.
2246  */
2247 static void *listening_get_next(struct seq_file *seq, void *cur)
2248 {
2249         struct tcp_seq_afinfo *afinfo;
2250         struct tcp_iter_state *st = seq->private;
2251         struct net *net = seq_file_net(seq);
2252         struct inet_listen_hashbucket *ilb;
2253         struct hlist_nulls_node *node;
2254         struct sock *sk = cur;
2255
2256         if (st->bpf_seq_afinfo)
2257                 afinfo = st->bpf_seq_afinfo;
2258         else
2259                 afinfo = PDE_DATA(file_inode(seq->file));
2260
2261         if (!sk) {
2262 get_head:
2263                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2264                 spin_lock(&ilb->lock);
2265                 sk = sk_nulls_head(&ilb->nulls_head);
2266                 st->offset = 0;
2267                 goto get_sk;
2268         }
2269         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2270         ++st->num;
2271         ++st->offset;
2272
2273         sk = sk_nulls_next(sk);
2274 get_sk:
2275         sk_nulls_for_each_from(sk, node) {
2276                 if (!net_eq(sock_net(sk), net))
2277                         continue;
2278                 if (afinfo->family == AF_UNSPEC ||
2279                     sk->sk_family == afinfo->family)
2280                         return sk;
2281         }
2282         spin_unlock(&ilb->lock);
2283         st->offset = 0;
2284         if (++st->bucket < INET_LHTABLE_SIZE)
2285                 goto get_head;
2286         return NULL;
2287 }
2288
2289 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2290 {
2291         struct tcp_iter_state *st = seq->private;
2292         void *rc;
2293
2294         st->bucket = 0;
2295         st->offset = 0;
2296         rc = listening_get_next(seq, NULL);
2297
2298         while (rc && *pos) {
2299                 rc = listening_get_next(seq, rc);
2300                 --*pos;
2301         }
2302         return rc;
2303 }
2304
2305 static inline bool empty_bucket(const struct tcp_iter_state *st)
2306 {
2307         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2308 }
2309
2310 /*
2311  * Get first established socket starting from bucket given in st->bucket.
2312  * If st->bucket is zero, the very first socket in the hash is returned.
2313  */
2314 static void *established_get_first(struct seq_file *seq)
2315 {
2316         struct tcp_seq_afinfo *afinfo;
2317         struct tcp_iter_state *st = seq->private;
2318         struct net *net = seq_file_net(seq);
2319         void *rc = NULL;
2320
2321         if (st->bpf_seq_afinfo)
2322                 afinfo = st->bpf_seq_afinfo;
2323         else
2324                 afinfo = PDE_DATA(file_inode(seq->file));
2325
2326         st->offset = 0;
2327         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2328                 struct sock *sk;
2329                 struct hlist_nulls_node *node;
2330                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2331
2332                 /* Lockless fast path for the common case of empty buckets */
2333                 if (empty_bucket(st))
2334                         continue;
2335
2336                 spin_lock_bh(lock);
2337                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2338                         if ((afinfo->family != AF_UNSPEC &&
2339                              sk->sk_family != afinfo->family) ||
2340                             !net_eq(sock_net(sk), net)) {
2341                                 continue;
2342                         }
2343                         rc = sk;
2344                         goto out;
2345                 }
2346                 spin_unlock_bh(lock);
2347         }
2348 out:
2349         return rc;
2350 }
2351
2352 static void *established_get_next(struct seq_file *seq, void *cur)
2353 {
2354         struct tcp_seq_afinfo *afinfo;
2355         struct sock *sk = cur;
2356         struct hlist_nulls_node *node;
2357         struct tcp_iter_state *st = seq->private;
2358         struct net *net = seq_file_net(seq);
2359
2360         if (st->bpf_seq_afinfo)
2361                 afinfo = st->bpf_seq_afinfo;
2362         else
2363                 afinfo = PDE_DATA(file_inode(seq->file));
2364
2365         ++st->num;
2366         ++st->offset;
2367
2368         sk = sk_nulls_next(sk);
2369
2370         sk_nulls_for_each_from(sk, node) {
2371                 if ((afinfo->family == AF_UNSPEC ||
2372                      sk->sk_family == afinfo->family) &&
2373                     net_eq(sock_net(sk), net))
2374                         return sk;
2375         }
2376
2377         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2378         ++st->bucket;
2379         return established_get_first(seq);
2380 }
2381
2382 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2383 {
2384         struct tcp_iter_state *st = seq->private;
2385         void *rc;
2386
2387         st->bucket = 0;
2388         rc = established_get_first(seq);
2389
2390         while (rc && pos) {
2391                 rc = established_get_next(seq, rc);
2392                 --pos;
2393         }
2394         return rc;
2395 }
2396
2397 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2398 {
2399         void *rc;
2400         struct tcp_iter_state *st = seq->private;
2401
2402         st->state = TCP_SEQ_STATE_LISTENING;
2403         rc        = listening_get_idx(seq, &pos);
2404
2405         if (!rc) {
2406                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2407                 rc        = established_get_idx(seq, pos);
2408         }
2409
2410         return rc;
2411 }
2412
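/* Fast-forward the iterator to the position recorded by the previous read:
 * rewalk the saved listening or established bucket and skip st->offset
 * entries, so consecutive reads of the seq file do not restart from scratch.
 */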
2413 static void *tcp_seek_last_pos(struct seq_file *seq)
2414 {
2415         struct tcp_iter_state *st = seq->private;
2416         int offset = st->offset;
2417         int orig_num = st->num;
2418         void *rc = NULL;
2419
2420         switch (st->state) {
2421         case TCP_SEQ_STATE_LISTENING:
2422                 if (st->bucket >= INET_LHTABLE_SIZE)
2423                         break;
2424                 st->state = TCP_SEQ_STATE_LISTENING;
2425                 rc = listening_get_next(seq, NULL);
2426                 while (offset-- && rc)
2427                         rc = listening_get_next(seq, rc);
2428                 if (rc)
2429                         break;
2430                 st->bucket = 0;
2431                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2432                 fallthrough;
2433         case TCP_SEQ_STATE_ESTABLISHED:
2434                 if (st->bucket > tcp_hashinfo.ehash_mask)
2435                         break;
2436                 rc = established_get_first(seq);
2437                 while (offset-- && rc)
2438                         rc = established_get_next(seq, rc);
2439         }
2440
2441         st->num = orig_num;
2442
2443         return rc;
2444 }
2445
2446 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2447 {
2448         struct tcp_iter_state *st = seq->private;
2449         void *rc;
2450
2451         if (*pos && *pos == st->last_pos) {
2452                 rc = tcp_seek_last_pos(seq);
2453                 if (rc)
2454                         goto out;
2455         }
2456
2457         st->state = TCP_SEQ_STATE_LISTENING;
2458         st->num = 0;
2459         st->bucket = 0;
2460         st->offset = 0;
2461         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2462
2463 out:
2464         st->last_pos = *pos;
2465         return rc;
2466 }
2467 EXPORT_SYMBOL(tcp_seq_start);
2468
2469 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2470 {
2471         struct tcp_iter_state *st = seq->private;
2472         void *rc = NULL;
2473
2474         if (v == SEQ_START_TOKEN) {
2475                 rc = tcp_get_idx(seq, 0);
2476                 goto out;
2477         }
2478
2479         switch (st->state) {
2480         case TCP_SEQ_STATE_LISTENING:
2481                 rc = listening_get_next(seq, v);
2482                 if (!rc) {
2483                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2484                         st->bucket = 0;
2485                         st->offset = 0;
2486                         rc        = established_get_first(seq);
2487                 }
2488                 break;
2489         case TCP_SEQ_STATE_ESTABLISHED:
2490                 rc = established_get_next(seq, v);
2491                 break;
2492         }
2493 out:
2494         ++*pos;
2495         st->last_pos = *pos;
2496         return rc;
2497 }
2498 EXPORT_SYMBOL(tcp_seq_next);
2499
2500 void tcp_seq_stop(struct seq_file *seq, void *v)
2501 {
2502         struct tcp_iter_state *st = seq->private;
2503
2504         switch (st->state) {
2505         case TCP_SEQ_STATE_LISTENING:
2506                 if (v != SEQ_START_TOKEN)
2507                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2508                 break;
2509         case TCP_SEQ_STATE_ESTABLISHED:
2510                 if (v)
2511                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2512                 break;
2513         }
2514 }
2515 EXPORT_SYMBOL(tcp_seq_stop);
2516
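/* Print one /proc/net/tcp line for a request sock (reported as SYN_RECV). */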
2517 static void get_openreq4(const struct request_sock *req,
2518                          struct seq_file *f, int i)
2519 {
2520         const struct inet_request_sock *ireq = inet_rsk(req);
2521         long delta = req->rsk_timer.expires - jiffies;
2522
2523         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2524                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2525                 i,
2526                 ireq->ir_loc_addr,
2527                 ireq->ir_num,
2528                 ireq->ir_rmt_addr,
2529                 ntohs(ireq->ir_rmt_port),
2530                 TCP_SYN_RECV,
2531                 0, 0, /* could print option size, but that is af dependent. */
2532                 1,    /* timers active (only the expire timer) */
2533                 jiffies_delta_to_clock_t(delta),
2534                 req->num_timeout,
2535                 from_kuid_munged(seq_user_ns(f),
2536                                  sock_i_uid(req->rsk_listener)),
2537                 0,  /* non standard timer */
2538                 0, /* open_requests have no inode */
2539                 0,
2540                 req);
2541 }
2542
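/* Print one /proc/net/tcp line for a full socket, including queue sizes,
 * timer state and congestion window.
 */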
2543 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2544 {
2545         int timer_active;
2546         unsigned long timer_expires;
2547         const struct tcp_sock *tp = tcp_sk(sk);
2548         const struct inet_connection_sock *icsk = inet_csk(sk);
2549         const struct inet_sock *inet = inet_sk(sk);
2550         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2551         __be32 dest = inet->inet_daddr;
2552         __be32 src = inet->inet_rcv_saddr;
2553         __u16 destp = ntohs(inet->inet_dport);
2554         __u16 srcp = ntohs(inet->inet_sport);
2555         int rx_queue;
2556         int state;
2557
2558         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2559             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2560             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2561                 timer_active    = 1;
2562                 timer_expires   = icsk->icsk_timeout;
2563         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2564                 timer_active    = 4;
2565                 timer_expires   = icsk->icsk_timeout;
2566         } else if (timer_pending(&sk->sk_timer)) {
2567                 timer_active    = 2;
2568                 timer_expires   = sk->sk_timer.expires;
2569         } else {
2570                 timer_active    = 0;
2571                 timer_expires = jiffies;
2572         }
2573
2574         state = inet_sk_state_load(sk);
2575         if (state == TCP_LISTEN)
2576                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2577         else
2578                 /* Because we don't lock the socket,
2579                  * we might find a transient negative value.
2580                  */
2581                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2582                                       READ_ONCE(tp->copied_seq), 0);
2583
2584         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2585                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2586                 i, src, srcp, dest, destp, state,
2587                 READ_ONCE(tp->write_seq) - tp->snd_una,
2588                 rx_queue,
2589                 timer_active,
2590                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2591                 icsk->icsk_retransmits,
2592                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2593                 icsk->icsk_probes_out,
2594                 sock_i_ino(sk),
2595                 refcount_read(&sk->sk_refcnt), sk,
2596                 jiffies_to_clock_t(icsk->icsk_rto),
2597                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2598                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2599                 tp->snd_cwnd,
2600                 state == TCP_LISTEN ?
2601                     fastopenq->max_qlen :
2602                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2603 }
2604
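/* Print one /proc/net/tcp line for a TIME_WAIT socket. */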
2605 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2606                                struct seq_file *f, int i)
2607 {
2608         long delta = tw->tw_timer.expires - jiffies;
2609         __be32 dest, src;
2610         __u16 destp, srcp;
2611
2612         dest  = tw->tw_daddr;
2613         src   = tw->tw_rcv_saddr;
2614         destp = ntohs(tw->tw_dport);
2615         srcp  = ntohs(tw->tw_sport);
2616
2617         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2618                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2619                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2620                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2621                 refcount_read(&tw->tw_refcnt), tw);
2622 }
2623
2624 #define TMPSZ 150
2625
2626 static int tcp4_seq_show(struct seq_file *seq, void *v)
2627 {
2628         struct tcp_iter_state *st;
2629         struct sock *sk = v;
2630
2631         seq_setwidth(seq, TMPSZ - 1);
2632         if (v == SEQ_START_TOKEN) {
2633                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2634                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2635                            "inode");
2636                 goto out;
2637         }
2638         st = seq->private;
2639
2640         if (sk->sk_state == TCP_TIME_WAIT)
2641                 get_timewait4_sock(v, seq, st->num);
2642         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2643                 get_openreq4(v, seq, st->num);
2644         else
2645                 get_tcp4_sock(v, seq, st->num);
2646 out:
2647         seq_pad(seq, '\n');
2648         return 0;
2649 }
2650
2651 #ifdef CONFIG_BPF_SYSCALL
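/* Context passed to BPF iterator programs attached to TCP sockets: the
 * iterator invokes the program once per socket together with the (munged)
 * owner uid.
 */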
2652 struct bpf_iter__tcp {
2653         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2654         __bpf_md_ptr(struct sock_common *, sk_common);
2655         uid_t uid __aligned(8);
2656 };
2657
2658 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2659                              struct sock_common *sk_common, uid_t uid)
2660 {
2661         struct bpf_iter__tcp ctx;
2662
2663         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2664         ctx.meta = meta;
2665         ctx.sk_common = sk_common;
2666         ctx.uid = uid;
2667         return bpf_iter_run_prog(prog, &ctx);
2668 }
2669
2670 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2671 {
2672         struct bpf_iter_meta meta;
2673         struct bpf_prog *prog;
2674         struct sock *sk = v;
2675         uid_t uid;
2676
2677         if (v == SEQ_START_TOKEN)
2678                 return 0;
2679
2680         if (sk->sk_state == TCP_TIME_WAIT) {
2681                 uid = 0;
2682         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2683                 const struct request_sock *req = v;
2684
2685                 uid = from_kuid_munged(seq_user_ns(seq),
2686                                        sock_i_uid(req->rsk_listener));
2687         } else {
2688                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2689         }
2690
2691         meta.seq = seq;
2692         prog = bpf_iter_get_info(&meta, false);
2693         return tcp_prog_seq_show(prog, &meta, v, uid);
2694 }
2695
2696 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2697 {
2698         struct bpf_iter_meta meta;
2699         struct bpf_prog *prog;
2700
2701         if (!v) {
2702                 meta.seq = seq;
2703                 prog = bpf_iter_get_info(&meta, true);
2704                 if (prog)
2705                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
2706         }
2707
2708         tcp_seq_stop(seq, v);
2709 }
2710
2711 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2712         .show           = bpf_iter_tcp_seq_show,
2713         .start          = tcp_seq_start,
2714         .next           = tcp_seq_next,
2715         .stop           = bpf_iter_tcp_seq_stop,
2716 };
2717 #endif
2718
2719 static const struct seq_operations tcp4_seq_ops = {
2720         .show           = tcp4_seq_show,
2721         .start          = tcp_seq_start,
2722         .next           = tcp_seq_next,
2723         .stop           = tcp_seq_stop,
2724 };
2725
2726 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2727         .family         = AF_INET,
2728 };
2729
2730 static int __net_init tcp4_proc_init_net(struct net *net)
2731 {
2732         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2733                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2734                 return -ENOMEM;
2735         return 0;
2736 }
2737
2738 static void __net_exit tcp4_proc_exit_net(struct net *net)
2739 {
2740         remove_proc_entry("tcp", net->proc_net);
2741 }
2742
2743 static struct pernet_operations tcp4_net_ops = {
2744         .init = tcp4_proc_init_net,
2745         .exit = tcp4_proc_exit_net,
2746 };
2747
2748 int __init tcp4_proc_init(void)
2749 {
2750         return register_pernet_subsys(&tcp4_net_ops);
2751 }
2752
2753 void tcp4_proc_exit(void)
2754 {
2755         unregister_pernet_subsys(&tcp4_net_ops);
2756 }
2757 #endif /* CONFIG_PROC_FS */
2758
2759 struct proto tcp_prot = {
2760         .name                   = "TCP",
2761         .owner                  = THIS_MODULE,
2762         .close                  = tcp_close,
2763         .pre_connect            = tcp_v4_pre_connect,
2764         .connect                = tcp_v4_connect,
2765         .disconnect             = tcp_disconnect,
2766         .accept                 = inet_csk_accept,
2767         .ioctl                  = tcp_ioctl,
2768         .init                   = tcp_v4_init_sock,
2769         .destroy                = tcp_v4_destroy_sock,
2770         .shutdown               = tcp_shutdown,
2771         .setsockopt             = tcp_setsockopt,
2772         .getsockopt             = tcp_getsockopt,
2773         .keepalive              = tcp_set_keepalive,
2774         .recvmsg                = tcp_recvmsg,
2775         .sendmsg                = tcp_sendmsg,
2776         .sendpage               = tcp_sendpage,
2777         .backlog_rcv            = tcp_v4_do_rcv,
2778         .release_cb             = tcp_release_cb,
2779         .hash                   = inet_hash,
2780         .unhash                 = inet_unhash,
2781         .get_port               = inet_csk_get_port,
2782         .enter_memory_pressure  = tcp_enter_memory_pressure,
2783         .leave_memory_pressure  = tcp_leave_memory_pressure,
2784         .stream_memory_free     = tcp_stream_memory_free,
2785         .sockets_allocated      = &tcp_sockets_allocated,
2786         .orphan_count           = &tcp_orphan_count,
2787         .memory_allocated       = &tcp_memory_allocated,
2788         .memory_pressure        = &tcp_memory_pressure,
2789         .sysctl_mem             = sysctl_tcp_mem,
2790         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2791         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2792         .max_header             = MAX_TCP_HEADER,
2793         .obj_size               = sizeof(struct tcp_sock),
2794         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2795         .twsk_prot              = &tcp_timewait_sock_ops,
2796         .rsk_prot               = &tcp_request_sock_ops,
2797         .h.hashinfo             = &tcp_hashinfo,
2798         .no_autobind            = true,
2799         .diag_destroy           = tcp_abort,
2800 };
2801 EXPORT_SYMBOL(tcp_prot);
2802
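/* Per-netns teardown: drop the reference on the namespace's congestion
 * control module and destroy the per-cpu control sockets used for sending
 * resets and ACKs.
 */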
2803 static void __net_exit tcp_sk_exit(struct net *net)
2804 {
2805         int cpu;
2806
2807         if (net->ipv4.tcp_congestion_control)
2808                 bpf_module_put(net->ipv4.tcp_congestion_control,
2809                                net->ipv4.tcp_congestion_control->owner);
2810
2811         for_each_possible_cpu(cpu)
2812                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2813         free_percpu(net->ipv4.tcp_sk);
2814 }
2815
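/* Per-netns initialisation: create one control socket per possible CPU (used
 * to send RSTs and non-socket ACKs) and install the default values for the
 * namespace's TCP sysctls.
 */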
2816 static int __net_init tcp_sk_init(struct net *net)
2817 {
2818         int res, cpu, cnt;
2819
2820         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2821         if (!net->ipv4.tcp_sk)
2822                 return -ENOMEM;
2823
2824         for_each_possible_cpu(cpu) {
2825                 struct sock *sk;
2826
2827                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2828                                            IPPROTO_TCP, net);
2829                 if (res)
2830                         goto fail;
2831                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2832
2833                 /* Please enforce IP_DF and IPID==0 for RST and
2834                  * ACK sent in SYN-RECV and TIME-WAIT state.
2835                  */
2836                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2837
2838                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2839         }
2840
        net->ipv4.sysctl_tcp_ecn = 2;
        net->ipv4.sysctl_tcp_ecn_fallback = 1;

        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
        net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
        net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
        net->ipv4.sysctl_tcp_syncookies = 1;
        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
        net->ipv4.sysctl_tcp_orphan_retries = 0;
        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
        net->ipv4.sysctl_tcp_tw_reuse = 2;
        net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

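        /* Scale the TIME-WAIT bucket and SYN backlog limits with the size of
         * the established hash table, so bigger machines get bigger defaults.
         */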
        cnt = tcp_hashinfo.ehash_mask + 1;
        net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
        net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
        net->ipv4.sysctl_tcp_sack = 1;
        net->ipv4.sysctl_tcp_window_scaling = 1;
        net->ipv4.sysctl_tcp_timestamps = 1;
        net->ipv4.sysctl_tcp_early_retrans = 3;
        net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
        net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
        net->ipv4.sysctl_tcp_retrans_collapse = 1;
        net->ipv4.sysctl_tcp_max_reordering = 300;
        net->ipv4.sysctl_tcp_dsack = 1;
        net->ipv4.sysctl_tcp_app_win = 31;
        net->ipv4.sysctl_tcp_adv_win_scale = 1;
        net->ipv4.sysctl_tcp_frto = 2;
        net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
        /* This limits the percentage of the congestion window which we
         * will allow a single TSO frame to consume.  Building TSO frames
         * which are too large can cause TCP streams to be bursty.
         */
        net->ipv4.sysctl_tcp_tso_win_divisor = 3;
        /* Default TSQ limit of 16 TSO segments */
        net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
        /* rfc5961 challenge ack rate limiting */
        net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
        net->ipv4.sysctl_tcp_min_tso_segs = 2;
        net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
        net->ipv4.sysctl_tcp_autocorking = 1;
        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
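        /* Child namespaces start out with the receive/send buffer limits that
         * init_net has at the time the namespace is created; later changes to
         * init_net's tcp_rmem/tcp_wmem do not propagate.
         */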
        if (net != &init_net) {
                memcpy(net->ipv4.sysctl_tcp_rmem,
                       init_net.ipv4.sysctl_tcp_rmem,
                       sizeof(init_net.ipv4.sysctl_tcp_rmem));
                memcpy(net->ipv4.sysctl_tcp_wmem,
                       init_net.ipv4.sysctl_tcp_wmem,
                       sizeof(init_net.ipv4.sysctl_tcp_wmem));
        }
        net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
        net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
        net->ipv4.sysctl_tcp_comp_sack_nr = 44;
        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
        spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
        atomic_set(&net->ipv4.tfo_active_disable_times, 0);

        /* Reno is always built in, so it is a safe fallback when the parent
         * namespace's congestion control module cannot be referenced.
         */
        if (!net_eq(net, &init_net) &&
            bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
                               init_net.ipv4.tcp_congestion_control->owner))
                net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
        else
                net->ipv4.tcp_congestion_control = &tcp_reno;

        return 0;
fail:
        tcp_sk_exit(net);

        return res;
}

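/* Batched per-netns exit: purge IPv4 TIME-WAIT sockets once for the whole
 * batch of dying namespaces, then release each namespace's TCP Fast Open
 * key context.
 */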
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
        struct net *net;

        inet_twsk_purge(&tcp_hashinfo, AF_INET);

        list_for_each_entry(net, net_exit_list, exit_list)
                tcp_fastopen_ctx_destroy(net);
}

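/* Wire the init/exit handlers above into the network namespace lifecycle;
 * exit_batch runs once per batch of namespaces being torn down.
 */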
static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init       = tcp_sk_init,
       .exit       = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
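/* Declare the "tcp" BPF iterator: the attached program sees each visited
 * socket as a struct sock_common pointer together with the owning user's uid.
 */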
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
                     struct sock_common *sk_common, uid_t uid)

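/* Seq-file private state setup for the iterator: use an AF_UNSPEC afinfo so
 * both IPv4 and IPv6 sockets are walked, then initialize the netns-aware
 * seq_file state.  The afinfo is freed again if that initialization fails.
 */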
static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
        struct tcp_iter_state *st = priv_data;
        struct tcp_seq_afinfo *afinfo;
        int ret;

        afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
        if (!afinfo)
                return -ENOMEM;

        afinfo->family = AF_UNSPEC;
        st->bpf_seq_afinfo = afinfo;
        ret = bpf_iter_init_seq_net(priv_data, aux);
        if (ret)
                kfree(afinfo);
        return ret;
}

static void bpf_iter_fini_tcp(void *priv_data)
{
        struct tcp_iter_state *st = priv_data;

        kfree(st->bpf_seq_afinfo);
        bpf_iter_fini_seq_net(priv_data);
}

static const struct bpf_iter_seq_info tcp_seq_info = {
        .seq_ops                = &bpf_iter_tcp_seq_ops,
        .init_seq_private       = bpf_iter_init_tcp,
        .fini_seq_private       = bpf_iter_fini_tcp,
        .seq_priv_size          = sizeof(struct tcp_iter_state),
};

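/* Registration info for the "tcp" iterator target.  The sk_common context
 * argument is marked PTR_TO_BTF_ID_OR_NULL because the program is also
 * invoked once with a NULL socket at the end of the walk.
 */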
static struct bpf_iter_reg tcp_reg_info = {
        .target                 = "tcp",
        .ctx_arg_info_size      = 1,
        .ctx_arg_info           = {
                { offsetof(struct bpf_iter__tcp, sk_common),
                  PTR_TO_BTF_ID_OR_NULL },
        },
        .seq_info               = &tcp_seq_info,
};

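/* Resolve the BTF id for struct sock_common and register the iterator target
 * at boot.  A failure only costs the iterator, so it is reported as a warning
 * rather than an error.
 *
 * Illustrative userspace usage (not part of this file): an iterator program
 * can be pinned with "bpftool iter pin tcp_iter.o /sys/fs/bpf/tcp" and read
 * back with "cat /sys/fs/bpf/tcp", where tcp_iter.o is a hypothetical BPF
 * object built against this target.
 */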
static void __init bpf_iter_register(void)
{
        tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
        if (bpf_iter_reg_target(&tcp_reg_info))
                pr_warn("Warning: could not register bpf iterator tcp\n");
}

#endif

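/* Called once during IPv4 stack initialization.  Registering the pernet
 * operations also runs tcp_sk_init() for the initial namespace, so a failure
 * here means TCP cannot send resets or control ACKs at all, hence the panic.
 */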
void __init tcp_v4_init(void)
{
        if (register_pernet_subsys(&tcp_sk_ops))
                panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
        bpf_iter_register();
#endif
}