net/ipv4/tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96         return secure_tcp_seq(ip_hdr(skb)->daddr,
97                               ip_hdr(skb)->saddr,
98                               tcp_hdr(skb)->dest,
99                               tcp_hdr(skb)->source);
100 }
101
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
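
/* Both helpers above mix the connection identifiers with a boot-time secret
 * (secure_tcp_seq() hashes the full 4-tuple, secure_tcp_ts_off() only the
 * address pair), so initial sequence numbers and timestamp offsets cannot be
 * guessed by off-path attackers.
 */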
106
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109         const struct inet_timewait_sock *tw = inet_twsk(sktw);
110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111         struct tcp_sock *tp = tcp_sk(sk);
112         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113
114         if (reuse == 2) {
115                 /* Still does not detect *everything* that goes through
116                  * lo, since we require a loopback src or dst address
117                  * or direct binding to 'lo' interface.
118                  */
119                 bool loopback = false;
120                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121                         loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123                 if (tw->tw_family == AF_INET6) {
124                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128                                 loopback = true;
129                 } else
130 #endif
131                 {
132                         if (ipv4_is_loopback(tw->tw_daddr) ||
133                             ipv4_is_loopback(tw->tw_rcv_saddr))
134                                 loopback = true;
135                 }
136                 if (!loopback)
137                         reuse = 0;
138         }
139
140         /* With PAWS, it is safe from the viewpoint
141            of data integrity. Even without PAWS it is safe provided sequence
142            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
143
144            Actually, the idea is close to VJ's: only the timestamp cache is
145            held not per host but per port pair, and the TW bucket is used as
146            the state holder.
147
148            If the TW bucket has already been destroyed we fall back to VJ's
149            scheme and use the initial timestamp retrieved from the peer table.
150          */
151         if (tcptw->tw_ts_recent_stamp &&
152             (!twp || (reuse && time_after32(ktime_get_seconds(),
153                                             tcptw->tw_ts_recent_stamp)))) {
154                 /* In case of repair and re-using TIME-WAIT sockets we still
155                  * want to be sure that it is safe as above but honor the
156                  * sequence numbers and time stamps set as part of the repair
157                  * process.
158                  *
159                  * Without this check re-using a TIME-WAIT socket with TCP
160                  * repair would accumulate a -1 on the repair assigned
161                  * sequence number. The first time it is reused the sequence
162                  * is -1, the second time -2, etc. This fixes that issue
163                  * without appearing to create any others.
164                  */
165                 if (likely(!tp->repair)) {
166                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167
168                         if (!seq)
169                                 seq = 1;
170                         WRITE_ONCE(tp->write_seq, seq);
171                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
172                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173                 }
174                 sock_hold(sktw);
175                 return 1;
176         }
177
178         return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
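
/* The reuse policy above is selected by the net.ipv4.tcp_tw_reuse sysctl:
 * 0 disables reuse, 1 allows it whenever the timestamp check passes, and 2
 * (the default on recent kernels) additionally requires the loopback
 * conditions checked in the reuse == 2 branch.
 */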
181
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183                               int addr_len)
184 {
185         /* This check is replicated from tcp_v4_connect() and intended to
186          * prevent BPF program called below from accessing bytes that are out
187          * of the bound specified by user in addr_len.
188          */
189         if (addr_len < sizeof(struct sockaddr_in))
190                 return -EINVAL;
191
192         sock_owned_by_me(sk);
193
194         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201         struct inet_sock *inet = inet_sk(sk);
202         struct tcp_sock *tp = tcp_sk(sk);
203         __be16 orig_sport, orig_dport;
204         __be32 daddr, nexthop;
205         struct flowi4 *fl4;
206         struct rtable *rt;
207         int err;
208         struct ip_options_rcu *inet_opt;
209         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210
211         if (addr_len < sizeof(struct sockaddr_in))
212                 return -EINVAL;
213
214         if (usin->sin_family != AF_INET)
215                 return -EAFNOSUPPORT;
216
217         nexthop = daddr = usin->sin_addr.s_addr;
218         inet_opt = rcu_dereference_protected(inet->inet_opt,
219                                              lockdep_sock_is_held(sk));
220         if (inet_opt && inet_opt->opt.srr) {
221                 if (!daddr)
222                         return -EINVAL;
223                 nexthop = inet_opt->opt.faddr;
224         }
225
226         orig_sport = inet->inet_sport;
227         orig_dport = usin->sin_port;
228         fl4 = &inet->cork.fl.u.ip4;
229         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231                               IPPROTO_TCP,
232                               orig_sport, orig_dport, sk);
233         if (IS_ERR(rt)) {
234                 err = PTR_ERR(rt);
235                 if (err == -ENETUNREACH)
236                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237                 return err;
238         }
239
240         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241                 ip_rt_put(rt);
242                 return -ENETUNREACH;
243         }
244
245         if (!inet_opt || !inet_opt->opt.srr)
246                 daddr = fl4->daddr;
247
248         if (!inet->inet_saddr)
249                 inet->inet_saddr = fl4->saddr;
250         sk_rcv_saddr_set(sk, inet->inet_saddr);
251
252         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253                 /* Reset inherited state */
254                 tp->rx_opt.ts_recent       = 0;
255                 tp->rx_opt.ts_recent_stamp = 0;
256                 if (likely(!tp->repair))
257                         WRITE_ONCE(tp->write_seq, 0);
258         }
259
260         inet->inet_dport = usin->sin_port;
261         sk_daddr_set(sk, daddr);
262
263         inet_csk(sk)->icsk_ext_hdr_len = 0;
264         if (inet_opt)
265                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266
267         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268
269         /* Socket identity is still unknown (sport may be zero).
270          * However we set the state to SYN-SENT and, without releasing the
271          * socket lock, select a source port, enter ourselves into the hash
272          * tables and complete initialization after this.
273          */
274         tcp_set_state(sk, TCP_SYN_SENT);
275         err = inet_hash_connect(tcp_death_row, sk);
276         if (err)
277                 goto failure;
278
279         sk_set_txhash(sk);
280
281         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282                                inet->inet_sport, inet->inet_dport, sk);
283         if (IS_ERR(rt)) {
284                 err = PTR_ERR(rt);
285                 rt = NULL;
286                 goto failure;
287         }
288         /* OK, now commit destination to socket.  */
289         sk->sk_gso_type = SKB_GSO_TCPV4;
290         sk_setup_caps(sk, &rt->dst);
291         rt = NULL;
292
293         if (likely(!tp->repair)) {
294                 if (!tp->write_seq)
295                         WRITE_ONCE(tp->write_seq,
296                                    secure_tcp_seq(inet->inet_saddr,
297                                                   inet->inet_daddr,
298                                                   inet->inet_sport,
299                                                   usin->sin_port));
300                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301                                                  inet->inet_saddr,
302                                                  inet->inet_daddr);
303         }
304
305         inet->inet_id = prandom_u32();
306
307         if (tcp_fastopen_defer_connect(sk, &err))
308                 return err;
309         if (err)
310                 goto failure;
311
312         err = tcp_connect(sk);
313
314         if (err)
315                 goto failure;
316
317         return 0;
318
319 failure:
320         /*
321          * This unhashes the socket and releases the local port,
322          * if necessary.
323          */
324         tcp_set_state(sk, TCP_CLOSE);
325         ip_rt_put(rt);
326         sk->sk_route_caps = 0;
327         inet->inet_dport = 0;
328         return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
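
/* Illustration (userspace, not part of this file): a plain blocking
 * connect() on a TCP socket reaches tcp_v4_connect() via
 * inet_stream_connect():
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * The kernel resolves a route, moves the socket to SYN-SENT, picks a source
 * port in inet_hash_connect() and finally sends the SYN from tcp_connect().
 */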
331
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334  * It can be called through tcp_release_cb() if socket was owned by user
335  * at the time tcp_v4_err() was called to handle ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339         struct inet_sock *inet = inet_sk(sk);
340         struct dst_entry *dst;
341         u32 mtu;
342
343         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344                 return;
345         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
346         dst = inet_csk_update_pmtu(sk, mtu);
347         if (!dst)
348                 return;
349
350         /* Something is about to go wrong... Remember the soft error
351          * in case this connection is not able to recover.
352          */
353         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354                 sk->sk_err_soft = EMSGSIZE;
355
356         mtu = dst_mtu(dst);
357
358         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359             ip_sk_accept_pmtu(sk) &&
360             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361                 tcp_sync_mss(sk, mtu);
362
363                 /* Resend the TCP packet because it's
364                  * clear that the old packet has been
365                  * dropped. This is the new "fast" path mtu
366                  * discovery.
367                  */
368                 tcp_simple_retransmit(sk);
369         } /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375         struct dst_entry *dst = __sk_dst_check(sk, 0);
376
377         if (dst)
378                 dst->ops->redirect(dst, sk, skb);
379 }
380
381
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385         struct request_sock *req = inet_reqsk(sk);
386         struct net *net = sock_net(sk);
387
388         /* ICMPs are not backlogged, hence we cannot get
389          * an established socket here.
390          */
391         if (seq != tcp_rsk(req)->snt_isn) {
392                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393         } else if (abort) {
394                 /*
395                  * Still in SYN_RECV, just remove it silently.
396                  * There is no good way to pass the error to the newly
397                  * created socket, and POSIX does not want network
398                  * errors returned from accept().
399                  */
400                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401                 tcp_listendrop(req->rsk_listener);
402         }
403         reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410         struct inet_connection_sock *icsk = inet_csk(sk);
411         struct tcp_sock *tp = tcp_sk(sk);
412         struct sk_buff *skb;
413         s32 remaining;
414         u32 delta_us;
415
416         if (sock_owned_by_user(sk))
417                 return;
418
419         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420             !icsk->icsk_backoff)
421                 return;
422
423         skb = tcp_rtx_queue_head(sk);
424         if (WARN_ON_ONCE(!skb))
425                 return;
426
427         icsk->icsk_backoff--;
428         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430
431         tcp_mstamp_refresh(tp);
432         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434
435         if (remaining > 0) {
436                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437                                           remaining, TCP_RTO_MAX);
438         } else {
439                 /* RTO revert clocked out retransmission.
440                  * Will retransmit now.
441                  */
442                 tcp_retransmit_timer(sk);
443         }
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
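
/* The revert above implements RFC 6069 (TCP-LD): an ICMP unreachable that
 * covers the oldest unacknowledged sequence suggests the earlier loss was a
 * routing transient, so one exponential-backoff step is undone and the
 * retransmission timer is rearmed (or fired immediately if it has already
 * expired).
 */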
446
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment the
452  * header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else accesses the socket the ICMP is just dropped
457  * and for some paths there is no check at all.
458  * A more general error queue to queue errors for later handling
459  * is probably better.
460  *
461  */
462
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465         const struct iphdr *iph = (const struct iphdr *)skb->data;
466         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467         struct tcp_sock *tp;
468         struct inet_sock *inet;
469         const int type = icmp_hdr(skb)->type;
470         const int code = icmp_hdr(skb)->code;
471         struct sock *sk;
472         struct request_sock *fastopen;
473         u32 seq, snd_una;
474         int err;
475         struct net *net = dev_net(skb->dev);
476
477         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478                                        th->dest, iph->saddr, ntohs(th->source),
479                                        inet_iif(skb), 0);
480         if (!sk) {
481                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482                 return -ENOENT;
483         }
484         if (sk->sk_state == TCP_TIME_WAIT) {
485                 inet_twsk_put(inet_twsk(sk));
486                 return 0;
487         }
488         seq = ntohl(th->seq);
489         if (sk->sk_state == TCP_NEW_SYN_RECV) {
490                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491                                      type == ICMP_TIME_EXCEEDED ||
492                                      (type == ICMP_DEST_UNREACH &&
493                                       (code == ICMP_NET_UNREACH ||
494                                        code == ICMP_HOST_UNREACH)));
495                 return 0;
496         }
497
498         bh_lock_sock(sk);
499         /* If too many ICMPs get dropped on busy
500          * servers this needs to be solved differently.
501          * We do take care of the PMTU discovery (RFC 1191) special case:
502          * we can receive locally generated ICMP messages while the socket is held.
503          */
504         if (sock_owned_by_user(sk)) {
505                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507         }
508         if (sk->sk_state == TCP_CLOSE)
509                 goto out;
510
511         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513                 goto out;
514         }
515
516         tp = tcp_sk(sk);
517         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
518         fastopen = rcu_dereference(tp->fastopen_rsk);
519         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520         if (sk->sk_state != TCP_LISTEN &&
521             !between(seq, snd_una, tp->snd_nxt)) {
522                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523                 goto out;
524         }
525
526         switch (type) {
527         case ICMP_REDIRECT:
528                 if (!sock_owned_by_user(sk))
529                         do_redirect(skb, sk);
530                 goto out;
531         case ICMP_SOURCE_QUENCH:
532                 /* Just silently ignore these. */
533                 goto out;
534         case ICMP_PARAMETERPROB:
535                 err = EPROTO;
536                 break;
537         case ICMP_DEST_UNREACH:
538                 if (code > NR_ICMP_UNREACH)
539                         goto out;
540
541                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542                         /* We are not interested in TCP_LISTEN and open_requests
543                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
544                          * they should go through unfragmented).
545                          */
546                         if (sk->sk_state == TCP_LISTEN)
547                                 goto out;
548
549                         WRITE_ONCE(tp->mtu_info, info);
550                         if (!sock_owned_by_user(sk)) {
551                                 tcp_v4_mtu_reduced(sk);
552                         } else {
553                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554                                         sock_hold(sk);
555                         }
556                         goto out;
557                 }
558
559                 err = icmp_err_convert[code].errno;
560                 /* check if this ICMP message allows revert of backoff.
561                  * (see RFC 6069)
562                  */
563                 if (!fastopen &&
564                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565                         tcp_ld_RTO_revert(sk, seq);
566                 break;
567         case ICMP_TIME_EXCEEDED:
568                 err = EHOSTUNREACH;
569                 break;
570         default:
571                 goto out;
572         }
573
574         switch (sk->sk_state) {
575         case TCP_SYN_SENT:
576         case TCP_SYN_RECV:
577                 /* Only in fast or simultaneous open. If a fast open socket is
578                  * already accepted it is treated as a connected one below.
579                  */
580                 if (fastopen && !fastopen->sk)
581                         break;
582
583                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584
585                 if (!sock_owned_by_user(sk)) {
586                         sk->sk_err = err;
587
588                         sk_error_report(sk);
589
590                         tcp_done(sk);
591                 } else {
592                         sk->sk_err_soft = err;
593                 }
594                 goto out;
595         }
596
597         /* If we've already connected we will keep trying
598          * until we time out, or the user gives up.
599          *
600          * RFC 1122 4.2.3.9 allows us to consider as hard errors
601          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602          * but it is obsoleted by PMTU discovery).
603          *
604          * Note that in the modern internet, where routing is unreliable
605          * and broken firewalls sit in every dark corner, sending random
606          * errors ordered by their masters, even these two messages finally
607          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
608          *
609          * Now we are in compliance with RFCs.
610          *                                                      --ANK (980905)
611          */
612
613         inet = inet_sk(sk);
614         if (!sock_owned_by_user(sk) && inet->recverr) {
615                 sk->sk_err = err;
616                 sk_error_report(sk);
617         } else  { /* Only an error on timeout */
618                 sk->sk_err_soft = err;
619         }
620
621 out:
622         bh_unlock_sock(sk);
623         sock_put(sk);
624         return 0;
625 }
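
/* Error delivery policy used above: hard errors on a connecting socket set
 * sk->sk_err and tear the connection down, so the pending connect() fails;
 * on an established socket sk->sk_err is only set when IP_RECVERR is
 * enabled, otherwise the error is kept in sk_err_soft and surfaces only if
 * the connection eventually times out.
 */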
626
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 {
629         struct tcphdr *th = tcp_hdr(skb);
630
631         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632         skb->csum_start = skb_transport_header(skb) - skb->head;
633         skb->csum_offset = offsetof(struct tcphdr, check);
634 }
635
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 {
639         const struct inet_sock *inet = inet_sk(sk);
640
641         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 }
643 EXPORT_SYMBOL(tcp_v4_send_check);
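
/* __tcp_v4_send_check() only stores the pseudo-header sum in th->check and
 * records csum_start/csum_offset, i.e. the skb is left in CHECKSUM_PARTIAL
 * state; the NIC (or skb_checksum_help() on the software fallback path)
 * folds in the payload later.
 */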
644
645 /*
646  *      This routine will send an RST to the other tcp.
647  *
648  *      Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
649  *                    for the reset?
650  *      Answer: if a packet caused a RST, it is not for a socket
651  *              existing in our system; if it is matched to a socket,
652  *              it is just a duplicate segment or a bug in the other side's TCP.
653  *              So we build the reply based only on parameters that
654  *              arrived with the segment.
655  *      Exception: precedence violation. We do not implement it in any case.
656  */
657
658 #ifdef CONFIG_TCP_MD5SIG
659 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
660 #else
661 #define OPTION_BYTES sizeof(__be32)
662 #endif
663
664 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
665 {
666         const struct tcphdr *th = tcp_hdr(skb);
667         struct {
668                 struct tcphdr th;
669                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
670         } rep;
671         struct ip_reply_arg arg;
672 #ifdef CONFIG_TCP_MD5SIG
673         struct tcp_md5sig_key *key = NULL;
674         const __u8 *hash_location = NULL;
675         unsigned char newhash[16];
676         int genhash;
677         struct sock *sk1 = NULL;
678 #endif
679         u64 transmit_time = 0;
680         struct sock *ctl_sk;
681         struct net *net;
682
683         /* Never send a reset in response to a reset. */
684         if (th->rst)
685                 return;
686
687         /* If sk is not NULL, it means we did a successful lookup and the
688          * incoming route had to be correct. prequeue might have dropped our dst.
689          */
690         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
691                 return;
692
693         /* Swap the send and the receive. */
694         memset(&rep, 0, sizeof(rep));
695         rep.th.dest   = th->source;
696         rep.th.source = th->dest;
697         rep.th.doff   = sizeof(struct tcphdr) / 4;
698         rep.th.rst    = 1;
699
700         if (th->ack) {
701                 rep.th.seq = th->ack_seq;
702         } else {
703                 rep.th.ack = 1;
704                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
705                                        skb->len - (th->doff << 2));
706         }
707
708         memset(&arg, 0, sizeof(arg));
709         arg.iov[0].iov_base = (unsigned char *)&rep;
710         arg.iov[0].iov_len  = sizeof(rep.th);
711
712         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
713 #ifdef CONFIG_TCP_MD5SIG
714         rcu_read_lock();
715         hash_location = tcp_parse_md5sig_option(th);
716         if (sk && sk_fullsock(sk)) {
717                 const union tcp_md5_addr *addr;
718                 int l3index;
719
720                 /* sdif set, means packet ingressed via a device
721                  * in an L3 domain and inet_iif is set to it.
722                  */
723                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
724                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
725                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
726         } else if (hash_location) {
727                 const union tcp_md5_addr *addr;
728                 int sdif = tcp_v4_sdif(skb);
729                 int dif = inet_iif(skb);
730                 int l3index;
731
732                 /*
733                  * The active side is lost. Try to find the listening socket
734                  * through the source port, and then find the MD5 key through
735                  * the listening socket. We do not lose security here:
736                  * the incoming packet is checked against the MD5 hash of the
737                  * found key, and no RST is generated if the hash doesn't match.
738                  */
739                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
740                                              ip_hdr(skb)->saddr,
741                                              th->source, ip_hdr(skb)->daddr,
742                                              ntohs(th->source), dif, sdif);
743                 /* don't send rst if it can't find key */
744                 if (!sk1)
745                         goto out;
746
747                 /* sdif set, means packet ingressed via a device
748                  * in an L3 domain and dif is set to it.
749                  */
750                 l3index = sdif ? dif : 0;
751                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
752                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
753                 if (!key)
754                         goto out;
755
756
757                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
758                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
759                         goto out;
760
761         }
762
763         if (key) {
764                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
765                                    (TCPOPT_NOP << 16) |
766                                    (TCPOPT_MD5SIG << 8) |
767                                    TCPOLEN_MD5SIG);
768                 /* Update length and the length the header thinks exists */
769                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
770                 rep.th.doff = arg.iov[0].iov_len / 4;
771
772                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
773                                      key, ip_hdr(skb)->saddr,
774                                      ip_hdr(skb)->daddr, &rep.th);
775         }
776 #endif
777         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
778         if (rep.opt[0] == 0) {
779                 __be32 mrst = mptcp_reset_option(skb);
780
781                 if (mrst) {
782                         rep.opt[0] = mrst;
783                         arg.iov[0].iov_len += sizeof(mrst);
784                         rep.th.doff = arg.iov[0].iov_len / 4;
785                 }
786         }
787
788         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
789                                       ip_hdr(skb)->saddr, /* XXX */
790                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
791         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
792         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
793
794         /* When the socket is gone, all binding information is lost and
795          * routing might fail in this case. No choice here: if we force the
796          * input interface, we will misroute in the case of an asymmetric route.
797          */
798         if (sk) {
799                 arg.bound_dev_if = sk->sk_bound_dev_if;
800                 if (sk_fullsock(sk))
801                         trace_tcp_send_reset(sk, skb);
802         }
803
804         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
805                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
806
807         arg.tos = ip_hdr(skb)->tos;
808         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
809         local_bh_disable();
810         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
811         if (sk) {
812                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
813                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
814                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
815                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
816                 transmit_time = tcp_transmit_time(sk);
817         }
818         ip_send_unicast_reply(ctl_sk,
819                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
820                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
821                               &arg, arg.iov[0].iov_len,
822                               transmit_time);
823
824         ctl_sk->sk_mark = 0;
825         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
826         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
827         local_bh_enable();
828
829 #ifdef CONFIG_TCP_MD5SIG
830 out:
831         rcu_read_unlock();
832 #endif
833 }
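
/* Sequence number choice in the RST above follows RFC 793: if the offending
 * segment carried an ACK, the RST is sent with SEQ = SEG.ACK and no ACK bit;
 * otherwise SEQ stays 0 and the RST acknowledges SEG.SEQ plus the length of
 * the segment (counting SYN/FIN), so the peer will accept it.
 */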
834
835 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
836    outside of socket context, is certainly ugly. What can I do?
837  */
838
839 static void tcp_v4_send_ack(const struct sock *sk,
840                             struct sk_buff *skb, u32 seq, u32 ack,
841                             u32 win, u32 tsval, u32 tsecr, int oif,
842                             struct tcp_md5sig_key *key,
843                             int reply_flags, u8 tos)
844 {
845         const struct tcphdr *th = tcp_hdr(skb);
846         struct {
847                 struct tcphdr th;
848                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
849 #ifdef CONFIG_TCP_MD5SIG
850                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
851 #endif
852                         ];
853         } rep;
854         struct net *net = sock_net(sk);
855         struct ip_reply_arg arg;
856         struct sock *ctl_sk;
857         u64 transmit_time;
858
859         memset(&rep.th, 0, sizeof(struct tcphdr));
860         memset(&arg, 0, sizeof(arg));
861
862         arg.iov[0].iov_base = (unsigned char *)&rep;
863         arg.iov[0].iov_len  = sizeof(rep.th);
864         if (tsecr) {
865                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
866                                    (TCPOPT_TIMESTAMP << 8) |
867                                    TCPOLEN_TIMESTAMP);
868                 rep.opt[1] = htonl(tsval);
869                 rep.opt[2] = htonl(tsecr);
870                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
871         }
872
873         /* Swap the send and the receive. */
874         rep.th.dest    = th->source;
875         rep.th.source  = th->dest;
876         rep.th.doff    = arg.iov[0].iov_len / 4;
877         rep.th.seq     = htonl(seq);
878         rep.th.ack_seq = htonl(ack);
879         rep.th.ack     = 1;
880         rep.th.window  = htons(win);
881
882 #ifdef CONFIG_TCP_MD5SIG
883         if (key) {
884                 int offset = (tsecr) ? 3 : 0;
885
886                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
887                                           (TCPOPT_NOP << 16) |
888                                           (TCPOPT_MD5SIG << 8) |
889                                           TCPOLEN_MD5SIG);
890                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
891                 rep.th.doff = arg.iov[0].iov_len/4;
892
893                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
894                                     key, ip_hdr(skb)->saddr,
895                                     ip_hdr(skb)->daddr, &rep.th);
896         }
897 #endif
898         arg.flags = reply_flags;
899         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
900                                       ip_hdr(skb)->saddr, /* XXX */
901                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
902         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
903         if (oif)
904                 arg.bound_dev_if = oif;
905         arg.tos = tos;
906         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
907         local_bh_disable();
908         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
909         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
910                            inet_twsk(sk)->tw_mark : sk->sk_mark;
911         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
912                            inet_twsk(sk)->tw_priority : sk->sk_priority;
913         transmit_time = tcp_transmit_time(sk);
914         ip_send_unicast_reply(ctl_sk,
915                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
916                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
917                               &arg, arg.iov[0].iov_len,
918                               transmit_time);
919
920         ctl_sk->sk_mark = 0;
921         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
922         local_bh_enable();
923 }
924
925 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
926 {
927         struct inet_timewait_sock *tw = inet_twsk(sk);
928         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
929
930         tcp_v4_send_ack(sk, skb,
931                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
932                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
933                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
934                         tcptw->tw_ts_recent,
935                         tw->tw_bound_dev_if,
936                         tcp_twsk_md5_key(tcptw),
937                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
938                         tw->tw_tos
939                         );
940
941         inet_twsk_put(tw);
942 }
943
944 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
945                                   struct request_sock *req)
946 {
947         const union tcp_md5_addr *addr;
948         int l3index;
949
950         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
951          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
952          */
953         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
954                                              tcp_sk(sk)->snd_nxt;
955
956         /* RFC 7323 2.3
957          * The window field (SEG.WND) of every outgoing segment, with the
958          * exception of <SYN> segments, MUST be right-shifted by
959          * Rcv.Wind.Shift bits:
960          */
961         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
962         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
963         tcp_v4_send_ack(sk, skb, seq,
964                         tcp_rsk(req)->rcv_nxt,
965                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
966                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
967                         req->ts_recent,
968                         0,
969                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
970                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
971                         ip_hdr(skb)->tos);
972 }
973
974 /*
975  *      Send a SYN-ACK after having received a SYN.
976  *      This still operates on a request_sock only, not on a big
977  *      socket.
978  */
979 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
980                               struct flowi *fl,
981                               struct request_sock *req,
982                               struct tcp_fastopen_cookie *foc,
983                               enum tcp_synack_type synack_type,
984                               struct sk_buff *syn_skb)
985 {
986         const struct inet_request_sock *ireq = inet_rsk(req);
987         struct flowi4 fl4;
988         int err = -1;
989         struct sk_buff *skb;
990         u8 tos;
991
992         /* First, grab a route. */
993         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
994                 return -1;
995
996         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
997
998         if (skb) {
999                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1000
1001                 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1002                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1003                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1004                                 inet_sk(sk)->tos;
1005
1006                 if (!INET_ECN_is_capable(tos) &&
1007                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1008                         tos |= INET_ECN_ECT_0;
1009
1010                 rcu_read_lock();
1011                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1012                                             ireq->ir_rmt_addr,
1013                                             rcu_dereference(ireq->ireq_opt),
1014                                             tos);
1015                 rcu_read_unlock();
1016                 err = net_xmit_eval(err);
1017         }
1018
1019         return err;
1020 }
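
/* TOS selection above: with net.ipv4.tcp_reflect_tos enabled the SYN-ACK
 * reuses the DSCP bits remembered from the incoming SYN while keeping the
 * listener's own ECN bits; otherwise the listener's TOS is used unchanged,
 * with ECT(0) forced only when a BPF congestion control asks for ECN.
 */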
1021
1022 /*
1023  *      IPv4 request_sock destructor.
1024  */
1025 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1026 {
1027         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1028 }
1029
1030 #ifdef CONFIG_TCP_MD5SIG
1031 /*
1032  * RFC2385 MD5 checksumming requires a mapping of
1033  * IP address->MD5 Key.
1034  * We need to maintain these in the sk structure.
1035  */
1036
1037 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1038 EXPORT_SYMBOL(tcp_md5_needed);
1039
1040 /* Find the Key structure for an address.  */
1041 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1042                                            const union tcp_md5_addr *addr,
1043                                            int family)
1044 {
1045         const struct tcp_sock *tp = tcp_sk(sk);
1046         struct tcp_md5sig_key *key;
1047         const struct tcp_md5sig_info *md5sig;
1048         __be32 mask;
1049         struct tcp_md5sig_key *best_match = NULL;
1050         bool match;
1051
1052         /* caller either holds rcu_read_lock() or socket lock */
1053         md5sig = rcu_dereference_check(tp->md5sig_info,
1054                                        lockdep_sock_is_held(sk));
1055         if (!md5sig)
1056                 return NULL;
1057
1058         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1059                                  lockdep_sock_is_held(sk)) {
1060                 if (key->family != family)
1061                         continue;
1062                 if (key->l3index && key->l3index != l3index)
1063                         continue;
1064                 if (family == AF_INET) {
1065                         mask = inet_make_mask(key->prefixlen);
1066                         match = (key->addr.a4.s_addr & mask) ==
1067                                 (addr->a4.s_addr & mask);
1068 #if IS_ENABLED(CONFIG_IPV6)
1069                 } else if (family == AF_INET6) {
1070                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1071                                                   key->prefixlen);
1072 #endif
1073                 } else {
1074                         match = false;
1075                 }
1076
1077                 if (match && (!best_match ||
1078                               key->prefixlen > best_match->prefixlen))
1079                         best_match = key;
1080         }
1081         return best_match;
1082 }
1083 EXPORT_SYMBOL(__tcp_md5_do_lookup);
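
/* Keys may be installed for an address prefix rather than a single peer, so
 * the lookup above keeps scanning and returns the matching key with the
 * longest prefix; a key bound to an L3 domain (non-zero l3index) only
 * matches packets arriving from that domain.
 */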
1084
1085 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1086                                                       const union tcp_md5_addr *addr,
1087                                                       int family, u8 prefixlen,
1088                                                       int l3index)
1089 {
1090         const struct tcp_sock *tp = tcp_sk(sk);
1091         struct tcp_md5sig_key *key;
1092         unsigned int size = sizeof(struct in_addr);
1093         const struct tcp_md5sig_info *md5sig;
1094
1095         /* caller either holds rcu_read_lock() or socket lock */
1096         md5sig = rcu_dereference_check(tp->md5sig_info,
1097                                        lockdep_sock_is_held(sk));
1098         if (!md5sig)
1099                 return NULL;
1100 #if IS_ENABLED(CONFIG_IPV6)
1101         if (family == AF_INET6)
1102                 size = sizeof(struct in6_addr);
1103 #endif
1104         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1105                                  lockdep_sock_is_held(sk)) {
1106                 if (key->family != family)
1107                         continue;
1108                 if (key->l3index && key->l3index != l3index)
1109                         continue;
1110                 if (!memcmp(&key->addr, addr, size) &&
1111                     key->prefixlen == prefixlen)
1112                         return key;
1113         }
1114         return NULL;
1115 }
1116
1117 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1118                                          const struct sock *addr_sk)
1119 {
1120         const union tcp_md5_addr *addr;
1121         int l3index;
1122
1123         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1124                                                  addr_sk->sk_bound_dev_if);
1125         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1126         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1127 }
1128 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1129
1130 /* This can be called on a newly created socket, from other files */
1131 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1132                    int family, u8 prefixlen, int l3index,
1133                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1134 {
1135         /* Add Key to the list */
1136         struct tcp_md5sig_key *key;
1137         struct tcp_sock *tp = tcp_sk(sk);
1138         struct tcp_md5sig_info *md5sig;
1139
1140         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1141         if (key) {
1142                 /* Pre-existing entry - just update that one.
1143                  * Note that the key might be used concurrently.
1144                  * data_race() is telling KCSAN that we do not care about
1145                  * key mismatches, since changing the MD5 key on live flows
1146                  * can lead to packet drops.
1147                  */
1148                 data_race(memcpy(key->key, newkey, newkeylen));
1149
1150                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1151                  * Also note that a reader could catch new key->keylen value
1152                  * but old key->key[], this is the reason we use __GFP_ZERO
1153                  * at sock_kmalloc() time below these lines.
1154                  */
1155                 WRITE_ONCE(key->keylen, newkeylen);
1156
1157                 return 0;
1158         }
1159
1160         md5sig = rcu_dereference_protected(tp->md5sig_info,
1161                                            lockdep_sock_is_held(sk));
1162         if (!md5sig) {
1163                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1164                 if (!md5sig)
1165                         return -ENOMEM;
1166
1167                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1168                 INIT_HLIST_HEAD(&md5sig->head);
1169                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1170         }
1171
1172         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1173         if (!key)
1174                 return -ENOMEM;
1175         if (!tcp_alloc_md5sig_pool()) {
1176                 sock_kfree_s(sk, key, sizeof(*key));
1177                 return -ENOMEM;
1178         }
1179
1180         memcpy(key->key, newkey, newkeylen);
1181         key->keylen = newkeylen;
1182         key->family = family;
1183         key->prefixlen = prefixlen;
1184         key->l3index = l3index;
1185         memcpy(&key->addr, addr,
1186                (family == AF_INET6) ? sizeof(struct in6_addr) :
1187                                       sizeof(struct in_addr));
1188         hlist_add_head_rcu(&key->node, &md5sig->head);
1189         return 0;
1190 }
1191 EXPORT_SYMBOL(tcp_md5_do_add);
1192
1193 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1194                    u8 prefixlen, int l3index)
1195 {
1196         struct tcp_md5sig_key *key;
1197
1198         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1199         if (!key)
1200                 return -ENOENT;
1201         hlist_del_rcu(&key->node);
1202         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1203         kfree_rcu(key, rcu);
1204         return 0;
1205 }
1206 EXPORT_SYMBOL(tcp_md5_do_del);
1207
1208 static void tcp_clear_md5_list(struct sock *sk)
1209 {
1210         struct tcp_sock *tp = tcp_sk(sk);
1211         struct tcp_md5sig_key *key;
1212         struct hlist_node *n;
1213         struct tcp_md5sig_info *md5sig;
1214
1215         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1216
1217         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1218                 hlist_del_rcu(&key->node);
1219                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1220                 kfree_rcu(key, rcu);
1221         }
1222 }
1223
1224 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1225                                  sockptr_t optval, int optlen)
1226 {
1227         struct tcp_md5sig cmd;
1228         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1229         const union tcp_md5_addr *addr;
1230         u8 prefixlen = 32;
1231         int l3index = 0;
1232
1233         if (optlen < sizeof(cmd))
1234                 return -EINVAL;
1235
1236         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1237                 return -EFAULT;
1238
1239         if (sin->sin_family != AF_INET)
1240                 return -EINVAL;
1241
1242         if (optname == TCP_MD5SIG_EXT &&
1243             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1244                 prefixlen = cmd.tcpm_prefixlen;
1245                 if (prefixlen > 32)
1246                         return -EINVAL;
1247         }
1248
1249         if (optname == TCP_MD5SIG_EXT &&
1250             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1251                 struct net_device *dev;
1252
1253                 rcu_read_lock();
1254                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1255                 if (dev && netif_is_l3_master(dev))
1256                         l3index = dev->ifindex;
1257
1258                 rcu_read_unlock();
1259
1260                 /* ok to reference set/not set outside of rcu;
1261                  * right now device MUST be an L3 master
1262                  */
1263                 if (!dev || !l3index)
1264                         return -EINVAL;
1265         }
1266
1267         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1268
1269         if (!cmd.tcpm_keylen)
1270                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1271
1272         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1273                 return -EINVAL;
1274
1275         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1276                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1277 }
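
/* Illustration (userspace, not part of this file): installing a key that
 * ends up in the parser above:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key for that address instead.
 */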
1278
1279 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1280                                    __be32 daddr, __be32 saddr,
1281                                    const struct tcphdr *th, int nbytes)
1282 {
1283         struct tcp4_pseudohdr *bp;
1284         struct scatterlist sg;
1285         struct tcphdr *_th;
1286
1287         bp = hp->scratch;
1288         bp->saddr = saddr;
1289         bp->daddr = daddr;
1290         bp->pad = 0;
1291         bp->protocol = IPPROTO_TCP;
1292         bp->len = cpu_to_be16(nbytes);
1293
1294         _th = (struct tcphdr *)(bp + 1);
1295         memcpy(_th, th, sizeof(*th));
1296         _th->check = 0;
1297
1298         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1299         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1300                                 sizeof(*bp) + sizeof(*th));
1301         return crypto_ahash_update(hp->md5_req);
1302 }
1303
1304 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1305                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1306 {
1307         struct tcp_md5sig_pool *hp;
1308         struct ahash_request *req;
1309
1310         hp = tcp_get_md5sig_pool();
1311         if (!hp)
1312                 goto clear_hash_noput;
1313         req = hp->md5_req;
1314
1315         if (crypto_ahash_init(req))
1316                 goto clear_hash;
1317         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1318                 goto clear_hash;
1319         if (tcp_md5_hash_key(hp, key))
1320                 goto clear_hash;
1321         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1322         if (crypto_ahash_final(req))
1323                 goto clear_hash;
1324
1325         tcp_put_md5sig_pool();
1326         return 0;
1327
1328 clear_hash:
1329         tcp_put_md5sig_pool();
1330 clear_hash_noput:
1331         memset(md5_hash, 0, 16);
1332         return 1;
1333 }
1334
1335 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1336                         const struct sock *sk,
1337                         const struct sk_buff *skb)
1338 {
1339         struct tcp_md5sig_pool *hp;
1340         struct ahash_request *req;
1341         const struct tcphdr *th = tcp_hdr(skb);
1342         __be32 saddr, daddr;
1343
1344         if (sk) { /* valid for establish/request sockets */
1345                 saddr = sk->sk_rcv_saddr;
1346                 daddr = sk->sk_daddr;
1347         } else {
1348                 const struct iphdr *iph = ip_hdr(skb);
1349                 saddr = iph->saddr;
1350                 daddr = iph->daddr;
1351         }
1352
1353         hp = tcp_get_md5sig_pool();
1354         if (!hp)
1355                 goto clear_hash_noput;
1356         req = hp->md5_req;
1357
1358         if (crypto_ahash_init(req))
1359                 goto clear_hash;
1360
1361         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1362                 goto clear_hash;
1363         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1364                 goto clear_hash;
1365         if (tcp_md5_hash_key(hp, key))
1366                 goto clear_hash;
1367         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1368         if (crypto_ahash_final(req))
1369                 goto clear_hash;
1370
1371         tcp_put_md5sig_pool();
1372         return 0;
1373
1374 clear_hash:
1375         tcp_put_md5sig_pool();
1376 clear_hash_noput:
1377         memset(md5_hash, 0, 16);
1378         return 1;
1379 }
1380 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
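
/* The digest above covers, in order, the IPv4 pseudo-header, the base TCP
 * header with its checksum field zeroed, the segment payload (options are
 * skipped) and finally the key itself, which is the RFC 2385 layout.
 */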
1381
1382 #endif
1383
1384 /* Called with rcu_read_lock() */
1385 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1386                                     const struct sk_buff *skb,
1387                                     int dif, int sdif)
1388 {
1389 #ifdef CONFIG_TCP_MD5SIG
1390         /*
1391          * This gets called for each TCP segment that arrives
1392          * so we want to be efficient.
1393          * We have 3 drop cases:
1394          * o No MD5 hash and one expected.
1395          * o MD5 hash and we're not expecting one.
1396          * o MD5 hash and it's wrong.
1397          */
1398         const __u8 *hash_location = NULL;
1399         struct tcp_md5sig_key *hash_expected;
1400         const struct iphdr *iph = ip_hdr(skb);
1401         const struct tcphdr *th = tcp_hdr(skb);
1402         const union tcp_md5_addr *addr;
1403         unsigned char newhash[16];
1404         int genhash, l3index;
1405
1406         /* If sdif is set, the packet ingressed via a device in an L3
1407          * domain and dif has been set to the l3mdev ifindex.
1408          */
1409         l3index = sdif ? dif : 0;
1410
1411         addr = (union tcp_md5_addr *)&iph->saddr;
1412         hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1413         hash_location = tcp_parse_md5sig_option(th);
1414
1415         /* We've parsed the options - do we have a hash? */
1416         if (!hash_expected && !hash_location)
1417                 return false;
1418
1419         if (hash_expected && !hash_location) {
1420                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1421                 return true;
1422         }
1423
1424         if (!hash_expected && hash_location) {
1425                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1426                 return true;
1427         }
1428
1429         /* Okay, both hash_expected and hash_location are set -
1430          * we need to calculate the hash and compare.
1431          */
1432         genhash = tcp_v4_md5_hash_skb(newhash,
1433                                       hash_expected,
1434                                       NULL, skb);
1435
1436         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1437                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1438                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1439                                      &iph->saddr, ntohs(th->source),
1440                                      &iph->daddr, ntohs(th->dest),
1441                                      genhash ? " tcp_v4_calc_md5_hash failed"
1442                                      : "", l3index);
1443                 return true;
1444         }
1445         return false;
1446 #endif
1447         return false;
1448 }
1449
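/* Fill in the IPv4 specific parts of a freshly minted request sock:
 * addresses come from the incoming SYN's IP header and any IP options
 * are saved for the SYN-ACK and the eventual child socket.
 */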
1450 static void tcp_v4_init_req(struct request_sock *req,
1451                             const struct sock *sk_listener,
1452                             struct sk_buff *skb)
1453 {
1454         struct inet_request_sock *ireq = inet_rsk(req);
1455         struct net *net = sock_net(sk_listener);
1456
1457         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1458         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1459         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1460 }
1461
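/* Initialize the request sock, run the security_inet_conn_request()
 * LSM hook and route the SYN-ACK via inet_csk_route_req().  Returns
 * NULL if the hook rejects the request or no route can be found.
 */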
1462 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1463                                           struct sk_buff *skb,
1464                                           struct flowi *fl,
1465                                           struct request_sock *req)
1466 {
1467         tcp_v4_init_req(req, sk, skb);
1468
1469         if (security_inet_conn_request(sk, skb, req))
1470                 return NULL;
1471
1472         return inet_csk_route_req(sk, &fl->u.ip4, req);
1473 }
1474
1475 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1476         .family         =       PF_INET,
1477         .obj_size       =       sizeof(struct tcp_request_sock),
1478         .rtx_syn_ack    =       tcp_rtx_synack,
1479         .send_ack       =       tcp_v4_reqsk_send_ack,
1480         .destructor     =       tcp_v4_reqsk_destructor,
1481         .send_reset     =       tcp_v4_send_reset,
1482         .syn_ack_timeout =      tcp_syn_ack_timeout,
1483 };
1484
1485 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1486         .mss_clamp      =       TCP_MSS_DEFAULT,
1487 #ifdef CONFIG_TCP_MD5SIG
1488         .req_md5_lookup =       tcp_v4_md5_lookup,
1489         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1490 #endif
1491 #ifdef CONFIG_SYN_COOKIES
1492         .cookie_init_seq =      cookie_v4_init_sequence,
1493 #endif
1494         .route_req      =       tcp_v4_route_req,
1495         .init_seq       =       tcp_v4_init_seq,
1496         .init_ts_off    =       tcp_v4_init_ts_off,
1497         .send_synack    =       tcp_v4_send_synack,
1498 };
1499
1500 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1501 {
1502         /* Never answer SYNs sent to broadcast or multicast addresses */
1503         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1504                 goto drop;
1505
1506         return tcp_conn_request(&tcp_request_sock_ops,
1507                                 &tcp_request_sock_ipv4_ops, sk, skb);
1508
1509 drop:
1510         tcp_listendrop(sk);
1511         return 0;
1512 }
1513 EXPORT_SYMBOL(tcp_v4_conn_request);
1514
1515
1516 /*
1517  * The three way handshake has completed - we got a valid synack -
1518  * now create the new socket.
1519  */
1520 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1521                                   struct request_sock *req,
1522                                   struct dst_entry *dst,
1523                                   struct request_sock *req_unhash,
1524                                   bool *own_req)
1525 {
1526         struct inet_request_sock *ireq;
1527         bool found_dup_sk = false;
1528         struct inet_sock *newinet;
1529         struct tcp_sock *newtp;
1530         struct sock *newsk;
1531 #ifdef CONFIG_TCP_MD5SIG
1532         const union tcp_md5_addr *addr;
1533         struct tcp_md5sig_key *key;
1534         int l3index;
1535 #endif
1536         struct ip_options_rcu *inet_opt;
1537
1538         if (sk_acceptq_is_full(sk))
1539                 goto exit_overflow;
1540
1541         newsk = tcp_create_openreq_child(sk, req, skb);
1542         if (!newsk)
1543                 goto exit_nonewsk;
1544
1545         newsk->sk_gso_type = SKB_GSO_TCPV4;
1546         inet_sk_rx_dst_set(newsk, skb);
1547
1548         newtp                 = tcp_sk(newsk);
1549         newinet               = inet_sk(newsk);
1550         ireq                  = inet_rsk(req);
1551         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1552         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1553         newsk->sk_bound_dev_if = ireq->ir_iif;
1554         newinet->inet_saddr   = ireq->ir_loc_addr;
1555         inet_opt              = rcu_dereference(ireq->ireq_opt);
1556         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1557         newinet->mc_index     = inet_iif(skb);
1558         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1559         newinet->rcv_tos      = ip_hdr(skb)->tos;
1560         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1561         if (inet_opt)
1562                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1563         newinet->inet_id = prandom_u32();
1564
1565         /* Set ToS of the new socket based upon the value of incoming SYN.
1566          * ECT bits are set later in tcp_init_transfer().
1567          */
1568         if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1569                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1570
1571         if (!dst) {
1572                 dst = inet_csk_route_child_sock(sk, newsk, req);
1573                 if (!dst)
1574                         goto put_and_exit;
1575         } else {
1576                 /* syncookie case : see end of cookie_v4_check() */
1577         }
1578         sk_setup_caps(newsk, dst);
1579
1580         tcp_ca_openreq_child(newsk, dst);
1581
1582         tcp_sync_mss(newsk, dst_mtu(dst));
1583         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1584
1585         tcp_initialize_rcv_mss(newsk);
1586
1587 #ifdef CONFIG_TCP_MD5SIG
1588         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1589         /* Copy over the MD5 key from the original socket */
1590         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1591         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1592         if (key) {
1593                 /*
1594                  * We're using one, so create a matching key
1595                  * on the newsk structure. If we fail to get
1596                  * memory, then we end up not copying the key
1597                  * across. Shucks.
1598                  */
1599                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1600                                key->key, key->keylen, GFP_ATOMIC);
1601                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1602         }
1603 #endif
1604
1605         if (__inet_inherit_port(sk, newsk) < 0)
1606                 goto put_and_exit;
1607         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1608                                        &found_dup_sk);
1609         if (likely(*own_req)) {
1610                 tcp_move_syn(newtp, req);
1611                 ireq->ireq_opt = NULL;
1612         } else {
1613                 newinet->inet_opt = NULL;
1614
1615                 if (!req_unhash && found_dup_sk) {
1616                         /* This code path should only be executed in the
1617                          * syncookie case
1618                          */
1619                         bh_unlock_sock(newsk);
1620                         sock_put(newsk);
1621                         newsk = NULL;
1622                 }
1623         }
1624         return newsk;
1625
1626 exit_overflow:
1627         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1628 exit_nonewsk:
1629         dst_release(dst);
1630 exit:
1631         tcp_listendrop(sk);
1632         return NULL;
1633 put_and_exit:
1634         newinet->inet_opt = NULL;
1635         inet_csk_prepare_forced_close(newsk);
1636         tcp_done(newsk);
1637         goto exit;
1638 }
1639 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1640
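/* For a listener: if the segment is not a SYN it may be the ACK that
 * completes a syncookie handshake, so let cookie_v4_check() validate
 * it and possibly create the child socket.  With CONFIG_SYN_COOKIES
 * disabled this is a no-op and the listener is returned unchanged.
 */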
1641 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1642 {
1643 #ifdef CONFIG_SYN_COOKIES
1644         const struct tcphdr *th = tcp_hdr(skb);
1645
1646         if (!th->syn)
1647                 sk = cookie_v4_check(sk, skb);
1648 #endif
1649         return sk;
1650 }
1651
1652 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1653                          struct tcphdr *th, u32 *cookie)
1654 {
1655         u16 mss = 0;
1656 #ifdef CONFIG_SYN_COOKIES
1657         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1658                                     &tcp_request_sock_ipv4_ops, sk, th);
1659         if (mss) {
1660                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1661                 tcp_synq_overflow(sk);
1662         }
1663 #endif
1664         return mss;
1665 }
1666
1667 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1668                                                            u32));
1669 /* The socket must have its spinlock held when we get
1670  * here, unless it is a TCP_LISTEN socket.
1671  *
1672  * We have a potential double-lock case here, so even when
1673  * doing backlog processing we use the BH locking scheme.
1674  * This is because we cannot sleep with the original spinlock
1675  * held.
1676  */
1677 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1678 {
1679         struct sock *rsk;
1680
1681         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1682                 struct dst_entry *dst = sk->sk_rx_dst;
1683
1684                 sock_rps_save_rxhash(sk, skb);
1685                 sk_mark_napi_id(sk, skb);
1686                 if (dst) {
1687                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1688                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1689                                              dst, 0)) {
1690                                 dst_release(dst);
1691                                 sk->sk_rx_dst = NULL;
1692                         }
1693                 }
1694                 tcp_rcv_established(sk, skb);
1695                 return 0;
1696         }
1697
1698         if (tcp_checksum_complete(skb))
1699                 goto csum_err;
1700
1701         if (sk->sk_state == TCP_LISTEN) {
1702                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1703
1704                 if (!nsk)
1705                         goto discard;
1706                 if (nsk != sk) {
1707                         if (tcp_child_process(sk, nsk, skb)) {
1708                                 rsk = nsk;
1709                                 goto reset;
1710                         }
1711                         return 0;
1712                 }
1713         } else
1714                 sock_rps_save_rxhash(sk, skb);
1715
1716         if (tcp_rcv_state_process(sk, skb)) {
1717                 rsk = sk;
1718                 goto reset;
1719         }
1720         return 0;
1721
1722 reset:
1723         tcp_v4_send_reset(rsk, skb);
1724 discard:
1725         kfree_skb(skb);
1726         /* Be careful here. If this function gets more complicated and
1727          * gcc suffers from register pressure on the x86, sk (in %ebx)
1728          * might be destroyed here. This current version compiles correctly,
1729          * but you have been warned.
1730          */
1731         return 0;
1732
1733 csum_err:
1734         trace_tcp_bad_csum(skb);
1735         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1736         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1737         goto discard;
1738 }
1739 EXPORT_SYMBOL(tcp_v4_do_rcv);
1740
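/* Early demux, called from the IP receive path before routing:
 * look up an established socket and, if found, attach it to the skb
 * and reuse the dst cached on the socket, avoiding a route lookup.
 */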
1741 int tcp_v4_early_demux(struct sk_buff *skb)
1742 {
1743         const struct iphdr *iph;
1744         const struct tcphdr *th;
1745         struct sock *sk;
1746
1747         if (skb->pkt_type != PACKET_HOST)
1748                 return 0;
1749
1750         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1751                 return 0;
1752
1753         iph = ip_hdr(skb);
1754         th = tcp_hdr(skb);
1755
1756         if (th->doff < sizeof(struct tcphdr) / 4)
1757                 return 0;
1758
1759         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1760                                        iph->saddr, th->source,
1761                                        iph->daddr, ntohs(th->dest),
1762                                        skb->skb_iif, inet_sdif(skb));
1763         if (sk) {
1764                 skb->sk = sk;
1765                 skb->destructor = sock_edemux;
1766                 if (sk_fullsock(sk)) {
1767                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1768
1769                         if (dst)
1770                                 dst = dst_check(dst, 0);
1771                         if (dst &&
1772                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1773                                 skb_dst_set_noref(skb, dst);
1774                 }
1775         }
1776         return 0;
1777 }
1778
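/* Queue an skb onto the backlog of a socket currently owned by user
 * context.  The checksum is verified, coalescing with the backlog tail
 * is attempted, and the segment is dropped once the backlog grows past
 * sk_rcvbuf + sk_sndbuf plus a small headroom.  Returns true when the
 * skb was dropped (the socket has then already been unlocked).
 */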
1779 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1780 {
1781         u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1782         u32 tail_gso_size, tail_gso_segs;
1783         struct skb_shared_info *shinfo;
1784         const struct tcphdr *th;
1785         struct tcphdr *thtail;
1786         struct sk_buff *tail;
1787         unsigned int hdrlen;
1788         bool fragstolen;
1789         u32 gso_segs;
1790         u32 gso_size;
1791         int delta;
1792
1793         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1794          * we can fix skb->truesize to its real value to avoid future drops.
1795          * This is valid because skb is not yet charged to the socket.
1796          * It has been noticed that pure SACK packets were sometimes dropped
1797          * (if cooked by drivers without the copybreak feature).
1798          */
1799         skb_condense(skb);
1800
1801         skb_dst_drop(skb);
1802
1803         if (unlikely(tcp_checksum_complete(skb))) {
1804                 bh_unlock_sock(sk);
1805                 trace_tcp_bad_csum(skb);
1806                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1807                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1808                 return true;
1809         }
1810
1811         /* Attempt coalescing to last skb in backlog, even if we are
1812          * above the limits.
1813          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1814          */
1815         th = (const struct tcphdr *)skb->data;
1816         hdrlen = th->doff * 4;
1817
1818         tail = sk->sk_backlog.tail;
1819         if (!tail)
1820                 goto no_coalesce;
1821         thtail = (struct tcphdr *)tail->data;
1822
1823         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1824             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1825             ((TCP_SKB_CB(tail)->tcp_flags |
1826               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1827             !((TCP_SKB_CB(tail)->tcp_flags &
1828               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1829             ((TCP_SKB_CB(tail)->tcp_flags ^
1830               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1831 #ifdef CONFIG_TLS_DEVICE
1832             tail->decrypted != skb->decrypted ||
1833 #endif
1834             thtail->doff != th->doff ||
1835             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1836                 goto no_coalesce;
1837
1838         __skb_pull(skb, hdrlen);
1839
1840         shinfo = skb_shinfo(skb);
1841         gso_size = shinfo->gso_size ?: skb->len;
1842         gso_segs = shinfo->gso_segs ?: 1;
1843
1844         shinfo = skb_shinfo(tail);
1845         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1846         tail_gso_segs = shinfo->gso_segs ?: 1;
1847
1848         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1849                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1850
1851                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1852                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1853                         thtail->window = th->window;
1854                 }
1855
1856                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1857                  * thtail->fin, so that the fast path in tcp_rcv_established()
1858                  * is not entered if we append a packet with a FIN.
1859                  * SYN, RST, URG are not present.
1860                  * ACK is set on both packets.
1861                  * PSH : we do not really care in TCP stack,
1862                  *       at least for 'GRO' packets.
1863                  */
1864                 thtail->fin |= th->fin;
1865                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1866
1867                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1868                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1869                         tail->tstamp = skb->tstamp;
1870                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1871                 }
1872
1873                 /* Not as strict as GRO. We only need to carry mss max value */
1874                 shinfo->gso_size = max(gso_size, tail_gso_size);
1875                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1876
1877                 sk->sk_backlog.len += delta;
1878                 __NET_INC_STATS(sock_net(sk),
1879                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1880                 kfree_skb_partial(skb, fragstolen);
1881                 return false;
1882         }
1883         __skb_push(skb, hdrlen);
1884
1885 no_coalesce:
1886         /* Only the socket owner can try to collapse/prune rx queues
1887          * to reduce memory overhead, so add a little headroom here.
1888          * Only a few socket backlogs are likely to be non-empty at once.
1889          */
1890         limit += 64*1024;
1891
1892         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1893                 bh_unlock_sock(sk);
1894                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1895                 return true;
1896         }
1897         return false;
1898 }
1899 EXPORT_SYMBOL(tcp_add_backlog);
1900
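/* Run any attached socket filter (BPF) on the segment.  The filter may
 * trim the skb, but never below the TCP header (th->doff * 4 bytes).
 * A non-zero return means the segment must be dropped.
 */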
1901 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1902 {
1903         struct tcphdr *th = (struct tcphdr *)skb->data;
1904
1905         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1906 }
1907 EXPORT_SYMBOL(tcp_filter);
1908
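/* Undo tcp_v4_fill_cb(): move the IP control block back to its normal
 * location so the skb can be handed to IP/socket code again (e.g.
 * after a new socket lookup).
 */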
1909 static void tcp_v4_restore_cb(struct sk_buff *skb)
1910 {
1911         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1912                 sizeof(struct inet_skb_parm));
1913 }
1914
1915 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1916                            const struct tcphdr *th)
1917 {
1918         /* This is tricky: we move the IPCB to its correct place inside TCP_SKB_CB();
1919          * barrier() makes sure the compiler won't play fool^Waliasing games.
1920          */
1921         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1922                 sizeof(struct inet_skb_parm));
1923         barrier();
1924
1925         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1926         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1927                                     skb->len - th->doff * 4);
1928         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1929         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1930         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1931         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1932         TCP_SKB_CB(skb)->sacked  = 0;
1933         TCP_SKB_CB(skb)->has_rxtstamp =
1934                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1935 }
1936
1937 /*
1938  *      From tcp_input.c
1939  */
1940
1941 int tcp_v4_rcv(struct sk_buff *skb)
1942 {
1943         struct net *net = dev_net(skb->dev);
1944         struct sk_buff *skb_to_free;
1945         int sdif = inet_sdif(skb);
1946         int dif = inet_iif(skb);
1947         const struct iphdr *iph;
1948         const struct tcphdr *th;
1949         bool refcounted;
1950         struct sock *sk;
1951         int ret;
1952
1953         if (skb->pkt_type != PACKET_HOST)
1954                 goto discard_it;
1955
1956         /* Count it even if it's bad */
1957         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1958
1959         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1960                 goto discard_it;
1961
1962         th = (const struct tcphdr *)skb->data;
1963
1964         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1965                 goto bad_packet;
1966         if (!pskb_may_pull(skb, th->doff * 4))
1967                 goto discard_it;
1968
1969         /* An explanation is required here, I think.
1970          * Packet length and doff are validated by header prediction,
1971          * provided the case of th->doff==0 has been eliminated above.
1972          * So we defer those checks. */
1973
1974         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1975                 goto csum_error;
1976
1977         th = (const struct tcphdr *)skb->data;
1978         iph = ip_hdr(skb);
1979 lookup:
1980         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1981                                th->dest, sdif, &refcounted);
1982         if (!sk)
1983                 goto no_tcp_socket;
1984
1985 process:
1986         if (sk->sk_state == TCP_TIME_WAIT)
1987                 goto do_time_wait;
1988
1989         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1990                 struct request_sock *req = inet_reqsk(sk);
1991                 bool req_stolen = false;
1992                 struct sock *nsk;
1993
1994                 sk = req->rsk_listener;
1995                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1996                         sk_drops_add(sk, skb);
1997                         reqsk_put(req);
1998                         goto discard_it;
1999                 }
2000                 if (tcp_checksum_complete(skb)) {
2001                         reqsk_put(req);
2002                         goto csum_error;
2003                 }
2004                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2005                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2006                         if (!nsk) {
2007                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2008                                 goto lookup;
2009                         }
2010                         sk = nsk;
2011                         /* reuseport_migrate_sock() has already taken a reference
2012                          * on the socket before returning.
2013                          */
2014                 } else {
2015                         /* We own a reference on the listener, increase it again
2016                          * as we might lose it too soon.
2017                          */
2018                         sock_hold(sk);
2019                 }
2020                 refcounted = true;
2021                 nsk = NULL;
2022                 if (!tcp_filter(sk, skb)) {
2023                         th = (const struct tcphdr *)skb->data;
2024                         iph = ip_hdr(skb);
2025                         tcp_v4_fill_cb(skb, iph, th);
2026                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2027                 }
2028                 if (!nsk) {
2029                         reqsk_put(req);
2030                         if (req_stolen) {
2031                                 /* Another cpu got exclusive access to req
2032                                  * and created a full blown socket.
2033                                  * Try to feed this packet to this socket
2034                                  * instead of discarding it.
2035                                  */
2036                                 tcp_v4_restore_cb(skb);
2037                                 sock_put(sk);
2038                                 goto lookup;
2039                         }
2040                         goto discard_and_relse;
2041                 }
2042                 if (nsk == sk) {
2043                         reqsk_put(req);
2044                         tcp_v4_restore_cb(skb);
2045                 } else if (tcp_child_process(sk, nsk, skb)) {
2046                         tcp_v4_send_reset(nsk, skb);
2047                         goto discard_and_relse;
2048                 } else {
2049                         sock_put(sk);
2050                         return 0;
2051                 }
2052         }
2053         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2054                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2055                 goto discard_and_relse;
2056         }
2057
2058         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2059                 goto discard_and_relse;
2060
2061         if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2062                 goto discard_and_relse;
2063
2064         nf_reset_ct(skb);
2065
2066         if (tcp_filter(sk, skb))
2067                 goto discard_and_relse;
2068         th = (const struct tcphdr *)skb->data;
2069         iph = ip_hdr(skb);
2070         tcp_v4_fill_cb(skb, iph, th);
2071
2072         skb->dev = NULL;
2073
2074         if (sk->sk_state == TCP_LISTEN) {
2075                 ret = tcp_v4_do_rcv(sk, skb);
2076                 goto put_and_return;
2077         }
2078
2079         sk_incoming_cpu_update(sk);
2080
2081         bh_lock_sock_nested(sk);
2082         tcp_segs_in(tcp_sk(sk), skb);
2083         ret = 0;
2084         if (!sock_owned_by_user(sk)) {
2085                 skb_to_free = sk->sk_rx_skb_cache;
2086                 sk->sk_rx_skb_cache = NULL;
2087                 ret = tcp_v4_do_rcv(sk, skb);
2088         } else {
2089                 if (tcp_add_backlog(sk, skb))
2090                         goto discard_and_relse;
2091                 skb_to_free = NULL;
2092         }
2093         bh_unlock_sock(sk);
2094         if (skb_to_free)
2095                 __kfree_skb(skb_to_free);
2096
2097 put_and_return:
2098         if (refcounted)
2099                 sock_put(sk);
2100
2101         return ret;
2102
2103 no_tcp_socket:
2104         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2105                 goto discard_it;
2106
2107         tcp_v4_fill_cb(skb, iph, th);
2108
2109         if (tcp_checksum_complete(skb)) {
2110 csum_error:
2111                 trace_tcp_bad_csum(skb);
2112                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2113 bad_packet:
2114                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2115         } else {
2116                 tcp_v4_send_reset(NULL, skb);
2117         }
2118
2119 discard_it:
2120         /* Discard frame. */
2121         kfree_skb(skb);
2122         return 0;
2123
2124 discard_and_relse:
2125         sk_drops_add(sk, skb);
2126         if (refcounted)
2127                 sock_put(sk);
2128         goto discard_it;
2129
2130 do_time_wait:
2131         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2132                 inet_twsk_put(inet_twsk(sk));
2133                 goto discard_it;
2134         }
2135
2136         tcp_v4_fill_cb(skb, iph, th);
2137
2138         if (tcp_checksum_complete(skb)) {
2139                 inet_twsk_put(inet_twsk(sk));
2140                 goto csum_error;
2141         }
2142         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2143         case TCP_TW_SYN: {
2144                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2145                                                         &tcp_hashinfo, skb,
2146                                                         __tcp_hdrlen(th),
2147                                                         iph->saddr, th->source,
2148                                                         iph->daddr, th->dest,
2149                                                         inet_iif(skb),
2150                                                         sdif);
2151                 if (sk2) {
2152                         inet_twsk_deschedule_put(inet_twsk(sk));
2153                         sk = sk2;
2154                         tcp_v4_restore_cb(skb);
2155                         refcounted = false;
2156                         goto process;
2157                 }
2158         }
2159                 /* to ACK */
2160                 fallthrough;
2161         case TCP_TW_ACK:
2162                 tcp_v4_timewait_ack(sk, skb);
2163                 break;
2164         case TCP_TW_RST:
2165                 tcp_v4_send_reset(sk, skb);
2166                 inet_twsk_deschedule_put(inet_twsk(sk));
2167                 goto discard_it;
2168         case TCP_TW_SUCCESS:;
2169         }
2170         goto discard_it;
2171 }
2172
2173 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2174         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2175         .twsk_unique    = tcp_twsk_unique,
2176         .twsk_destructor= tcp_twsk_destructor,
2177 };
2178
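/* Cache the input route on the socket so the established fast path in
 * tcp_v4_do_rcv() and early demux can reuse it without a route lookup.
 */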
2179 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2180 {
2181         struct dst_entry *dst = skb_dst(skb);
2182
2183         if (dst && dst_hold_safe(dst)) {
2184                 sk->sk_rx_dst = dst;
2185                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2186         }
2187 }
2188 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2189
2190 const struct inet_connection_sock_af_ops ipv4_specific = {
2191         .queue_xmit        = ip_queue_xmit,
2192         .send_check        = tcp_v4_send_check,
2193         .rebuild_header    = inet_sk_rebuild_header,
2194         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2195         .conn_request      = tcp_v4_conn_request,
2196         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2197         .net_header_len    = sizeof(struct iphdr),
2198         .setsockopt        = ip_setsockopt,
2199         .getsockopt        = ip_getsockopt,
2200         .addr2sockaddr     = inet_csk_addr2sockaddr,
2201         .sockaddr_len      = sizeof(struct sockaddr_in),
2202         .mtu_reduced       = tcp_v4_mtu_reduced,
2203 };
2204 EXPORT_SYMBOL(ipv4_specific);
2205
2206 #ifdef CONFIG_TCP_MD5SIG
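/* .md5_parse handles the TCP_MD5SIG / TCP_MD5SIG_EXT socket options,
 * which is how userspace installs per-peer keys.  A minimal userspace
 * sketch (peer_sin is assumed to be the peer's struct sockaddr_in,
 * error handling omitted):
 *
 *      struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *
 *      memcpy(&md5.tcpm_addr, &peer_sin, sizeof(peer_sin));
 *      memcpy(md5.tcpm_key, "secret", 6);
 *      setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */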
2207 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2208         .md5_lookup             = tcp_v4_md5_lookup,
2209         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2210         .md5_parse              = tcp_v4_parse_md5_keys,
2211 };
2212 #endif
2213
2214 /* NOTE: A lot of things are set to zero explicitly by the call to
2215  *       sk_alloc(), so they need not be done here.
2216  */
2217 static int tcp_v4_init_sock(struct sock *sk)
2218 {
2219         struct inet_connection_sock *icsk = inet_csk(sk);
2220
2221         tcp_init_sock(sk);
2222
2223         icsk->icsk_af_ops = &ipv4_specific;
2224
2225 #ifdef CONFIG_TCP_MD5SIG
2226         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2227 #endif
2228
2229         return 0;
2230 }
2231
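/* Final per-socket cleanup: stop timers, release congestion control
 * and ULP state, purge the write and out-of-order queues, drop any
 * MD5 keys, release the bound port and free fastopen/saved-SYN state.
 */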
2232 void tcp_v4_destroy_sock(struct sock *sk)
2233 {
2234         struct tcp_sock *tp = tcp_sk(sk);
2235
2236         trace_tcp_destroy_sock(sk);
2237
2238         tcp_clear_xmit_timers(sk);
2239
2240         tcp_cleanup_congestion_control(sk);
2241
2242         tcp_cleanup_ulp(sk);
2243
2244         /* Clean up the write buffer. */
2245         tcp_write_queue_purge(sk);
2246
2247         /* Check if we want to disable active TFO */
2248         tcp_fastopen_active_disable_ofo_check(sk);
2249
2250         /* Cleans up our, hopefully empty, out_of_order_queue. */
2251         skb_rbtree_purge(&tp->out_of_order_queue);
2252
2253 #ifdef CONFIG_TCP_MD5SIG
2254         /* Clean up the MD5 key list, if any */
2255         if (tp->md5sig_info) {
2256                 tcp_clear_md5_list(sk);
2257                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2258                 tp->md5sig_info = NULL;
2259         }
2260 #endif
2261
2262         /* Clean up a referenced TCP bind bucket. */
2263         if (inet_csk(sk)->icsk_bind_hash)
2264                 inet_put_port(sk);
2265
2266         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2267
2268         /* If socket is aborted during connect operation */
2269         tcp_free_fastopen_req(tp);
2270         tcp_fastopen_destroy_cipher(sk);
2271         tcp_saved_syn_free(tp);
2272
2273         sk_sockets_allocated_dec(sk);
2274 }
2275 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2276
2277 #ifdef CONFIG_PROC_FS
2278 /* Proc filesystem TCP sock list dumping. */
2279
2280 static unsigned short seq_file_family(const struct seq_file *seq);
2281
2282 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2283 {
2284         unsigned short family = seq_file_family(seq);
2285
2286         /* AF_UNSPEC is used as a match-all */
2287         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2288                 net_eq(sock_net(sk), seq_file_net(seq)));
2289 }
2290
2291 /* Find a non-empty bucket (starting from st->bucket)
2292  * and return the first sk from it.
2293  */
2294 static void *listening_get_first(struct seq_file *seq)
2295 {
2296         struct tcp_iter_state *st = seq->private;
2297
2298         st->offset = 0;
2299         for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2300                 struct inet_listen_hashbucket *ilb2;
2301                 struct inet_connection_sock *icsk;
2302                 struct sock *sk;
2303
2304                 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2305                 if (hlist_empty(&ilb2->head))
2306                         continue;
2307
2308                 spin_lock(&ilb2->lock);
2309                 inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2310                         sk = (struct sock *)icsk;
2311                         if (seq_sk_match(seq, sk))
2312                                 return sk;
2313                 }
2314                 spin_unlock(&ilb2->lock);
2315         }
2316
2317         return NULL;
2318 }
2319
2320 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2321  * If "cur" is the last one in the st->bucket,
2322  * call listening_get_first() to return the first sk of the next
2323  * non empty bucket.
2324  * non-empty bucket.
2325 static void *listening_get_next(struct seq_file *seq, void *cur)
2326 {
2327         struct tcp_iter_state *st = seq->private;
2328         struct inet_listen_hashbucket *ilb2;
2329         struct inet_connection_sock *icsk;
2330         struct sock *sk = cur;
2331
2332         ++st->num;
2333         ++st->offset;
2334
2335         icsk = inet_csk(sk);
2336         inet_lhash2_for_each_icsk_continue(icsk) {
2337                 sk = (struct sock *)icsk;
2338                 if (seq_sk_match(seq, sk))
2339                         return sk;
2340         }
2341
2342         ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2343         spin_unlock(&ilb2->lock);
2344         ++st->bucket;
2345         return listening_get_first(seq);
2346 }
2347
2348 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2349 {
2350         struct tcp_iter_state *st = seq->private;
2351         void *rc;
2352
2353         st->bucket = 0;
2354         st->offset = 0;
2355         rc = listening_get_first(seq);
2356
2357         while (rc && *pos) {
2358                 rc = listening_get_next(seq, rc);
2359                 --*pos;
2360         }
2361         return rc;
2362 }
2363
2364 static inline bool empty_bucket(const struct tcp_iter_state *st)
2365 {
2366         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2367 }
2368
2369 /*
2370  * Get first established socket starting from bucket given in st->bucket.
2371  * If st->bucket is zero, the very first socket in the hash is returned.
2372  */
2373 static void *established_get_first(struct seq_file *seq)
2374 {
2375         struct tcp_iter_state *st = seq->private;
2376
2377         st->offset = 0;
2378         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2379                 struct sock *sk;
2380                 struct hlist_nulls_node *node;
2381                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2382
2383                 /* Lockless fast path for the common case of empty buckets */
2384                 if (empty_bucket(st))
2385                         continue;
2386
2387                 spin_lock_bh(lock);
2388                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2389                         if (seq_sk_match(seq, sk))
2390                                 return sk;
2391                 }
2392                 spin_unlock_bh(lock);
2393         }
2394
2395         return NULL;
2396 }
2397
2398 static void *established_get_next(struct seq_file *seq, void *cur)
2399 {
2400         struct sock *sk = cur;
2401         struct hlist_nulls_node *node;
2402         struct tcp_iter_state *st = seq->private;
2403
2404         ++st->num;
2405         ++st->offset;
2406
2407         sk = sk_nulls_next(sk);
2408
2409         sk_nulls_for_each_from(sk, node) {
2410                 if (seq_sk_match(seq, sk))
2411                         return sk;
2412         }
2413
2414         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2415         ++st->bucket;
2416         return established_get_first(seq);
2417 }
2418
2419 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2420 {
2421         struct tcp_iter_state *st = seq->private;
2422         void *rc;
2423
2424         st->bucket = 0;
2425         rc = established_get_first(seq);
2426
2427         while (rc && pos) {
2428                 rc = established_get_next(seq, rc);
2429                 --pos;
2430         }
2431         return rc;
2432 }
2433
2434 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2435 {
2436         void *rc;
2437         struct tcp_iter_state *st = seq->private;
2438
2439         st->state = TCP_SEQ_STATE_LISTENING;
2440         rc        = listening_get_idx(seq, &pos);
2441
2442         if (!rc) {
2443                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2444                 rc        = established_get_idx(seq, pos);
2445         }
2446
2447         return rc;
2448 }
2449
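/* Resume an interrupted dump: re-walk the bucket recorded in st->bucket,
 * skipping st->offset entries, so a subsequent read() continues where
 * the previous one stopped without rescanning the whole hash table.
 */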
2450 static void *tcp_seek_last_pos(struct seq_file *seq)
2451 {
2452         struct tcp_iter_state *st = seq->private;
2453         int bucket = st->bucket;
2454         int offset = st->offset;
2455         int orig_num = st->num;
2456         void *rc = NULL;
2457
2458         switch (st->state) {
2459         case TCP_SEQ_STATE_LISTENING:
2460                 if (st->bucket > tcp_hashinfo.lhash2_mask)
2461                         break;
2462                 st->state = TCP_SEQ_STATE_LISTENING;
2463                 rc = listening_get_first(seq);
2464                 while (offset-- && rc && bucket == st->bucket)
2465                         rc = listening_get_next(seq, rc);
2466                 if (rc)
2467                         break;
2468                 st->bucket = 0;
2469                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2470                 fallthrough;
2471         case TCP_SEQ_STATE_ESTABLISHED:
2472                 if (st->bucket > tcp_hashinfo.ehash_mask)
2473                         break;
2474                 rc = established_get_first(seq);
2475                 while (offset-- && rc && bucket == st->bucket)
2476                         rc = established_get_next(seq, rc);
2477         }
2478
2479         st->num = orig_num;
2480
2481         return rc;
2482 }
2483
2484 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2485 {
2486         struct tcp_iter_state *st = seq->private;
2487         void *rc;
2488
2489         if (*pos && *pos == st->last_pos) {
2490                 rc = tcp_seek_last_pos(seq);
2491                 if (rc)
2492                         goto out;
2493         }
2494
2495         st->state = TCP_SEQ_STATE_LISTENING;
2496         st->num = 0;
2497         st->bucket = 0;
2498         st->offset = 0;
2499         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2500
2501 out:
2502         st->last_pos = *pos;
2503         return rc;
2504 }
2505 EXPORT_SYMBOL(tcp_seq_start);
2506
2507 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2508 {
2509         struct tcp_iter_state *st = seq->private;
2510         void *rc = NULL;
2511
2512         if (v == SEQ_START_TOKEN) {
2513                 rc = tcp_get_idx(seq, 0);
2514                 goto out;
2515         }
2516
2517         switch (st->state) {
2518         case TCP_SEQ_STATE_LISTENING:
2519                 rc = listening_get_next(seq, v);
2520                 if (!rc) {
2521                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2522                         st->bucket = 0;
2523                         st->offset = 0;
2524                         rc        = established_get_first(seq);
2525                 }
2526                 break;
2527         case TCP_SEQ_STATE_ESTABLISHED:
2528                 rc = established_get_next(seq, v);
2529                 break;
2530         }
2531 out:
2532         ++*pos;
2533         st->last_pos = *pos;
2534         return rc;
2535 }
2536 EXPORT_SYMBOL(tcp_seq_next);
2537
2538 void tcp_seq_stop(struct seq_file *seq, void *v)
2539 {
2540         struct tcp_iter_state *st = seq->private;
2541
2542         switch (st->state) {
2543         case TCP_SEQ_STATE_LISTENING:
2544                 if (v != SEQ_START_TOKEN)
2545                         spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2546                 break;
2547         case TCP_SEQ_STATE_ESTABLISHED:
2548                 if (v)
2549                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2550                 break;
2551         }
2552 }
2553 EXPORT_SYMBOL(tcp_seq_stop);
2554
2555 static void get_openreq4(const struct request_sock *req,
2556                          struct seq_file *f, int i)
2557 {
2558         const struct inet_request_sock *ireq = inet_rsk(req);
2559         long delta = req->rsk_timer.expires - jiffies;
2560
2561         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2562                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2563                 i,
2564                 ireq->ir_loc_addr,
2565                 ireq->ir_num,
2566                 ireq->ir_rmt_addr,
2567                 ntohs(ireq->ir_rmt_port),
2568                 TCP_SYN_RECV,
2569                 0, 0, /* could print option size, but that is af dependent. */
2570                 1,    /* timers active (only the expire timer) */
2571                 jiffies_delta_to_clock_t(delta),
2572                 req->num_timeout,
2573                 from_kuid_munged(seq_user_ns(f),
2574                                  sock_i_uid(req->rsk_listener)),
2575                 0,  /* non standard timer */
2576                 0, /* open_requests have no inode */
2577                 0,
2578                 req);
2579 }
2580
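/* Format one TCP socket as a line of /proc/net/tcp.  Addresses are the
 * raw __be32 printed in hex (so they appear byte-swapped on little
 * endian hosts), ports are hex after ntohs().  Illustrative line with
 * made-up values - a listener on 127.0.0.1:8080 (state 0A) owned by
 * uid 1000:
 *
 *    0: 0100007F:1F90 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000        0 18293 1 0000000000000000 100 0 0 10 0
 */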
2581 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2582 {
2583         int timer_active;
2584         unsigned long timer_expires;
2585         const struct tcp_sock *tp = tcp_sk(sk);
2586         const struct inet_connection_sock *icsk = inet_csk(sk);
2587         const struct inet_sock *inet = inet_sk(sk);
2588         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2589         __be32 dest = inet->inet_daddr;
2590         __be32 src = inet->inet_rcv_saddr;
2591         __u16 destp = ntohs(inet->inet_dport);
2592         __u16 srcp = ntohs(inet->inet_sport);
2593         int rx_queue;
2594         int state;
2595
2596         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2597             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2598             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2599                 timer_active    = 1;
2600                 timer_expires   = icsk->icsk_timeout;
2601         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2602                 timer_active    = 4;
2603                 timer_expires   = icsk->icsk_timeout;
2604         } else if (timer_pending(&sk->sk_timer)) {
2605                 timer_active    = 2;
2606                 timer_expires   = sk->sk_timer.expires;
2607         } else {
2608                 timer_active    = 0;
2609                 timer_expires = jiffies;
2610         }
2611
2612         state = inet_sk_state_load(sk);
2613         if (state == TCP_LISTEN)
2614                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2615         else
2616                 /* Because we don't lock the socket,
2617                  * we might find a transient negative value.
2618                  */
2619                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2620                                       READ_ONCE(tp->copied_seq), 0);
2621
2622         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2623                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2624                 i, src, srcp, dest, destp, state,
2625                 READ_ONCE(tp->write_seq) - tp->snd_una,
2626                 rx_queue,
2627                 timer_active,
2628                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2629                 icsk->icsk_retransmits,
2630                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2631                 icsk->icsk_probes_out,
2632                 sock_i_ino(sk),
2633                 refcount_read(&sk->sk_refcnt), sk,
2634                 jiffies_to_clock_t(icsk->icsk_rto),
2635                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2636                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2637                 tp->snd_cwnd,
2638                 state == TCP_LISTEN ?
2639                     fastopenq->max_qlen :
2640                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2641 }
2642
2643 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2644                                struct seq_file *f, int i)
2645 {
2646         long delta = tw->tw_timer.expires - jiffies;
2647         __be32 dest, src;
2648         __u16 destp, srcp;
2649
2650         dest  = tw->tw_daddr;
2651         src   = tw->tw_rcv_saddr;
2652         destp = ntohs(tw->tw_dport);
2653         srcp  = ntohs(tw->tw_sport);
2654
2655         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2656                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2657                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2658                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2659                 refcount_read(&tw->tw_refcnt), tw);
2660 }
2661
2662 #define TMPSZ 150
2663
2664 static int tcp4_seq_show(struct seq_file *seq, void *v)
2665 {
2666         struct tcp_iter_state *st;
2667         struct sock *sk = v;
2668
2669         seq_setwidth(seq, TMPSZ - 1);
2670         if (v == SEQ_START_TOKEN) {
2671                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2672                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2673                            "inode");
2674                 goto out;
2675         }
2676         st = seq->private;
2677
2678         if (sk->sk_state == TCP_TIME_WAIT)
2679                 get_timewait4_sock(v, seq, st->num);
2680         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2681                 get_openreq4(v, seq, st->num);
2682         else
2683                 get_tcp4_sock(v, seq, st->num);
2684 out:
2685         seq_pad(seq, '\n');
2686         return 0;
2687 }
2688
2689 #ifdef CONFIG_BPF_SYSCALL
2690 struct bpf_tcp_iter_state {
2691         struct tcp_iter_state state;
2692         unsigned int cur_sk;
2693         unsigned int end_sk;
2694         unsigned int max_sk;
2695         struct sock **batch;
2696         bool st_bucket_done;
2697 };
2698
2699 struct bpf_iter__tcp {
2700         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2701         __bpf_md_ptr(struct sock_common *, sk_common);
2702         uid_t uid __aligned(8);
2703 };
2704
2705 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2706                              struct sock_common *sk_common, uid_t uid)
2707 {
2708         struct bpf_iter__tcp ctx;
2709
2710         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2711         ctx.meta = meta;
2712         ctx.sk_common = sk_common;
2713         ctx.uid = uid;
2714         return bpf_iter_run_prog(prog, &ctx);
2715 }
2716
2717 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2718 {
2719         while (iter->cur_sk < iter->end_sk)
2720                 sock_put(iter->batch[iter->cur_sk++]);
2721 }
2722
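/* Grow the batch array to new_batch_sz sock pointers.  Any sockets
 * still held in the old batch are released first, since their
 * references belong to the batch that is being thrown away.
 */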
2723 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2724                                       unsigned int new_batch_sz)
2725 {
2726         struct sock **new_batch;
2727
2728         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2729                              GFP_USER | __GFP_NOWARN);
2730         if (!new_batch)
2731                 return -ENOMEM;
2732
2733         bpf_iter_tcp_put_batch(iter);
2734         kvfree(iter->batch);
2735         iter->batch = new_batch;
2736         iter->max_sk = new_batch_sz;
2737
2738         return 0;
2739 }
2740
2741 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2742                                                  struct sock *start_sk)
2743 {
2744         struct bpf_tcp_iter_state *iter = seq->private;
2745         struct tcp_iter_state *st = &iter->state;
2746         struct inet_connection_sock *icsk;
2747         unsigned int expected = 1;
2748         struct sock *sk;
2749
2750         sock_hold(start_sk);
2751         iter->batch[iter->end_sk++] = start_sk;
2752
2753         icsk = inet_csk(start_sk);
2754         inet_lhash2_for_each_icsk_continue(icsk) {
2755                 sk = (struct sock *)icsk;
2756                 if (seq_sk_match(seq, sk)) {
2757                         if (iter->end_sk < iter->max_sk) {
2758                                 sock_hold(sk);
2759                                 iter->batch[iter->end_sk++] = sk;
2760                         }
2761                         expected++;
2762                 }
2763         }
2764         spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2765
2766         return expected;
2767 }
2768
2769 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2770                                                    struct sock *start_sk)
2771 {
2772         struct bpf_tcp_iter_state *iter = seq->private;
2773         struct tcp_iter_state *st = &iter->state;
2774         struct hlist_nulls_node *node;
2775         unsigned int expected = 1;
2776         struct sock *sk;
2777
2778         sock_hold(start_sk);
2779         iter->batch[iter->end_sk++] = start_sk;
2780
2781         sk = sk_nulls_next(start_sk);
2782         sk_nulls_for_each_from(sk, node) {
2783                 if (seq_sk_match(seq, sk)) {
2784                         if (iter->end_sk < iter->max_sk) {
2785                                 sock_hold(sk);
2786                                 iter->batch[iter->end_sk++] = sk;
2787                         }
2788                         expected++;
2789                 }
2790         }
2791         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2792
2793         return expected;
2794 }
2795
2796 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2797 {
2798         struct bpf_tcp_iter_state *iter = seq->private;
2799         struct tcp_iter_state *st = &iter->state;
2800         unsigned int expected;
2801         bool resized = false;
2802         struct sock *sk;
2803
2804         /* The st->bucket is done.  Directly advance to the next
2805          * bucket instead of having tcp_seek_last_pos() skip entries
2806          * one by one in the current bucket only to find out it has
2807          * to advance to the next bucket.
2808          */
2809         if (iter->st_bucket_done) {
2810                 st->offset = 0;
2811                 st->bucket++;
2812                 if (st->state == TCP_SEQ_STATE_LISTENING &&
2813                     st->bucket > tcp_hashinfo.lhash2_mask) {
2814                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2815                         st->bucket = 0;
2816                 }
2817         }
2818
2819 again:
2820         /* Get a new batch */
2821         iter->cur_sk = 0;
2822         iter->end_sk = 0;
2823         iter->st_bucket_done = false;
2824
2825         sk = tcp_seek_last_pos(seq);
2826         if (!sk)
2827                 return NULL; /* Done */
2828
2829         if (st->state == TCP_SEQ_STATE_LISTENING)
2830                 expected = bpf_iter_tcp_listening_batch(seq, sk);
2831         else
2832                 expected = bpf_iter_tcp_established_batch(seq, sk);
2833
2834         if (iter->end_sk == expected) {
2835                 iter->st_bucket_done = true;
2836                 return sk;
2837         }
2838
2839         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2840                 resized = true;
2841                 goto again;
2842         }
2843
2844         return sk;
2845 }
2846
2847 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2848 {
2849         /* bpf iter does not support lseek, so it always
2850          * continues from where it was stop()-ped.
2851          */
2852         if (*pos)
2853                 return bpf_iter_tcp_batch(seq);
2854
2855         return SEQ_START_TOKEN;
2856 }
2857
2858 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2859 {
2860         struct bpf_tcp_iter_state *iter = seq->private;
2861         struct tcp_iter_state *st = &iter->state;
2862         struct sock *sk;
2863
2864         /* Whenever seq_next() is called, the iter->cur_sk is
2865          * done with seq_show(), so advance to the next sk in
2866          * the batch.
2867          */
2868         if (iter->cur_sk < iter->end_sk) {
2869                 /* Keeping st->num consistent in tcp_iter_state.
2870                  * bpf_iter_tcp does not use st->num.
2871                  * meta.seq_num is used instead.
2872                  */
2873                 st->num++;
2874                 /* Move st->offset to the next sk in the bucket such that
2875                  * the future start() will resume at st->offset in
2876                  * st->bucket.  See tcp_seek_last_pos().
2877                  */
2878                 st->offset++;
2879                 sock_put(iter->batch[iter->cur_sk++]);
2880         }
2881
2882         if (iter->cur_sk < iter->end_sk)
2883                 sk = iter->batch[iter->cur_sk];
2884         else
2885                 sk = bpf_iter_tcp_batch(seq);
2886
2887         ++*pos;
2888         /* Keeping st->last_pos consistent in tcp_iter_state.
2889          * bpf iter does not do lseek, so st->last_pos always equals *pos.
2890          */
2891         st->last_pos = *pos;
2892         return sk;
2893 }
2894
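/* Run the attached bpf prog for one socket.  Full sockets are locked
 * with lock_sock_fast() so the prog (which may call bpf_setsockopt())
 * sees a stable socket; request and timewait sockets are not full
 * sockets and are shown without the lock.  A socket that was unhashed
 * while sitting in the batch is skipped.
 */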
2895 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2896 {
2897         struct bpf_iter_meta meta;
2898         struct bpf_prog *prog;
2899         struct sock *sk = v;
2900         bool slow;
2901         uid_t uid;
2902         int ret;
2903
2904         if (v == SEQ_START_TOKEN)
2905                 return 0;
2906
2907         if (sk_fullsock(sk))
2908                 slow = lock_sock_fast(sk);
2909
2910         if (unlikely(sk_unhashed(sk))) {
2911                 ret = SEQ_SKIP;
2912                 goto unlock;
2913         }
2914
2915         if (sk->sk_state == TCP_TIME_WAIT) {
2916                 uid = 0;
2917         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2918                 const struct request_sock *req = v;
2919
2920                 uid = from_kuid_munged(seq_user_ns(seq),
2921                                        sock_i_uid(req->rsk_listener));
2922         } else {
2923                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2924         }
2925
2926         meta.seq = seq;
2927         prog = bpf_iter_get_info(&meta, false);
2928         ret = tcp_prog_seq_show(prog, &meta, v, uid);
2929
2930 unlock:
2931         if (sk_fullsock(sk))
2932                 unlock_sock_fast(sk, slow);
2933         return ret;
2934
2935 }
2936
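/* stop() ends a read() pass.  A NULL v means the walk is done, so give
 * the bpf prog a final call with a NULL socket, then drop the
 * references on any sockets still left in the batch.
 */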
2937 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2938 {
2939         struct bpf_tcp_iter_state *iter = seq->private;
2940         struct bpf_iter_meta meta;
2941         struct bpf_prog *prog;
2942
2943         if (!v) {
2944                 meta.seq = seq;
2945                 prog = bpf_iter_get_info(&meta, true);
2946                 if (prog)
2947                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
2948         }
2949
2950         if (iter->cur_sk < iter->end_sk) {
2951                 bpf_iter_tcp_put_batch(iter);
2952                 iter->st_bucket_done = false;
2953         }
2954 }
2955
2956 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2957         .show           = bpf_iter_tcp_seq_show,
2958         .start          = bpf_iter_tcp_seq_start,
2959         .next           = bpf_iter_tcp_seq_next,
2960         .stop           = bpf_iter_tcp_seq_stop,
2961 };
2962 #endif
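/* Which address family this seq_file should dump: AF_UNSPEC for the
 * bpf iterator (the attached prog does its own filtering), otherwise
 * the family recorded in the procfs entry's afinfo.
 */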
2963 static unsigned short seq_file_family(const struct seq_file *seq)
2964 {
2965         const struct tcp_seq_afinfo *afinfo;
2966
2967 #ifdef CONFIG_BPF_SYSCALL
2968         /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
2969         if (seq->op == &bpf_iter_tcp_seq_ops)
2970                 return AF_UNSPEC;
2971 #endif
2972
2973         /* Iterated from proc fs */
2974         afinfo = PDE_DATA(file_inode(seq->file));
2975         return afinfo->family;
2976 }
2977
2978 static const struct seq_operations tcp4_seq_ops = {
2979         .show           = tcp4_seq_show,
2980         .start          = tcp_seq_start,
2981         .next           = tcp_seq_next,
2982         .stop           = tcp_seq_stop,
2983 };
2984
2985 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2986         .family         = AF_INET,
2987 };
2988
2989 static int __net_init tcp4_proc_init_net(struct net *net)
2990 {
2991         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2992                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2993                 return -ENOMEM;
2994         return 0;
2995 }
2996
2997 static void __net_exit tcp4_proc_exit_net(struct net *net)
2998 {
2999         remove_proc_entry("tcp", net->proc_net);
3000 }
3001
3002 static struct pernet_operations tcp4_net_ops = {
3003         .init = tcp4_proc_init_net,
3004         .exit = tcp4_proc_exit_net,
3005 };
3006
3007 int __init tcp4_proc_init(void)
3008 {
3009         return register_pernet_subsys(&tcp4_net_ops);
3010 }
3011
3012 void tcp4_proc_exit(void)
3013 {
3014         unregister_pernet_subsys(&tcp4_net_ops);
3015 }
3016 #endif /* CONFIG_PROC_FS */
3017
3018 /* @wake is one when sk_stream_write_space() calls us.
3019  * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3020  * This mimics the strategy used in sock_def_write_space().
3021  */
3022 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3023 {
3024         const struct tcp_sock *tp = tcp_sk(sk);
3025         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3026                             READ_ONCE(tp->snd_nxt);
3027
3028         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3029 }
3030 EXPORT_SYMBOL(tcp_stream_memory_free);
3031
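/* Protocol hooks for IPv4 TCP sockets; the IPv6 counterpart
 * (tcpv6_prot) lives in net/ipv6/tcp_ipv6.c.
 */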
3032 struct proto tcp_prot = {
3033         .name                   = "TCP",
3034         .owner                  = THIS_MODULE,
3035         .close                  = tcp_close,
3036         .pre_connect            = tcp_v4_pre_connect,
3037         .connect                = tcp_v4_connect,
3038         .disconnect             = tcp_disconnect,
3039         .accept                 = inet_csk_accept,
3040         .ioctl                  = tcp_ioctl,
3041         .init                   = tcp_v4_init_sock,
3042         .destroy                = tcp_v4_destroy_sock,
3043         .shutdown               = tcp_shutdown,
3044         .setsockopt             = tcp_setsockopt,
3045         .getsockopt             = tcp_getsockopt,
3046         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3047         .keepalive              = tcp_set_keepalive,
3048         .recvmsg                = tcp_recvmsg,
3049         .sendmsg                = tcp_sendmsg,
3050         .sendpage               = tcp_sendpage,
3051         .backlog_rcv            = tcp_v4_do_rcv,
3052         .release_cb             = tcp_release_cb,
3053         .hash                   = inet_hash,
3054         .unhash                 = inet_unhash,
3055         .get_port               = inet_csk_get_port,
3056 #ifdef CONFIG_BPF_SYSCALL
3057         .psock_update_sk_prot   = tcp_bpf_update_proto,
3058 #endif
3059         .enter_memory_pressure  = tcp_enter_memory_pressure,
3060         .leave_memory_pressure  = tcp_leave_memory_pressure,
3061         .stream_memory_free     = tcp_stream_memory_free,
3062         .sockets_allocated      = &tcp_sockets_allocated,
3063         .orphan_count           = &tcp_orphan_count,
3064         .memory_allocated       = &tcp_memory_allocated,
3065         .memory_pressure        = &tcp_memory_pressure,
3066         .sysctl_mem             = sysctl_tcp_mem,
3067         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3068         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3069         .max_header             = MAX_TCP_HEADER,
3070         .obj_size               = sizeof(struct tcp_sock),
3071         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3072         .twsk_prot              = &tcp_timewait_sock_ops,
3073         .rsk_prot               = &tcp_request_sock_ops,
3074         .h.hashinfo             = &tcp_hashinfo,
3075         .no_autobind            = true,
3076         .diag_destroy           = tcp_abort,
3077 };
3078 EXPORT_SYMBOL(tcp_prot);
3079
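/* Per-netns teardown, also used on the tcp_sk_init() failure path: put
 * the netns' congestion control module, if any, and destroy the
 * per-cpu control sockets.
 */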
3080 static void __net_exit tcp_sk_exit(struct net *net)
3081 {
3082         int cpu;
3083
3084         if (net->ipv4.tcp_congestion_control)
3085                 bpf_module_put(net->ipv4.tcp_congestion_control,
3086                                net->ipv4.tcp_congestion_control->owner);
3087
3088         for_each_possible_cpu(cpu)
3089                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
3090         free_percpu(net->ipv4.tcp_sk);
3091 }
3092
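/* Per-netns setup: create one control socket per possible CPU (used to
 * send RSTs and ACKs that do not belong to a local socket) and fill in
 * the per-netns TCP sysctl defaults.
 */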
3093 static int __net_init tcp_sk_init(struct net *net)
3094 {
3095         int res, cpu, cnt;
3096
3097         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
3098         if (!net->ipv4.tcp_sk)
3099                 return -ENOMEM;
3100
3101         for_each_possible_cpu(cpu) {
3102                 struct sock *sk;
3103
3104                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3105                                            IPPROTO_TCP, net);
3106                 if (res)
3107                         goto fail;
3108                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3109
3110                 /* Enforce IP_DF and IPID==0 for RST and ACK packets
3111                  * sent in SYN-RECV and TIME-WAIT state.
3112                  */
3113                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3114
3115                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
3116         }
3117
3118         net->ipv4.sysctl_tcp_ecn = 2;
3119         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3120
3121         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3122         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3123         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3124         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3125         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3126
3127         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3128         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3129         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3130
3131         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3132         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3133         net->ipv4.sysctl_tcp_syncookies = 1;
3134         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3135         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3136         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3137         net->ipv4.sysctl_tcp_orphan_retries = 0;
3138         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3139         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3140         net->ipv4.sysctl_tcp_tw_reuse = 2;
3141         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3142
3143         cnt = tcp_hashinfo.ehash_mask + 1;
3144         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3145         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
3146
3147         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3148         net->ipv4.sysctl_tcp_sack = 1;
3149         net->ipv4.sysctl_tcp_window_scaling = 1;
3150         net->ipv4.sysctl_tcp_timestamps = 1;
3151         net->ipv4.sysctl_tcp_early_retrans = 3;
3152         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3153         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3154         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3155         net->ipv4.sysctl_tcp_max_reordering = 300;
3156         net->ipv4.sysctl_tcp_dsack = 1;
3157         net->ipv4.sysctl_tcp_app_win = 31;
3158         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3159         net->ipv4.sysctl_tcp_frto = 2;
3160         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3161         /* This limits the percentage of the congestion window which we
3162          * will allow a single TSO frame to consume.  Building TSO frames
3163          * which are too large can cause TCP streams to be bursty.
3164          */
3165         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3166         /* Default TSQ limit of 16 TSO segments */
3167         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3168         /* rfc5961 challenge ack rate limiting */
3169         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3170         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3171         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3172         net->ipv4.sysctl_tcp_autocorking = 1;
3173         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3174         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3175         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3176         if (net != &init_net) {
3177                 memcpy(net->ipv4.sysctl_tcp_rmem,
3178                        init_net.ipv4.sysctl_tcp_rmem,
3179                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3180                 memcpy(net->ipv4.sysctl_tcp_wmem,
3181                        init_net.ipv4.sysctl_tcp_wmem,
3182                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3183         }
3184         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3185         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3186         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3187         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3188         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3189         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3190
3191         /* Reno is always built in */
3192         if (!net_eq(net, &init_net) &&
3193             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3194                                init_net.ipv4.tcp_congestion_control->owner))
3195                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3196         else
3197                 net->ipv4.tcp_congestion_control = &tcp_reno;
3198
3199         return 0;
3200 fail:
3201         tcp_sk_exit(net);
3202
3203         return res;
3204 }
3205
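/* Batched netns exit: purge TIME_WAIT sockets that still belong to a
 * dying netns and free each netns' TCP fastopen context.
 */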
3206 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3207 {
3208         struct net *net;
3209
3210         inet_twsk_purge(&tcp_hashinfo, AF_INET);
3211
3212         list_for_each_entry(net, net_exit_list, exit_list)
3213                 tcp_fastopen_ctx_destroy(net);
3214 }
3215
3216 static struct pernet_operations __net_initdata tcp_sk_ops = {
3217        .init       = tcp_sk_init,
3218        .exit       = tcp_sk_exit,
3219        .exit_batch = tcp_sk_exit_batch,
3220 };
3221
3222 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3223 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3224                      struct sock_common *sk_common, uid_t uid)
3225
3226 #define INIT_BATCH_SZ 16
3227
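/* Seq-file private init for the bpf iterator: on top of the usual
 * per-netns seq setup, pre-allocate a small socket batch; it is grown
 * on demand by bpf_iter_tcp_realloc_batch() when a bucket is larger.
 */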
3228 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3229 {
3230         struct bpf_tcp_iter_state *iter = priv_data;
3231         int err;
3232
3233         err = bpf_iter_init_seq_net(priv_data, aux);
3234         if (err)
3235                 return err;
3236
3237         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3238         if (err) {
3239                 bpf_iter_fini_seq_net(priv_data);
3240                 return err;
3241         }
3242
3243         return 0;
3244 }
3245
3246 static void bpf_iter_fini_tcp(void *priv_data)
3247 {
3248         struct bpf_tcp_iter_state *iter = priv_data;
3249
3250         bpf_iter_fini_seq_net(priv_data);
3251         kvfree(iter->batch);
3252 }
3253
3254 static const struct bpf_iter_seq_info tcp_seq_info = {
3255         .seq_ops                = &bpf_iter_tcp_seq_ops,
3256         .init_seq_private       = bpf_iter_init_tcp,
3257         .fini_seq_private       = bpf_iter_fini_tcp,
3258         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3259 };
3260
3261 static const struct bpf_func_proto *
3262 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3263                             const struct bpf_prog *prog)
3264 {
3265         switch (func_id) {
3266         case BPF_FUNC_setsockopt:
3267                 return &bpf_sk_setsockopt_proto;
3268         case BPF_FUNC_getsockopt:
3269                 return &bpf_sk_getsockopt_proto;
3270         default:
3271                 return NULL;
3272         }
3273 }
3274
3275 static struct bpf_iter_reg tcp_reg_info = {
3276         .target                 = "tcp",
3277         .ctx_arg_info_size      = 1,
3278         .ctx_arg_info           = {
3279                 { offsetof(struct bpf_iter__tcp, sk_common),
3280                   PTR_TO_BTF_ID_OR_NULL },
3281         },
3282         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3283         .seq_info               = &tcp_seq_info,
3284 };
3285
3286 static void __init bpf_iter_register(void)
3287 {
3288         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3289         if (bpf_iter_reg_target(&tcp_reg_info))
3290                 pr_warn("Warning: could not register bpf iterator tcp\n");
3291 }
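
/* Usage sketch (not kernel code): a minimal BPF iterator program for
 * this "tcp" target, roughly as it would be written against libbpf.
 * The context layout follows DEFINE_BPF_ITER_FUNC() above; the program
 * name dump_tcp is illustrative and BPF_SEQ_PRINTF comes from the
 * libbpf headers of a matching version.
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family=%d state=%d uid=%u\n",
 *			       skc->skc_family, skc->skc_state, ctx->uid);
 *		return 0;
 *	}
 *
 * The loaded program is attached as a bpf_link; reading the iterator
 * fd (or a file pinned on bpffs) drives bpf_iter_tcp_seq_ops above.
 */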
3292
3293 #endif
3294
3295 void __init tcp_v4_init(void)
3296 {
3297         if (register_pernet_subsys(&tcp_sk_ops))
3298                 panic("Failed to create the TCP control socket.\n");
3299
3300 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3301         bpf_iter_register();
3302 #endif
3303 }