net/ipv4/tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
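/* Pick the initial send sequence number for a new connection from a keyed
 * hash of the 4-tuple taken from the incoming SYN (fields swapped into
 * local/remote order).
 */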
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96         return secure_tcp_seq(ip_hdr(skb)->daddr,
97                               ip_hdr(skb)->saddr,
98                               tcp_hdr(skb)->dest,
99                               tcp_hdr(skb)->source);
100 }
101
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109         const struct inet_timewait_sock *tw = inet_twsk(sktw);
110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111         struct tcp_sock *tp = tcp_sk(sk);
112         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113
114         if (reuse == 2) {
115                 /* Still does not detect *everything* that goes through
116                  * lo, since we require a loopback src or dst address
117                  * or direct binding to 'lo' interface.
118                  */
119                 bool loopback = false;
120                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121                         loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123                 if (tw->tw_family == AF_INET6) {
124                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128                                 loopback = true;
129                 } else
130 #endif
131                 {
132                         if (ipv4_is_loopback(tw->tw_daddr) ||
133                             ipv4_is_loopback(tw->tw_rcv_saddr))
134                                 loopback = true;
135                 }
136                 if (!loopback)
137                         reuse = 0;
138         }
139
 140         /* With PAWS, it is safe from the viewpoint
 141            of data integrity. Even without PAWS it is safe provided sequence
 142            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
 143
 144            Actually, the idea is close to VJ's: only the timestamp cache is
 145            held per port pair rather than per host, and the TW bucket is used
 146            as the state holder.
 147
 148            If the TW bucket has already been destroyed, we fall back to VJ's
 149            scheme and use the initial timestamp retrieved from the peer table.
 150          */
151         if (tcptw->tw_ts_recent_stamp &&
152             (!twp || (reuse && time_after32(ktime_get_seconds(),
153                                             tcptw->tw_ts_recent_stamp)))) {
154                 /* In case of repair and re-using TIME-WAIT sockets we still
155                  * want to be sure that it is safe as above but honor the
156                  * sequence numbers and time stamps set as part of the repair
157                  * process.
158                  *
159                  * Without this check re-using a TIME-WAIT socket with TCP
160                  * repair would accumulate a -1 on the repair assigned
161                  * sequence number. The first time it is reused the sequence
162                  * is -1, the second time -2, etc. This fixes that issue
163                  * without appearing to create any others.
164                  */
165                 if (likely(!tp->repair)) {
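                        /* Start the new incarnation a full 64K window beyond
                         * the old connection's snd_nxt, so segments from the
                         * old and new incarnations cannot be confused.
                         */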
166                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167
168                         if (!seq)
169                                 seq = 1;
170                         WRITE_ONCE(tp->write_seq, seq);
171                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
172                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173                 }
174                 sock_hold(sktw);
175                 return 1;
176         }
177
178         return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183                               int addr_len)
184 {
 185         /* This check is replicated from tcp_v4_connect() and is intended to
 186          * prevent the BPF program called below from accessing bytes that are
 187          * outside the bound specified by the user in addr_len.
 188          */
189         if (addr_len < sizeof(struct sockaddr_in))
190                 return -EINVAL;
191
192         sock_owned_by_me(sk);
193
194         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201         struct inet_sock *inet = inet_sk(sk);
202         struct tcp_sock *tp = tcp_sk(sk);
203         __be16 orig_sport, orig_dport;
204         __be32 daddr, nexthop;
205         struct flowi4 *fl4;
206         struct rtable *rt;
207         int err;
208         struct ip_options_rcu *inet_opt;
209         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210
211         if (addr_len < sizeof(struct sockaddr_in))
212                 return -EINVAL;
213
214         if (usin->sin_family != AF_INET)
215                 return -EAFNOSUPPORT;
216
217         nexthop = daddr = usin->sin_addr.s_addr;
218         inet_opt = rcu_dereference_protected(inet->inet_opt,
219                                              lockdep_sock_is_held(sk));
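        /* With an IP source-route option, the first hop (faddr) is used as the
         * routing nexthop instead of the final destination.
         */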
220         if (inet_opt && inet_opt->opt.srr) {
221                 if (!daddr)
222                         return -EINVAL;
223                 nexthop = inet_opt->opt.faddr;
224         }
225
226         orig_sport = inet->inet_sport;
227         orig_dport = usin->sin_port;
228         fl4 = &inet->cork.fl.u.ip4;
229         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231                               IPPROTO_TCP,
232                               orig_sport, orig_dport, sk);
233         if (IS_ERR(rt)) {
234                 err = PTR_ERR(rt);
235                 if (err == -ENETUNREACH)
236                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237                 return err;
238         }
239
240         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241                 ip_rt_put(rt);
242                 return -ENETUNREACH;
243         }
244
245         if (!inet_opt || !inet_opt->opt.srr)
246                 daddr = fl4->daddr;
247
248         if (!inet->inet_saddr)
249                 inet->inet_saddr = fl4->saddr;
250         sk_rcv_saddr_set(sk, inet->inet_saddr);
251
252         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253                 /* Reset inherited state */
254                 tp->rx_opt.ts_recent       = 0;
255                 tp->rx_opt.ts_recent_stamp = 0;
256                 if (likely(!tp->repair))
257                         WRITE_ONCE(tp->write_seq, 0);
258         }
259
260         inet->inet_dport = usin->sin_port;
261         sk_daddr_set(sk, daddr);
262
263         inet_csk(sk)->icsk_ext_hdr_len = 0;
264         if (inet_opt)
265                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266
267         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268
 269         /* Socket identity is still unknown (sport may be zero).
 270          * However, we set the state to SYN-SENT and, without releasing the
 271          * socket lock, select a source port, enter ourselves into the hash
 272          * tables and complete initialization after this.
 273          */
274         tcp_set_state(sk, TCP_SYN_SENT);
275         err = inet_hash_connect(tcp_death_row, sk);
276         if (err)
277                 goto failure;
278
279         sk_set_txhash(sk);
280
281         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282                                inet->inet_sport, inet->inet_dport, sk);
283         if (IS_ERR(rt)) {
284                 err = PTR_ERR(rt);
285                 rt = NULL;
286                 goto failure;
287         }
288         /* OK, now commit destination to socket.  */
289         sk->sk_gso_type = SKB_GSO_TCPV4;
290         sk_setup_caps(sk, &rt->dst);
291         rt = NULL;
292
293         if (likely(!tp->repair)) {
294                 if (!tp->write_seq)
295                         WRITE_ONCE(tp->write_seq,
296                                    secure_tcp_seq(inet->inet_saddr,
297                                                   inet->inet_daddr,
298                                                   inet->inet_sport,
299                                                   usin->sin_port));
300                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301                                                  inet->inet_saddr,
302                                                  inet->inet_daddr);
303         }
304
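        /* Seed the IP ID counter with a pseudo-random value rather than
         * something predictable.
         */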
305         inet->inet_id = prandom_u32();
306
307         if (tcp_fastopen_defer_connect(sk, &err))
308                 return err;
309         if (err)
310                 goto failure;
311
312         err = tcp_connect(sk);
313
314         if (err)
315                 goto failure;
316
317         return 0;
318
319 failure:
320         /*
321          * This unhashes the socket and releases the local port,
322          * if necessary.
323          */
324         tcp_set_state(sk, TCP_CLOSE);
325         ip_rt_put(rt);
326         sk->sk_route_caps = 0;
327         inet->inet_dport = 0;
328         return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
331
 332 /*
 333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC 1191.
 334  * It can be called through tcp_release_cb() if the socket was owned by the user
 335  * at the time tcp_v4_err() was called to handle the ICMP message.
 336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339         struct inet_sock *inet = inet_sk(sk);
340         struct dst_entry *dst;
341         u32 mtu;
342
343         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344                 return;
345         mtu = tcp_sk(sk)->mtu_info;
346         dst = inet_csk_update_pmtu(sk, mtu);
347         if (!dst)
348                 return;
349
 350         /* Something is about to go wrong... Remember the soft error
 351          * in case this connection is not able to recover.
 352          */
353         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354                 sk->sk_err_soft = EMSGSIZE;
355
356         mtu = dst_mtu(dst);
357
358         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359             ip_sk_accept_pmtu(sk) &&
360             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361                 tcp_sync_mss(sk, mtu);
362
363                 /* Resend the TCP packet because it's
364                  * clear that the old packet has been
365                  * dropped. This is the new "fast" path mtu
366                  * discovery.
367                  */
368                 tcp_simple_retransmit(sk);
369         } /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372
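/* Update this socket's cached route from an ICMP redirect, if a route is cached. */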
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375         struct dst_entry *dst = __sk_dst_check(sk, 0);
376
377         if (dst)
378                 dst->ops->redirect(dst, sk, skb);
379 }
380
381
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385         struct request_sock *req = inet_reqsk(sk);
386         struct net *net = sock_net(sk);
387
388         /* ICMPs are not backlogged, hence we cannot get
389          * an established socket here.
390          */
391         if (seq != tcp_rsk(req)->snt_isn) {
392                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393         } else if (abort) {
394                 /*
395                  * Still in SYN_RECV, just remove it silently.
396                  * There is no good way to pass the error to the newly
397                  * created socket, and POSIX does not want network
398                  * errors returned from accept().
399                  */
400                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401                 tcp_listendrop(req->rsk_listener);
402         }
403         reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410         struct inet_connection_sock *icsk = inet_csk(sk);
411         struct tcp_sock *tp = tcp_sk(sk);
412         struct sk_buff *skb;
413         s32 remaining;
414         u32 delta_us;
415
416         if (sock_owned_by_user(sk))
417                 return;
418
419         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420             !icsk->icsk_backoff)
421                 return;
422
423         skb = tcp_rtx_queue_head(sk);
424         if (WARN_ON_ONCE(!skb))
425                 return;
426
427         icsk->icsk_backoff--;
428         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430
431         tcp_mstamp_refresh(tp);
432         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434
435         if (remaining > 0) {
436                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437                                           remaining, TCP_RTO_MAX);
438         } else {
439                 /* RTO revert clocked out retransmission.
440                  * Will retransmit now.
441                  */
442                 tcp_retransmit_timer(sk);
443         }
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment
452  * header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else accesses the socket the ICMP is just dropped
457  * and for some paths there is no check at all.
458  * A more general error queue to queue errors for later handling
459  * is probably better.
460  *
461  */
462
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465         const struct iphdr *iph = (const struct iphdr *)skb->data;
466         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467         struct tcp_sock *tp;
468         struct inet_sock *inet;
469         const int type = icmp_hdr(skb)->type;
470         const int code = icmp_hdr(skb)->code;
471         struct sock *sk;
472         struct request_sock *fastopen;
473         u32 seq, snd_una;
474         int err;
475         struct net *net = dev_net(skb->dev);
476
477         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478                                        th->dest, iph->saddr, ntohs(th->source),
479                                        inet_iif(skb), 0);
480         if (!sk) {
481                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482                 return -ENOENT;
483         }
484         if (sk->sk_state == TCP_TIME_WAIT) {
485                 inet_twsk_put(inet_twsk(sk));
486                 return 0;
487         }
488         seq = ntohl(th->seq);
489         if (sk->sk_state == TCP_NEW_SYN_RECV) {
490                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491                                      type == ICMP_TIME_EXCEEDED ||
492                                      (type == ICMP_DEST_UNREACH &&
493                                       (code == ICMP_NET_UNREACH ||
494                                        code == ICMP_HOST_UNREACH)));
495                 return 0;
496         }
497
498         bh_lock_sock(sk);
 499         /* If too many ICMPs get dropped on busy
 500          * servers this needs to be solved differently.
 501          * We do take care of the PMTU discovery (RFC 1191) special case:
 502          * we can receive locally generated ICMP messages while the socket is held.
 503          */
504         if (sock_owned_by_user(sk)) {
505                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507         }
508         if (sk->sk_state == TCP_CLOSE)
509                 goto out;
510
511         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513                 goto out;
514         }
515
516         tp = tcp_sk(sk);
 517         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
518         fastopen = rcu_dereference(tp->fastopen_rsk);
519         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520         if (sk->sk_state != TCP_LISTEN &&
521             !between(seq, snd_una, tp->snd_nxt)) {
522                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523                 goto out;
524         }
525
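        /* Translate the ICMP type/code into an errno; PMTU probes, redirects
         * and RFC 6069 RTO reverts are handled as special cases.
         */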
526         switch (type) {
527         case ICMP_REDIRECT:
528                 if (!sock_owned_by_user(sk))
529                         do_redirect(skb, sk);
530                 goto out;
531         case ICMP_SOURCE_QUENCH:
532                 /* Just silently ignore these. */
533                 goto out;
534         case ICMP_PARAMETERPROB:
535                 err = EPROTO;
536                 break;
537         case ICMP_DEST_UNREACH:
538                 if (code > NR_ICMP_UNREACH)
539                         goto out;
540
541                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542                         /* We are not interested in TCP_LISTEN and open_requests
 543                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
544                          * they should go through unfragmented).
545                          */
546                         if (sk->sk_state == TCP_LISTEN)
547                                 goto out;
548
549                         tp->mtu_info = info;
550                         if (!sock_owned_by_user(sk)) {
551                                 tcp_v4_mtu_reduced(sk);
552                         } else {
553                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554                                         sock_hold(sk);
555                         }
556                         goto out;
557                 }
558
559                 err = icmp_err_convert[code].errno;
 560                 /* Check if this ICMP message allows reverting the backoff
 561                  * (see RFC 6069).
 562                  */
563                 if (!fastopen &&
564                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565                         tcp_ld_RTO_revert(sk, seq);
566                 break;
567         case ICMP_TIME_EXCEEDED:
568                 err = EHOSTUNREACH;
569                 break;
570         default:
571                 goto out;
572         }
573
574         switch (sk->sk_state) {
575         case TCP_SYN_SENT:
576         case TCP_SYN_RECV:
 577                 /* Only in fast or simultaneous open. If a fast open socket
 578                  * is already accepted it is treated as a connected one below.
579                  */
580                 if (fastopen && !fastopen->sk)
581                         break;
582
583                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584
585                 if (!sock_owned_by_user(sk)) {
586                         sk->sk_err = err;
587
588                         sk->sk_error_report(sk);
589
590                         tcp_done(sk);
591                 } else {
592                         sk->sk_err_soft = err;
593                 }
594                 goto out;
595         }
596
 597         /* If we've already connected we will keep trying
 598          * until we time out, or the user gives up.
 599          *
 600          * rfc1122 4.2.3.9 allows us to consider as hard errors
 601          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 602          * but it is obsoleted by pmtu discovery).
 603          *
 604          * Note that in the modern internet, where routing is unreliable
 605          * and broken firewalls sit in every dark corner sending random
 606          * errors ordered by their masters, even these two messages have
 607          * lost their original sense (even Linux sends invalid PORT_UNREACHs).
 608          *
 609          * Now we are in compliance with RFCs.
 610          *                                                      --ANK (980905)
 611          */
612
613         inet = inet_sk(sk);
614         if (!sock_owned_by_user(sk) && inet->recverr) {
615                 sk->sk_err = err;
616                 sk->sk_error_report(sk);
617         } else  { /* Only an error on timeout */
618                 sk->sk_err_soft = err;
619         }
620
621 out:
622         bh_unlock_sock(sk);
623         sock_put(sk);
624         return 0;
625 }
626
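/* Set up a partial checksum: store the pseudo-header checksum in th->check and
 * record where the device (or the software fallback) should write the final
 * TCP checksum.
 */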
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 {
629         struct tcphdr *th = tcp_hdr(skb);
630
631         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632         skb->csum_start = skb_transport_header(skb) - skb->head;
633         skb->csum_offset = offsetof(struct tcphdr, check);
634 }
635
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 {
639         const struct inet_sock *inet = inet_sk(sk);
640
641         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 }
643 EXPORT_SYMBOL(tcp_v4_send_check);
644
 645 /*
 646  *      This routine will send an RST to the other tcp.
 647  *
 648  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 649  *                    for the reset?
 650  *      Answer: if a packet caused the RST, it is not for a socket
 651  *              existing in our system; if it is matched to a socket,
 652  *              it is just a duplicate segment or a bug in the other side's TCP.
 653  *              So we build the reply based only on the parameters that
 654  *              arrived with the segment.
 655  *      Exception: precedence violation. We do not implement it in any case.
 656  */
657
658 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
659 {
660         const struct tcphdr *th = tcp_hdr(skb);
661         struct {
662                 struct tcphdr th;
663 #ifdef CONFIG_TCP_MD5SIG
664                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
665 #endif
666         } rep;
667         struct ip_reply_arg arg;
668 #ifdef CONFIG_TCP_MD5SIG
669         struct tcp_md5sig_key *key = NULL;
670         const __u8 *hash_location = NULL;
671         unsigned char newhash[16];
672         int genhash;
673         struct sock *sk1 = NULL;
674 #endif
675         u64 transmit_time = 0;
676         struct sock *ctl_sk;
677         struct net *net;
678
679         /* Never send a reset in response to a reset. */
680         if (th->rst)
681                 return;
682
683         /* If sk not NULL, it means we did a successful lookup and incoming
684          * route had to be correct. prequeue might have dropped our dst.
685          */
686         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
687                 return;
688
689         /* Swap the send and the receive. */
690         memset(&rep, 0, sizeof(rep));
691         rep.th.dest   = th->source;
692         rep.th.source = th->dest;
693         rep.th.doff   = sizeof(struct tcphdr) / 4;
694         rep.th.rst    = 1;
695
696         if (th->ack) {
697                 rep.th.seq = th->ack_seq;
698         } else {
699                 rep.th.ack = 1;
700                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
701                                        skb->len - (th->doff << 2));
702         }
703
704         memset(&arg, 0, sizeof(arg));
705         arg.iov[0].iov_base = (unsigned char *)&rep;
706         arg.iov[0].iov_len  = sizeof(rep.th);
707
708         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
709 #ifdef CONFIG_TCP_MD5SIG
710         rcu_read_lock();
711         hash_location = tcp_parse_md5sig_option(th);
712         if (sk && sk_fullsock(sk)) {
713                 const union tcp_md5_addr *addr;
714                 int l3index;
715
716                 /* sdif set, means packet ingressed via a device
717                  * in an L3 domain and inet_iif is set to it.
718                  */
719                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
720                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
721                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
722         } else if (hash_location) {
723                 const union tcp_md5_addr *addr;
724                 int sdif = tcp_v4_sdif(skb);
725                 int dif = inet_iif(skb);
726                 int l3index;
727
 728                 /*
 729                  * The active side is lost. Try to find the listening socket via
 730                  * the source port, and then find the md5 key through that
 731                  * listening socket. We do not lose security here:
 732                  * the incoming packet is checked against the md5 hash of the key
 733                  * we find; no RST is generated if the hash doesn't match.
 734                  */
735                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
736                                              ip_hdr(skb)->saddr,
737                                              th->source, ip_hdr(skb)->daddr,
738                                              ntohs(th->source), dif, sdif);
 739                 /* don't send an RST if we can't find a key */
740                 if (!sk1)
741                         goto out;
742
743                 /* sdif set, means packet ingressed via a device
744                  * in an L3 domain and dif is set to it.
745                  */
746                 l3index = sdif ? dif : 0;
747                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
748                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
749                 if (!key)
750                         goto out;
751
752
753                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
754                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
755                         goto out;
756
757         }
758
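        /* If we found an MD5 key for the peer, append a TCP MD5 signature
         * option (RFC 2385) to the RST and compute the signature over it.
         */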
759         if (key) {
760                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
761                                    (TCPOPT_NOP << 16) |
762                                    (TCPOPT_MD5SIG << 8) |
763                                    TCPOLEN_MD5SIG);
764                 /* Update length and the length the header thinks exists */
765                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766                 rep.th.doff = arg.iov[0].iov_len / 4;
767
768                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
769                                      key, ip_hdr(skb)->saddr,
770                                      ip_hdr(skb)->daddr, &rep.th);
771         }
772 #endif
773         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
774                                       ip_hdr(skb)->saddr, /* XXX */
775                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
776         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
777         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
778
 779         /* When the socket is gone, all binding information is lost and
 780          * routing might fail. No choice here: if we chose to force the
 781          * input interface, we would misroute in case of an asymmetric route.
 782          */
783         if (sk) {
784                 arg.bound_dev_if = sk->sk_bound_dev_if;
785                 if (sk_fullsock(sk))
786                         trace_tcp_send_reset(sk, skb);
787         }
788
789         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
790                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
791
792         arg.tos = ip_hdr(skb)->tos;
793         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
794         local_bh_disable();
795         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
796         if (sk) {
797                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
798                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
799                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
800                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
801                 transmit_time = tcp_transmit_time(sk);
802         }
803         ip_send_unicast_reply(ctl_sk,
804                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
805                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
806                               &arg, arg.iov[0].iov_len,
807                               transmit_time);
808
809         ctl_sk->sk_mark = 0;
810         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
811         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
812         local_bh_enable();
813
814 #ifdef CONFIG_TCP_MD5SIG
815 out:
816         rcu_read_unlock();
817 #endif
818 }
819
 820 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 821    outside socket context, is certainly ugly. What can I do?
 822  */
823
824 static void tcp_v4_send_ack(const struct sock *sk,
825                             struct sk_buff *skb, u32 seq, u32 ack,
826                             u32 win, u32 tsval, u32 tsecr, int oif,
827                             struct tcp_md5sig_key *key,
828                             int reply_flags, u8 tos)
829 {
830         const struct tcphdr *th = tcp_hdr(skb);
831         struct {
832                 struct tcphdr th;
833                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
834 #ifdef CONFIG_TCP_MD5SIG
835                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
836 #endif
837                         ];
838         } rep;
839         struct net *net = sock_net(sk);
840         struct ip_reply_arg arg;
841         struct sock *ctl_sk;
842         u64 transmit_time;
843
844         memset(&rep.th, 0, sizeof(struct tcphdr));
845         memset(&arg, 0, sizeof(arg));
846
847         arg.iov[0].iov_base = (unsigned char *)&rep;
848         arg.iov[0].iov_len  = sizeof(rep.th);
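        /* If the peer supplied a timestamp, include a TCP timestamp option
         * carrying our tsval and echoing the peer's value (tsecr).
         */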
849         if (tsecr) {
850                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
851                                    (TCPOPT_TIMESTAMP << 8) |
852                                    TCPOLEN_TIMESTAMP);
853                 rep.opt[1] = htonl(tsval);
854                 rep.opt[2] = htonl(tsecr);
855                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
856         }
857
858         /* Swap the send and the receive. */
859         rep.th.dest    = th->source;
860         rep.th.source  = th->dest;
861         rep.th.doff    = arg.iov[0].iov_len / 4;
862         rep.th.seq     = htonl(seq);
863         rep.th.ack_seq = htonl(ack);
864         rep.th.ack     = 1;
865         rep.th.window  = htons(win);
866
867 #ifdef CONFIG_TCP_MD5SIG
868         if (key) {
869                 int offset = (tsecr) ? 3 : 0;
870
871                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
872                                           (TCPOPT_NOP << 16) |
873                                           (TCPOPT_MD5SIG << 8) |
874                                           TCPOLEN_MD5SIG);
875                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
876                 rep.th.doff = arg.iov[0].iov_len/4;
877
878                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
879                                     key, ip_hdr(skb)->saddr,
880                                     ip_hdr(skb)->daddr, &rep.th);
881         }
882 #endif
883         arg.flags = reply_flags;
884         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
885                                       ip_hdr(skb)->saddr, /* XXX */
886                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
887         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
888         if (oif)
889                 arg.bound_dev_if = oif;
890         arg.tos = tos;
891         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
892         local_bh_disable();
893         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
894         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
895                            inet_twsk(sk)->tw_mark : sk->sk_mark;
896         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
897                            inet_twsk(sk)->tw_priority : sk->sk_priority;
898         transmit_time = tcp_transmit_time(sk);
899         ip_send_unicast_reply(ctl_sk,
900                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
901                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
902                               &arg, arg.iov[0].iov_len,
903                               transmit_time);
904
905         ctl_sk->sk_mark = 0;
906         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
907         local_bh_enable();
908 }
909
910 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
911 {
912         struct inet_timewait_sock *tw = inet_twsk(sk);
913         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
914
915         tcp_v4_send_ack(sk, skb,
916                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
917                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
918                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
919                         tcptw->tw_ts_recent,
920                         tw->tw_bound_dev_if,
921                         tcp_twsk_md5_key(tcptw),
922                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
923                         tw->tw_tos
924                         );
925
926         inet_twsk_put(tw);
927 }
928
929 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
930                                   struct request_sock *req)
931 {
932         const union tcp_md5_addr *addr;
933         int l3index;
934
935         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
936          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
937          */
938         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
939                                              tcp_sk(sk)->snd_nxt;
940
941         /* RFC 7323 2.3
942          * The window field (SEG.WND) of every outgoing segment, with the
943          * exception of <SYN> segments, MUST be right-shifted by
944          * Rcv.Wind.Shift bits:
945          */
946         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
947         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
948         tcp_v4_send_ack(sk, skb, seq,
949                         tcp_rsk(req)->rcv_nxt,
950                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
951                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
952                         req->ts_recent,
953                         0,
954                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
955                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
956                         ip_hdr(skb)->tos);
957 }
958
959 /*
960  *      Send a SYN-ACK after having received a SYN.
961  *      This still operates on a request_sock only, not on a big
962  *      socket.
963  */
964 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
965                               struct flowi *fl,
966                               struct request_sock *req,
967                               struct tcp_fastopen_cookie *foc,
968                               enum tcp_synack_type synack_type)
969 {
970         const struct inet_request_sock *ireq = inet_rsk(req);
971         struct flowi4 fl4;
972         int err = -1;
973         struct sk_buff *skb;
974
975         /* First, grab a route. */
976         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
977                 return -1;
978
979         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
980
981         if (skb) {
982                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
983
984                 rcu_read_lock();
985                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
986                                             ireq->ir_rmt_addr,
987                                             rcu_dereference(ireq->ireq_opt));
988                 rcu_read_unlock();
989                 err = net_xmit_eval(err);
990         }
991
992         return err;
993 }
994
995 /*
996  *      IPv4 request_sock destructor.
997  */
998 static void tcp_v4_reqsk_destructor(struct request_sock *req)
999 {
1000         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1001 }
1002
1003 #ifdef CONFIG_TCP_MD5SIG
1004 /*
1005  * RFC2385 MD5 checksumming requires a mapping of
1006  * IP address->MD5 Key.
1007  * We need to maintain these in the sk structure.
1008  */
1009
1010 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1011 EXPORT_SYMBOL(tcp_md5_needed);
1012
1013 /* Find the Key structure for an address.  */
1014 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1015                                            const union tcp_md5_addr *addr,
1016                                            int family)
1017 {
1018         const struct tcp_sock *tp = tcp_sk(sk);
1019         struct tcp_md5sig_key *key;
1020         const struct tcp_md5sig_info *md5sig;
1021         __be32 mask;
1022         struct tcp_md5sig_key *best_match = NULL;
1023         bool match;
1024
1025         /* caller either holds rcu_read_lock() or socket lock */
1026         md5sig = rcu_dereference_check(tp->md5sig_info,
1027                                        lockdep_sock_is_held(sk));
1028         if (!md5sig)
1029                 return NULL;
1030
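        /* Walk the key list and keep the most specific (longest-prefix) match
         * for this address family and L3 domain.
         */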
1031         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1032                                  lockdep_sock_is_held(sk)) {
1033                 if (key->family != family)
1034                         continue;
1035                 if (key->l3index && key->l3index != l3index)
1036                         continue;
1037                 if (family == AF_INET) {
1038                         mask = inet_make_mask(key->prefixlen);
1039                         match = (key->addr.a4.s_addr & mask) ==
1040                                 (addr->a4.s_addr & mask);
1041 #if IS_ENABLED(CONFIG_IPV6)
1042                 } else if (family == AF_INET6) {
1043                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1044                                                   key->prefixlen);
1045 #endif
1046                 } else {
1047                         match = false;
1048                 }
1049
1050                 if (match && (!best_match ||
1051                               key->prefixlen > best_match->prefixlen))
1052                         best_match = key;
1053         }
1054         return best_match;
1055 }
1056 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1057
1058 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1059                                                       const union tcp_md5_addr *addr,
1060                                                       int family, u8 prefixlen,
1061                                                       int l3index)
1062 {
1063         const struct tcp_sock *tp = tcp_sk(sk);
1064         struct tcp_md5sig_key *key;
1065         unsigned int size = sizeof(struct in_addr);
1066         const struct tcp_md5sig_info *md5sig;
1067
1068         /* caller either holds rcu_read_lock() or socket lock */
1069         md5sig = rcu_dereference_check(tp->md5sig_info,
1070                                        lockdep_sock_is_held(sk));
1071         if (!md5sig)
1072                 return NULL;
1073 #if IS_ENABLED(CONFIG_IPV6)
1074         if (family == AF_INET6)
1075                 size = sizeof(struct in6_addr);
1076 #endif
1077         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1078                                  lockdep_sock_is_held(sk)) {
1079                 if (key->family != family)
1080                         continue;
1081                 if (key->l3index && key->l3index != l3index)
1082                         continue;
1083                 if (!memcmp(&key->addr, addr, size) &&
1084                     key->prefixlen == prefixlen)
1085                         return key;
1086         }
1087         return NULL;
1088 }
1089
1090 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1091                                          const struct sock *addr_sk)
1092 {
1093         const union tcp_md5_addr *addr;
1094         int l3index;
1095
1096         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1097                                                  addr_sk->sk_bound_dev_if);
1098         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1099         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1100 }
1101 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1102
1103 /* This can be called on a newly created socket, from other files */
1104 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1105                    int family, u8 prefixlen, int l3index,
1106                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1107 {
1108         /* Add Key to the list */
1109         struct tcp_md5sig_key *key;
1110         struct tcp_sock *tp = tcp_sk(sk);
1111         struct tcp_md5sig_info *md5sig;
1112
1113         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1114         if (key) {
1115                 /* Pre-existing entry - just update that one.
1116                  * Note that the key might be used concurrently.
1117                  * data_race() is telling kcsan that we do not care about
1118                  * key mismatches, since changing the MD5 key on live flows
1119                  * can lead to packet drops.
1120                  */
1121                 data_race(memcpy(key->key, newkey, newkeylen));
1122
1123                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1124                  * Also note that a reader could catch new key->keylen value
1125                  * but old key->key[], this is the reason we use __GFP_ZERO
1126                  * at sock_kmalloc() time below these lines.
1127                  */
1128                 WRITE_ONCE(key->keylen, newkeylen);
1129
1130                 return 0;
1131         }
1132
1133         md5sig = rcu_dereference_protected(tp->md5sig_info,
1134                                            lockdep_sock_is_held(sk));
1135         if (!md5sig) {
1136                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1137                 if (!md5sig)
1138                         return -ENOMEM;
1139
1140                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1141                 INIT_HLIST_HEAD(&md5sig->head);
1142                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1143         }
1144
1145         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1146         if (!key)
1147                 return -ENOMEM;
1148         if (!tcp_alloc_md5sig_pool()) {
1149                 sock_kfree_s(sk, key, sizeof(*key));
1150                 return -ENOMEM;
1151         }
1152
1153         memcpy(key->key, newkey, newkeylen);
1154         key->keylen = newkeylen;
1155         key->family = family;
1156         key->prefixlen = prefixlen;
1157         key->l3index = l3index;
1158         memcpy(&key->addr, addr,
1159                (family == AF_INET6) ? sizeof(struct in6_addr) :
1160                                       sizeof(struct in_addr));
1161         hlist_add_head_rcu(&key->node, &md5sig->head);
1162         return 0;
1163 }
1164 EXPORT_SYMBOL(tcp_md5_do_add);
1165
1166 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1167                    u8 prefixlen, int l3index)
1168 {
1169         struct tcp_md5sig_key *key;
1170
1171         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1172         if (!key)
1173                 return -ENOENT;
1174         hlist_del_rcu(&key->node);
1175         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1176         kfree_rcu(key, rcu);
1177         return 0;
1178 }
1179 EXPORT_SYMBOL(tcp_md5_do_del);
1180
1181 static void tcp_clear_md5_list(struct sock *sk)
1182 {
1183         struct tcp_sock *tp = tcp_sk(sk);
1184         struct tcp_md5sig_key *key;
1185         struct hlist_node *n;
1186         struct tcp_md5sig_info *md5sig;
1187
1188         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1189
1190         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1191                 hlist_del_rcu(&key->node);
1192                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1193                 kfree_rcu(key, rcu);
1194         }
1195 }
1196
1197 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1198                                  sockptr_t optval, int optlen)
1199 {
1200         struct tcp_md5sig cmd;
1201         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1202         const union tcp_md5_addr *addr;
1203         u8 prefixlen = 32;
1204         int l3index = 0;
1205
1206         if (optlen < sizeof(cmd))
1207                 return -EINVAL;
1208
1209         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1210                 return -EFAULT;
1211
1212         if (sin->sin_family != AF_INET)
1213                 return -EINVAL;
1214
1215         if (optname == TCP_MD5SIG_EXT &&
1216             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1217                 prefixlen = cmd.tcpm_prefixlen;
1218                 if (prefixlen > 32)
1219                         return -EINVAL;
1220         }
1221
1222         if (optname == TCP_MD5SIG_EXT &&
1223             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1224                 struct net_device *dev;
1225
1226                 rcu_read_lock();
1227                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1228                 if (dev && netif_is_l3_master(dev))
1229                         l3index = dev->ifindex;
1230
1231                 rcu_read_unlock();
1232
1233                 /* ok to check whether dev was set or not outside of rcu;
1234                  * right now the device MUST be an L3 master
1235                  */
1236                 if (!dev || !l3index)
1237                         return -EINVAL;
1238         }
1239
1240         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1241
1242         if (!cmd.tcpm_keylen)
1243                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1244
1245         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1246                 return -EINVAL;
1247
1248         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1249                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1250 }
1251
1252 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1253                                    __be32 daddr, __be32 saddr,
1254                                    const struct tcphdr *th, int nbytes)
1255 {
1256         struct tcp4_pseudohdr *bp;
1257         struct scatterlist sg;
1258         struct tcphdr *_th;
1259
1260         bp = hp->scratch;
1261         bp->saddr = saddr;
1262         bp->daddr = daddr;
1263         bp->pad = 0;
1264         bp->protocol = IPPROTO_TCP;
1265         bp->len = cpu_to_be16(nbytes);
1266
1267         _th = (struct tcphdr *)(bp + 1);
1268         memcpy(_th, th, sizeof(*th));
1269         _th->check = 0;
1270
1271         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1272         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1273                                 sizeof(*bp) + sizeof(*th));
1274         return crypto_ahash_update(hp->md5_req);
1275 }
1276
1277 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1278                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1279 {
1280         struct tcp_md5sig_pool *hp;
1281         struct ahash_request *req;
1282
1283         hp = tcp_get_md5sig_pool();
1284         if (!hp)
1285                 goto clear_hash_noput;
1286         req = hp->md5_req;
1287
1288         if (crypto_ahash_init(req))
1289                 goto clear_hash;
1290         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1291                 goto clear_hash;
1292         if (tcp_md5_hash_key(hp, key))
1293                 goto clear_hash;
1294         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1295         if (crypto_ahash_final(req))
1296                 goto clear_hash;
1297
1298         tcp_put_md5sig_pool();
1299         return 0;
1300
1301 clear_hash:
1302         tcp_put_md5sig_pool();
1303 clear_hash_noput:
1304         memset(md5_hash, 0, 16);
1305         return 1;
1306 }
1307
1308 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1309                         const struct sock *sk,
1310                         const struct sk_buff *skb)
1311 {
1312         struct tcp_md5sig_pool *hp;
1313         struct ahash_request *req;
1314         const struct tcphdr *th = tcp_hdr(skb);
1315         __be32 saddr, daddr;
1316
1317         if (sk) { /* valid for establish/request sockets */
1318                 saddr = sk->sk_rcv_saddr;
1319                 daddr = sk->sk_daddr;
1320         } else {
1321                 const struct iphdr *iph = ip_hdr(skb);
1322                 saddr = iph->saddr;
1323                 daddr = iph->daddr;
1324         }
1325
1326         hp = tcp_get_md5sig_pool();
1327         if (!hp)
1328                 goto clear_hash_noput;
1329         req = hp->md5_req;
1330
1331         if (crypto_ahash_init(req))
1332                 goto clear_hash;
1333
1334         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1335                 goto clear_hash;
1336         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1337                 goto clear_hash;
1338         if (tcp_md5_hash_key(hp, key))
1339                 goto clear_hash;
1340         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1341         if (crypto_ahash_final(req))
1342                 goto clear_hash;
1343
1344         tcp_put_md5sig_pool();
1345         return 0;
1346
1347 clear_hash:
1348         tcp_put_md5sig_pool();
1349 clear_hash_noput:
1350         memset(md5_hash, 0, 16);
1351         return 1;
1352 }
1353 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1354
1355 #endif
1356
1357 /* Called with rcu_read_lock() */
1358 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1359                                     const struct sk_buff *skb,
1360                                     int dif, int sdif)
1361 {
1362 #ifdef CONFIG_TCP_MD5SIG
1363         /*
1364          * This gets called for each TCP segment that arrives
1365          * so we want to be efficient.
1366          * We have 3 drop cases:
1367          * o No MD5 hash and one expected.
1368          * o MD5 hash and we're not expecting one.
1369          * o MD5 hash and it's wrong.
1370          */
1371         const __u8 *hash_location = NULL;
1372         struct tcp_md5sig_key *hash_expected;
1373         const struct iphdr *iph = ip_hdr(skb);
1374         const struct tcphdr *th = tcp_hdr(skb);
1375         const union tcp_md5_addr *addr;
1376         unsigned char newhash[16];
1377         int genhash, l3index;
1378
1379         /* sdif set, means packet ingressed via a device
1380          * in an L3 domain and dif is set to the l3mdev
1381          */
1382         l3index = sdif ? dif : 0;
1383
1384         addr = (union tcp_md5_addr *)&iph->saddr;
1385         hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1386         hash_location = tcp_parse_md5sig_option(th);
1387
1388         /* We've parsed the options - do we have a hash? */
1389         if (!hash_expected && !hash_location)
1390                 return false;
1391
1392         if (hash_expected && !hash_location) {
1393                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1394                 return true;
1395         }
1396
1397         if (!hash_expected && hash_location) {
1398                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1399                 return true;
1400         }
1401
1402         /* Okay, so both hash_expected and hash_location are set -
1403          * we need to compute the hash and compare.
1404          */
1405         genhash = tcp_v4_md5_hash_skb(newhash,
1406                                       hash_expected,
1407                                       NULL, skb);
1408
1409         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1410                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1411                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1412                                      &iph->saddr, ntohs(th->source),
1413                                      &iph->daddr, ntohs(th->dest),
1414                                      genhash ? " tcp_v4_calc_md5_hash failed"
1415                                      : "", l3index);
1416                 return true;
1417         }
1418         return false;
1419 #endif
1420         return false;
1421 }
1422
1423 static void tcp_v4_init_req(struct request_sock *req,
1424                             const struct sock *sk_listener,
1425                             struct sk_buff *skb)
1426 {
1427         struct inet_request_sock *ireq = inet_rsk(req);
1428         struct net *net = sock_net(sk_listener);
1429
1430         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1431         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1432         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1433 }
1434
1435 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1436                                           struct flowi *fl,
1437                                           const struct request_sock *req)
1438 {
1439         return inet_csk_route_req(sk, &fl->u.ip4, req);
1440 }
1441
1442 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1443         .family         =       PF_INET,
1444         .obj_size       =       sizeof(struct tcp_request_sock),
1445         .rtx_syn_ack    =       tcp_rtx_synack,
1446         .send_ack       =       tcp_v4_reqsk_send_ack,
1447         .destructor     =       tcp_v4_reqsk_destructor,
1448         .send_reset     =       tcp_v4_send_reset,
1449         .syn_ack_timeout =      tcp_syn_ack_timeout,
1450 };
1451
1452 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1453         .mss_clamp      =       TCP_MSS_DEFAULT,
1454 #ifdef CONFIG_TCP_MD5SIG
1455         .req_md5_lookup =       tcp_v4_md5_lookup,
1456         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1457 #endif
1458         .init_req       =       tcp_v4_init_req,
1459 #ifdef CONFIG_SYN_COOKIES
1460         .cookie_init_seq =      cookie_v4_init_sequence,
1461 #endif
1462         .route_req      =       tcp_v4_route_req,
1463         .init_seq       =       tcp_v4_init_seq,
1464         .init_ts_off    =       tcp_v4_init_ts_off,
1465         .send_synack    =       tcp_v4_send_synack,
1466 };
1467
1468 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1469 {
1470         /* Never answer to SYNs sent to broadcast or multicast */
1471         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1472                 goto drop;
1473
1474         return tcp_conn_request(&tcp_request_sock_ops,
1475                                 &tcp_request_sock_ipv4_ops, sk, skb);
1476
1477 drop:
1478         tcp_listendrop(sk);
1479         return 0;
1480 }
1481 EXPORT_SYMBOL(tcp_v4_conn_request);
1482
1483
1484 /*
1485  * The three-way handshake has completed - we got a valid ACK -
1486  * now create the new socket.
1487  */
1488 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1489                                   struct request_sock *req,
1490                                   struct dst_entry *dst,
1491                                   struct request_sock *req_unhash,
1492                                   bool *own_req)
1493 {
1494         struct inet_request_sock *ireq;
1495         struct inet_sock *newinet;
1496         struct tcp_sock *newtp;
1497         struct sock *newsk;
1498 #ifdef CONFIG_TCP_MD5SIG
1499         const union tcp_md5_addr *addr;
1500         struct tcp_md5sig_key *key;
1501         int l3index;
1502 #endif
1503         struct ip_options_rcu *inet_opt;
1504
1505         if (sk_acceptq_is_full(sk))
1506                 goto exit_overflow;
1507
1508         newsk = tcp_create_openreq_child(sk, req, skb);
1509         if (!newsk)
1510                 goto exit_nonewsk;
1511
1512         newsk->sk_gso_type = SKB_GSO_TCPV4;
1513         inet_sk_rx_dst_set(newsk, skb);
1514
1515         newtp                 = tcp_sk(newsk);
1516         newinet               = inet_sk(newsk);
1517         ireq                  = inet_rsk(req);
1518         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1519         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1520         newsk->sk_bound_dev_if = ireq->ir_iif;
1521         newinet->inet_saddr   = ireq->ir_loc_addr;
1522         inet_opt              = rcu_dereference(ireq->ireq_opt);
1523         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1524         newinet->mc_index     = inet_iif(skb);
1525         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1526         newinet->rcv_tos      = ip_hdr(skb)->tos;
1527         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1528         if (inet_opt)
1529                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1530         newinet->inet_id = prandom_u32();
1531
1532         if (!dst) {
1533                 dst = inet_csk_route_child_sock(sk, newsk, req);
1534                 if (!dst)
1535                         goto put_and_exit;
1536         } else {
1537                 /* syncookie case : see end of cookie_v4_check() */
1538         }
1539         sk_setup_caps(newsk, dst);
1540
1541         tcp_ca_openreq_child(newsk, dst);
1542
1543         tcp_sync_mss(newsk, dst_mtu(dst));
1544         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1545
1546         tcp_initialize_rcv_mss(newsk);
1547
1548 #ifdef CONFIG_TCP_MD5SIG
1549         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1550         /* Copy over the MD5 key from the original socket */
1551         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1552         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1553         if (key) {
1554                 /*
1555                  * We're using one, so create a matching key
1556                  * on the newsk structure. If we fail to get
1557                  * memory, then we end up not copying the key
1558                  * across. Shucks.
1559                  */
1560                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1561                                key->key, key->keylen, GFP_ATOMIC);
1562                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1563         }
1564 #endif
1565
1566         if (__inet_inherit_port(sk, newsk) < 0)
1567                 goto put_and_exit;
1568         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1569         if (likely(*own_req)) {
1570                 tcp_move_syn(newtp, req);
1571                 ireq->ireq_opt = NULL;
1572         } else {
1573                 newinet->inet_opt = NULL;
1574         }
1575         return newsk;
1576
1577 exit_overflow:
1578         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1579 exit_nonewsk:
1580         dst_release(dst);
1581 exit:
1582         tcp_listendrop(sk);
1583         return NULL;
1584 put_and_exit:
1585         newinet->inet_opt = NULL;
1586         inet_csk_prepare_forced_close(newsk);
1587         tcp_done(newsk);
1588         goto exit;
1589 }
1590 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1591
1592 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1593 {
1594 #ifdef CONFIG_SYN_COOKIES
1595         const struct tcphdr *th = tcp_hdr(skb);
1596
1597         if (!th->syn)
1598                 sk = cookie_v4_check(sk, skb);
1599 #endif
1600         return sk;
1601 }
1602
1603 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1604                          struct tcphdr *th, u32 *cookie)
1605 {
1606         u16 mss = 0;
1607 #ifdef CONFIG_SYN_COOKIES
1608         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1609                                     &tcp_request_sock_ipv4_ops, sk, th);
1610         if (mss) {
1611                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1612                 tcp_synq_overflow(sk);
1613         }
1614 #endif
1615         return mss;
1616 }
1617
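/* Note: with SYN cookies the listener can answer a SYN without allocating
 * a request_sock once its SYN backlog is full (subject to the
 * net.ipv4.tcp_syncookies sysctl): the initial sequence number itself
 * encodes the connection tuple and MSS.  The returning ACK is recognised
 * and decoded by cookie_v4_check(), reached via tcp_v4_cookie_check()
 * above, while tcp_v4_get_syncookie() computes the same kind of cookie
 * on behalf of callers such as the bpf_tcp_gen_syncookie() BPF helper.
 */
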
1618 /* The socket must have its spinlock held when we get
1619  * here, unless it is a TCP_LISTEN socket.
1620  *
1621  * We have a potential double-lock case here, so even when
1622  * doing backlog processing we use the BH locking scheme.
1623  * This is because we cannot sleep with the original spinlock
1624  * held.
1625  */
1626 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1627 {
1628         struct sock *rsk;
1629
1630         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1631                 struct dst_entry *dst = sk->sk_rx_dst;
1632
1633                 sock_rps_save_rxhash(sk, skb);
1634                 sk_mark_napi_id(sk, skb);
1635                 if (dst) {
1636                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1637                             !dst->ops->check(dst, 0)) {
1638                                 dst_release(dst);
1639                                 sk->sk_rx_dst = NULL;
1640                         }
1641                 }
1642                 tcp_rcv_established(sk, skb);
1643                 return 0;
1644         }
1645
1646         if (tcp_checksum_complete(skb))
1647                 goto csum_err;
1648
1649         if (sk->sk_state == TCP_LISTEN) {
1650                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1651
1652                 if (!nsk)
1653                         goto discard;
1654                 if (nsk != sk) {
1655                         if (tcp_child_process(sk, nsk, skb)) {
1656                                 rsk = nsk;
1657                                 goto reset;
1658                         }
1659                         return 0;
1660                 }
1661         } else
1662                 sock_rps_save_rxhash(sk, skb);
1663
1664         if (tcp_rcv_state_process(sk, skb)) {
1665                 rsk = sk;
1666                 goto reset;
1667         }
1668         return 0;
1669
1670 reset:
1671         tcp_v4_send_reset(rsk, skb);
1672 discard:
1673         kfree_skb(skb);
1674         /* Be careful here. If this function gets more complicated and
1675          * gcc suffers from register pressure on the x86, sk (in %ebx)
1676          * might be destroyed here. This current version compiles correctly,
1677          * but you have been warned.
1678          */
1679         return 0;
1680
1681 csum_err:
1682         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1683         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1684         goto discard;
1685 }
1686 EXPORT_SYMBOL(tcp_v4_do_rcv);
1687
1688 int tcp_v4_early_demux(struct sk_buff *skb)
1689 {
1690         const struct iphdr *iph;
1691         const struct tcphdr *th;
1692         struct sock *sk;
1693
1694         if (skb->pkt_type != PACKET_HOST)
1695                 return 0;
1696
1697         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1698                 return 0;
1699
1700         iph = ip_hdr(skb);
1701         th = tcp_hdr(skb);
1702
1703         if (th->doff < sizeof(struct tcphdr) / 4)
1704                 return 0;
1705
1706         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1707                                        iph->saddr, th->source,
1708                                        iph->daddr, ntohs(th->dest),
1709                                        skb->skb_iif, inet_sdif(skb));
1710         if (sk) {
1711                 skb->sk = sk;
1712                 skb->destructor = sock_edemux;
1713                 if (sk_fullsock(sk)) {
1714                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1715
1716                         if (dst)
1717                                 dst = dst_check(dst, 0);
1718                         if (dst &&
1719                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1720                                 skb_dst_set_noref(skb, dst);
1721                 }
1722         }
1723         return 0;
1724 }
1725
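/* Note: tcp_v4_early_demux() runs from the IP receive path before the
 * routing decision.  It only consults the established hash (never
 * listeners) and, on a hit, attaches the socket and its cached sk_rx_dst
 * to the skb so the per-packet route lookup can be skipped.  The shortcut
 * can be turned off with the net.ipv4.ip_early_demux and
 * net.ipv4.tcp_early_demux sysctls.
 */
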
1726 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1727 {
1728         u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1729         struct skb_shared_info *shinfo;
1730         const struct tcphdr *th;
1731         struct tcphdr *thtail;
1732         struct sk_buff *tail;
1733         unsigned int hdrlen;
1734         bool fragstolen;
1735         u32 gso_segs;
1736         int delta;
1737
1738         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1739          * we can fix skb->truesize to its real value to avoid future drops.
1740          * This is valid because skb is not yet charged to the socket.
1741          * It has been noticed that pure SACK packets were sometimes dropped
1742          * (if cooked by drivers without the copybreak feature).
1743          */
1744         skb_condense(skb);
1745
1746         skb_dst_drop(skb);
1747
1748         if (unlikely(tcp_checksum_complete(skb))) {
1749                 bh_unlock_sock(sk);
1750                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1751                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1752                 return true;
1753         }
1754
1755         /* Attempt coalescing to last skb in backlog, even if we are
1756          * above the limits.
1757          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1758          */
1759         th = (const struct tcphdr *)skb->data;
1760         hdrlen = th->doff * 4;
1761         shinfo = skb_shinfo(skb);
1762
1763         if (!shinfo->gso_size)
1764                 shinfo->gso_size = skb->len - hdrlen;
1765
1766         if (!shinfo->gso_segs)
1767                 shinfo->gso_segs = 1;
1768
1769         tail = sk->sk_backlog.tail;
1770         if (!tail)
1771                 goto no_coalesce;
1772         thtail = (struct tcphdr *)tail->data;
1773
1774         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1775             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1776             ((TCP_SKB_CB(tail)->tcp_flags |
1777               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1778             !((TCP_SKB_CB(tail)->tcp_flags &
1779               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1780             ((TCP_SKB_CB(tail)->tcp_flags ^
1781               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1782 #ifdef CONFIG_TLS_DEVICE
1783             tail->decrypted != skb->decrypted ||
1784 #endif
1785             thtail->doff != th->doff ||
1786             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1787                 goto no_coalesce;
1788
1789         __skb_pull(skb, hdrlen);
1790         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1791                 thtail->window = th->window;
1792
1793                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1794
1795                 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1796                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1797
1798                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1799                  * thtail->fin, so that the fast path in tcp_rcv_established()
1800                  * is not entered if we append a packet with a FIN.
1801                  * SYN, RST, URG are not present.
1802                  * ACK is set on both packets.
1803                  * PSH : we do not really care in TCP stack,
1804                  *       at least for 'GRO' packets.
1805                  */
1806                 thtail->fin |= th->fin;
1807                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1808
1809                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1810                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1811                         tail->tstamp = skb->tstamp;
1812                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1813                 }
1814
1815                 /* Not as strict as GRO. We only need to carry the max mss value */
1816                 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1817                                                  skb_shinfo(tail)->gso_size);
1818
1819                 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1820                 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1821
1822                 sk->sk_backlog.len += delta;
1823                 __NET_INC_STATS(sock_net(sk),
1824                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1825                 kfree_skb_partial(skb, fragstolen);
1826                 return false;
1827         }
1828         __skb_push(skb, hdrlen);
1829
1830 no_coalesce:
1831         /* Only the socket owner can try to collapse/prune rx queues
1832          * to reduce memory overhead, so add a little headroom here.
1833          * Only a few socket backlogs are likely to be non-empty at any time.
1834          */
1835         limit += 64*1024;
1836
1837         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1838                 bh_unlock_sock(sk);
1839                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1840                 return true;
1841         }
1842         return false;
1843 }
1844 EXPORT_SYMBOL(tcp_add_backlog);
1845
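/* Notes on tcp_add_backlog(): the caller holds the socket bottom-half lock
 * while the socket itself is owned by user space.  The function first
 * tries to coalesce the new segment with the tail of the backlog queue
 * (contiguous sequence numbers, identical IP DS field and TCP options,
 * ACK set and no SYN/RST/URG on either segment); otherwise the segment is
 * queued as long as the backlog stays below roughly
 * sk_rcvbuf + sk_sndbuf + 64KB.  A true return value means the skb was
 * not queued and the socket lock has already been released, so the caller
 * simply discards the packet.
 */
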
1846 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1847 {
1848         struct tcphdr *th = (struct tcphdr *)skb->data;
1849
1850         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1851 }
1852 EXPORT_SYMBOL(tcp_filter);
1853
1854 static void tcp_v4_restore_cb(struct sk_buff *skb)
1855 {
1856         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1857                 sizeof(struct inet_skb_parm));
1858 }
1859
1860 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1861                            const struct tcphdr *th)
1862 {
1863         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1864          * barrier() makes sure the compiler won't play fool^Waliasing games.
1865          */
1866         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1867                 sizeof(struct inet_skb_parm));
1868         barrier();
1869
1870         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1871         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1872                                     skb->len - th->doff * 4);
1873         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1874         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1875         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1876         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1877         TCP_SKB_CB(skb)->sacked  = 0;
1878         TCP_SKB_CB(skb)->has_rxtstamp =
1879                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1880 }
1881
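/* Note: SYN and FIN each occupy one unit of sequence space, which is why
 * end_seq above adds th->syn and th->fin on top of the payload length.
 * has_rxtstamp records whether the skb carries either a software or a
 * hardware receive timestamp.
 */
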
1882 /*
1883  *      From tcp_input.c
1884  */
1885
1886 int tcp_v4_rcv(struct sk_buff *skb)
1887 {
1888         struct net *net = dev_net(skb->dev);
1889         struct sk_buff *skb_to_free;
1890         int sdif = inet_sdif(skb);
1891         int dif = inet_iif(skb);
1892         const struct iphdr *iph;
1893         const struct tcphdr *th;
1894         bool refcounted;
1895         struct sock *sk;
1896         int ret;
1897
1898         if (skb->pkt_type != PACKET_HOST)
1899                 goto discard_it;
1900
1901         /* Count it even if it's bad */
1902         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1903
1904         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1905                 goto discard_it;
1906
1907         th = (const struct tcphdr *)skb->data;
1908
1909         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1910                 goto bad_packet;
1911         if (!pskb_may_pull(skb, th->doff * 4))
1912                 goto discard_it;
1913
1914         /* An explanation is required here, I think.
1915          * Packet length and doff are validated by header prediction,
1916          * provided the case of th->doff==0 is eliminated.
1917          * So, we defer the checks. */
1918
1919         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1920                 goto csum_error;
1921
1922         th = (const struct tcphdr *)skb->data;
1923         iph = ip_hdr(skb);
1924 lookup:
1925         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1926                                th->dest, sdif, &refcounted);
1927         if (!sk)
1928                 goto no_tcp_socket;
1929
1930 process:
1931         if (sk->sk_state == TCP_TIME_WAIT)
1932                 goto do_time_wait;
1933
1934         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1935                 struct request_sock *req = inet_reqsk(sk);
1936                 bool req_stolen = false;
1937                 struct sock *nsk;
1938
1939                 sk = req->rsk_listener;
1940                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1941                         sk_drops_add(sk, skb);
1942                         reqsk_put(req);
1943                         goto discard_it;
1944                 }
1945                 if (tcp_checksum_complete(skb)) {
1946                         reqsk_put(req);
1947                         goto csum_error;
1948                 }
1949                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1950                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1951                         goto lookup;
1952                 }
1953                 /* We own a reference on the listener, increase it again
1954                  * as we might lose it too soon.
1955                  */
1956                 sock_hold(sk);
1957                 refcounted = true;
1958                 nsk = NULL;
1959                 if (!tcp_filter(sk, skb)) {
1960                         th = (const struct tcphdr *)skb->data;
1961                         iph = ip_hdr(skb);
1962                         tcp_v4_fill_cb(skb, iph, th);
1963                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1964                 }
1965                 if (!nsk) {
1966                         reqsk_put(req);
1967                         if (req_stolen) {
1968                                 /* Another cpu got exclusive access to req
1969                                  * and created a full blown socket.
1970                                  * Try to feed this packet to this socket
1971                                  * instead of discarding it.
1972                                  */
1973                                 tcp_v4_restore_cb(skb);
1974                                 sock_put(sk);
1975                                 goto lookup;
1976                         }
1977                         goto discard_and_relse;
1978                 }
1979                 if (nsk == sk) {
1980                         reqsk_put(req);
1981                         tcp_v4_restore_cb(skb);
1982                 } else if (tcp_child_process(sk, nsk, skb)) {
1983                         tcp_v4_send_reset(nsk, skb);
1984                         goto discard_and_relse;
1985                 } else {
1986                         sock_put(sk);
1987                         return 0;
1988                 }
1989         }
1990         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1991                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1992                 goto discard_and_relse;
1993         }
1994
1995         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1996                 goto discard_and_relse;
1997
1998         if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
1999                 goto discard_and_relse;
2000
2001         nf_reset_ct(skb);
2002
2003         if (tcp_filter(sk, skb))
2004                 goto discard_and_relse;
2005         th = (const struct tcphdr *)skb->data;
2006         iph = ip_hdr(skb);
2007         tcp_v4_fill_cb(skb, iph, th);
2008
2009         skb->dev = NULL;
2010
2011         if (sk->sk_state == TCP_LISTEN) {
2012                 ret = tcp_v4_do_rcv(sk, skb);
2013                 goto put_and_return;
2014         }
2015
2016         sk_incoming_cpu_update(sk);
2017
2018         bh_lock_sock_nested(sk);
2019         tcp_segs_in(tcp_sk(sk), skb);
2020         ret = 0;
2021         if (!sock_owned_by_user(sk)) {
2022                 skb_to_free = sk->sk_rx_skb_cache;
2023                 sk->sk_rx_skb_cache = NULL;
2024                 ret = tcp_v4_do_rcv(sk, skb);
2025         } else {
2026                 if (tcp_add_backlog(sk, skb))
2027                         goto discard_and_relse;
2028                 skb_to_free = NULL;
2029         }
2030         bh_unlock_sock(sk);
2031         if (skb_to_free)
2032                 __kfree_skb(skb_to_free);
2033
2034 put_and_return:
2035         if (refcounted)
2036                 sock_put(sk);
2037
2038         return ret;
2039
2040 no_tcp_socket:
2041         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2042                 goto discard_it;
2043
2044         tcp_v4_fill_cb(skb, iph, th);
2045
2046         if (tcp_checksum_complete(skb)) {
2047 csum_error:
2048                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2049 bad_packet:
2050                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2051         } else {
2052                 tcp_v4_send_reset(NULL, skb);
2053         }
2054
2055 discard_it:
2056         /* Discard frame. */
2057         kfree_skb(skb);
2058         return 0;
2059
2060 discard_and_relse:
2061         sk_drops_add(sk, skb);
2062         if (refcounted)
2063                 sock_put(sk);
2064         goto discard_it;
2065
2066 do_time_wait:
2067         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2068                 inet_twsk_put(inet_twsk(sk));
2069                 goto discard_it;
2070         }
2071
2072         tcp_v4_fill_cb(skb, iph, th);
2073
2074         if (tcp_checksum_complete(skb)) {
2075                 inet_twsk_put(inet_twsk(sk));
2076                 goto csum_error;
2077         }
2078         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2079         case TCP_TW_SYN: {
2080                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2081                                                         &tcp_hashinfo, skb,
2082                                                         __tcp_hdrlen(th),
2083                                                         iph->saddr, th->source,
2084                                                         iph->daddr, th->dest,
2085                                                         inet_iif(skb),
2086                                                         sdif);
2087                 if (sk2) {
2088                         inet_twsk_deschedule_put(inet_twsk(sk));
2089                         sk = sk2;
2090                         tcp_v4_restore_cb(skb);
2091                         refcounted = false;
2092                         goto process;
2093                 }
2094         }
2095                 /* to ACK */
2096                 fallthrough;
2097         case TCP_TW_ACK:
2098                 tcp_v4_timewait_ack(sk, skb);
2099                 break;
2100         case TCP_TW_RST:
2101                 tcp_v4_send_reset(sk, skb);
2102                 inet_twsk_deschedule_put(inet_twsk(sk));
2103                 goto discard_it;
2104         case TCP_TW_SUCCESS:;
2105         }
2106         goto discard_it;
2107 }
2108
2109 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2110         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2111         .twsk_unique    = tcp_twsk_unique,
2112         .twsk_destructor= tcp_twsk_destructor,
2113 };
2114
2115 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2116 {
2117         struct dst_entry *dst = skb_dst(skb);
2118
2119         if (dst && dst_hold_safe(dst)) {
2120                 sk->sk_rx_dst = dst;
2121                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2122         }
2123 }
2124 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2125
2126 const struct inet_connection_sock_af_ops ipv4_specific = {
2127         .queue_xmit        = ip_queue_xmit,
2128         .send_check        = tcp_v4_send_check,
2129         .rebuild_header    = inet_sk_rebuild_header,
2130         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2131         .conn_request      = tcp_v4_conn_request,
2132         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2133         .net_header_len    = sizeof(struct iphdr),
2134         .setsockopt        = ip_setsockopt,
2135         .getsockopt        = ip_getsockopt,
2136         .addr2sockaddr     = inet_csk_addr2sockaddr,
2137         .sockaddr_len      = sizeof(struct sockaddr_in),
2138         .mtu_reduced       = tcp_v4_mtu_reduced,
2139 };
2140 EXPORT_SYMBOL(ipv4_specific);
2141
2142 #ifdef CONFIG_TCP_MD5SIG
2143 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2144         .md5_lookup             = tcp_v4_md5_lookup,
2145         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2146         .md5_parse              = tcp_v4_parse_md5_keys,
2147 };
2148 #endif
2149
2150 /* NOTE: A lot of things are set to zero explicitly by the call to
2151  *       sk_alloc(), so they need not be done here.
2152  */
2153 static int tcp_v4_init_sock(struct sock *sk)
2154 {
2155         struct inet_connection_sock *icsk = inet_csk(sk);
2156
2157         tcp_init_sock(sk);
2158
2159         icsk->icsk_af_ops = &ipv4_specific;
2160
2161 #ifdef CONFIG_TCP_MD5SIG
2162         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2163 #endif
2164
2165         return 0;
2166 }
2167
2168 void tcp_v4_destroy_sock(struct sock *sk)
2169 {
2170         struct tcp_sock *tp = tcp_sk(sk);
2171
2172         trace_tcp_destroy_sock(sk);
2173
2174         tcp_clear_xmit_timers(sk);
2175
2176         tcp_cleanup_congestion_control(sk);
2177
2178         tcp_cleanup_ulp(sk);
2179
2180         /* Clean up the write buffer. */
2181         tcp_write_queue_purge(sk);
2182
2183         /* Check if we want to disable active TFO */
2184         tcp_fastopen_active_disable_ofo_check(sk);
2185
2186         /* Cleans up our, hopefully empty, out_of_order_queue. */
2187         skb_rbtree_purge(&tp->out_of_order_queue);
2188
2189 #ifdef CONFIG_TCP_MD5SIG
2190         /* Clean up the MD5 key list, if any */
2191         if (tp->md5sig_info) {
2192                 tcp_clear_md5_list(sk);
2193                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2194                 tp->md5sig_info = NULL;
2195         }
2196 #endif
2197
2198         /* Clean up a referenced TCP bind bucket. */
2199         if (inet_csk(sk)->icsk_bind_hash)
2200                 inet_put_port(sk);
2201
2202         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2203
2204         /* If socket is aborted during connect operation */
2205         tcp_free_fastopen_req(tp);
2206         tcp_fastopen_destroy_cipher(sk);
2207         tcp_saved_syn_free(tp);
2208
2209         sk_sockets_allocated_dec(sk);
2210 }
2211 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2212
2213 #ifdef CONFIG_PROC_FS
2214 /* Proc filesystem TCP sock list dumping. */
2215
2216 /*
2217  * Get the next listener socket following cur.  If cur is NULL, get the first socket
2218  * starting from bucket given in st->bucket; when st->bucket is zero the
2219  * very first socket in the hash table is returned.
2220  */
2221 static void *listening_get_next(struct seq_file *seq, void *cur)
2222 {
2223         struct tcp_seq_afinfo *afinfo;
2224         struct tcp_iter_state *st = seq->private;
2225         struct net *net = seq_file_net(seq);
2226         struct inet_listen_hashbucket *ilb;
2227         struct hlist_nulls_node *node;
2228         struct sock *sk = cur;
2229
2230         if (st->bpf_seq_afinfo)
2231                 afinfo = st->bpf_seq_afinfo;
2232         else
2233                 afinfo = PDE_DATA(file_inode(seq->file));
2234
2235         if (!sk) {
2236 get_head:
2237                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2238                 spin_lock(&ilb->lock);
2239                 sk = sk_nulls_head(&ilb->nulls_head);
2240                 st->offset = 0;
2241                 goto get_sk;
2242         }
2243         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2244         ++st->num;
2245         ++st->offset;
2246
2247         sk = sk_nulls_next(sk);
2248 get_sk:
2249         sk_nulls_for_each_from(sk, node) {
2250                 if (!net_eq(sock_net(sk), net))
2251                         continue;
2252                 if (afinfo->family == AF_UNSPEC ||
2253                     sk->sk_family == afinfo->family)
2254                         return sk;
2255         }
2256         spin_unlock(&ilb->lock);
2257         st->offset = 0;
2258         if (++st->bucket < INET_LHTABLE_SIZE)
2259                 goto get_head;
2260         return NULL;
2261 }
2262
2263 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2264 {
2265         struct tcp_iter_state *st = seq->private;
2266         void *rc;
2267
2268         st->bucket = 0;
2269         st->offset = 0;
2270         rc = listening_get_next(seq, NULL);
2271
2272         while (rc && *pos) {
2273                 rc = listening_get_next(seq, rc);
2274                 --*pos;
2275         }
2276         return rc;
2277 }
2278
2279 static inline bool empty_bucket(const struct tcp_iter_state *st)
2280 {
2281         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2282 }
2283
2284 /*
2285  * Get first established socket starting from bucket given in st->bucket.
2286  * If st->bucket is zero, the very first socket in the hash is returned.
2287  */
2288 static void *established_get_first(struct seq_file *seq)
2289 {
2290         struct tcp_seq_afinfo *afinfo;
2291         struct tcp_iter_state *st = seq->private;
2292         struct net *net = seq_file_net(seq);
2293         void *rc = NULL;
2294
2295         if (st->bpf_seq_afinfo)
2296                 afinfo = st->bpf_seq_afinfo;
2297         else
2298                 afinfo = PDE_DATA(file_inode(seq->file));
2299
2300         st->offset = 0;
2301         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2302                 struct sock *sk;
2303                 struct hlist_nulls_node *node;
2304                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2305
2306                 /* Lockless fast path for the common case of empty buckets */
2307                 if (empty_bucket(st))
2308                         continue;
2309
2310                 spin_lock_bh(lock);
2311                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2312                         if ((afinfo->family != AF_UNSPEC &&
2313                              sk->sk_family != afinfo->family) ||
2314                             !net_eq(sock_net(sk), net)) {
2315                                 continue;
2316                         }
2317                         rc = sk;
2318                         goto out;
2319                 }
2320                 spin_unlock_bh(lock);
2321         }
2322 out:
2323         return rc;
2324 }
2325
2326 static void *established_get_next(struct seq_file *seq, void *cur)
2327 {
2328         struct tcp_seq_afinfo *afinfo;
2329         struct sock *sk = cur;
2330         struct hlist_nulls_node *node;
2331         struct tcp_iter_state *st = seq->private;
2332         struct net *net = seq_file_net(seq);
2333
2334         if (st->bpf_seq_afinfo)
2335                 afinfo = st->bpf_seq_afinfo;
2336         else
2337                 afinfo = PDE_DATA(file_inode(seq->file));
2338
2339         ++st->num;
2340         ++st->offset;
2341
2342         sk = sk_nulls_next(sk);
2343
2344         sk_nulls_for_each_from(sk, node) {
2345                 if ((afinfo->family == AF_UNSPEC ||
2346                      sk->sk_family == afinfo->family) &&
2347                     net_eq(sock_net(sk), net))
2348                         return sk;
2349         }
2350
2351         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2352         ++st->bucket;
2353         return established_get_first(seq);
2354 }
2355
2356 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2357 {
2358         struct tcp_iter_state *st = seq->private;
2359         void *rc;
2360
2361         st->bucket = 0;
2362         rc = established_get_first(seq);
2363
2364         while (rc && pos) {
2365                 rc = established_get_next(seq, rc);
2366                 --pos;
2367         }
2368         return rc;
2369 }
2370
2371 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2372 {
2373         void *rc;
2374         struct tcp_iter_state *st = seq->private;
2375
2376         st->state = TCP_SEQ_STATE_LISTENING;
2377         rc        = listening_get_idx(seq, &pos);
2378
2379         if (!rc) {
2380                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2381                 rc        = established_get_idx(seq, pos);
2382         }
2383
2384         return rc;
2385 }
2386
2387 static void *tcp_seek_last_pos(struct seq_file *seq)
2388 {
2389         struct tcp_iter_state *st = seq->private;
2390         int offset = st->offset;
2391         int orig_num = st->num;
2392         void *rc = NULL;
2393
2394         switch (st->state) {
2395         case TCP_SEQ_STATE_LISTENING:
2396                 if (st->bucket >= INET_LHTABLE_SIZE)
2397                         break;
2398                 st->state = TCP_SEQ_STATE_LISTENING;
2399                 rc = listening_get_next(seq, NULL);
2400                 while (offset-- && rc)
2401                         rc = listening_get_next(seq, rc);
2402                 if (rc)
2403                         break;
2404                 st->bucket = 0;
2405                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2406                 fallthrough;
2407         case TCP_SEQ_STATE_ESTABLISHED:
2408                 if (st->bucket > tcp_hashinfo.ehash_mask)
2409                         break;
2410                 rc = established_get_first(seq);
2411                 while (offset-- && rc)
2412                         rc = established_get_next(seq, rc);
2413         }
2414
2415         st->num = orig_num;
2416
2417         return rc;
2418 }
2419
2420 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2421 {
2422         struct tcp_iter_state *st = seq->private;
2423         void *rc;
2424
2425         if (*pos && *pos == st->last_pos) {
2426                 rc = tcp_seek_last_pos(seq);
2427                 if (rc)
2428                         goto out;
2429         }
2430
2431         st->state = TCP_SEQ_STATE_LISTENING;
2432         st->num = 0;
2433         st->bucket = 0;
2434         st->offset = 0;
2435         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2436
2437 out:
2438         st->last_pos = *pos;
2439         return rc;
2440 }
2441 EXPORT_SYMBOL(tcp_seq_start);
2442
2443 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2444 {
2445         struct tcp_iter_state *st = seq->private;
2446         void *rc = NULL;
2447
2448         if (v == SEQ_START_TOKEN) {
2449                 rc = tcp_get_idx(seq, 0);
2450                 goto out;
2451         }
2452
2453         switch (st->state) {
2454         case TCP_SEQ_STATE_LISTENING:
2455                 rc = listening_get_next(seq, v);
2456                 if (!rc) {
2457                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2458                         st->bucket = 0;
2459                         st->offset = 0;
2460                         rc        = established_get_first(seq);
2461                 }
2462                 break;
2463         case TCP_SEQ_STATE_ESTABLISHED:
2464                 rc = established_get_next(seq, v);
2465                 break;
2466         }
2467 out:
2468         ++*pos;
2469         st->last_pos = *pos;
2470         return rc;
2471 }
2472 EXPORT_SYMBOL(tcp_seq_next);
2473
2474 void tcp_seq_stop(struct seq_file *seq, void *v)
2475 {
2476         struct tcp_iter_state *st = seq->private;
2477
2478         switch (st->state) {
2479         case TCP_SEQ_STATE_LISTENING:
2480                 if (v != SEQ_START_TOKEN)
2481                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2482                 break;
2483         case TCP_SEQ_STATE_ESTABLISHED:
2484                 if (v)
2485                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2486                 break;
2487         }
2488 }
2489 EXPORT_SYMBOL(tcp_seq_stop);
2490
2491 static void get_openreq4(const struct request_sock *req,
2492                          struct seq_file *f, int i)
2493 {
2494         const struct inet_request_sock *ireq = inet_rsk(req);
2495         long delta = req->rsk_timer.expires - jiffies;
2496
2497         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2498                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2499                 i,
2500                 ireq->ir_loc_addr,
2501                 ireq->ir_num,
2502                 ireq->ir_rmt_addr,
2503                 ntohs(ireq->ir_rmt_port),
2504                 TCP_SYN_RECV,
2505                 0, 0, /* could print option size, but that is af dependent. */
2506                 1,    /* timers active (only the expire timer) */
2507                 jiffies_delta_to_clock_t(delta),
2508                 req->num_timeout,
2509                 from_kuid_munged(seq_user_ns(f),
2510                                  sock_i_uid(req->rsk_listener)),
2511                 0,  /* non standard timer */
2512                 0, /* open_requests have no inode */
2513                 0,
2514                 req);
2515 }
2516
2517 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2518 {
2519         int timer_active;
2520         unsigned long timer_expires;
2521         const struct tcp_sock *tp = tcp_sk(sk);
2522         const struct inet_connection_sock *icsk = inet_csk(sk);
2523         const struct inet_sock *inet = inet_sk(sk);
2524         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2525         __be32 dest = inet->inet_daddr;
2526         __be32 src = inet->inet_rcv_saddr;
2527         __u16 destp = ntohs(inet->inet_dport);
2528         __u16 srcp = ntohs(inet->inet_sport);
2529         int rx_queue;
2530         int state;
2531
2532         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2533             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2534             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2535                 timer_active    = 1;
2536                 timer_expires   = icsk->icsk_timeout;
2537         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2538                 timer_active    = 4;
2539                 timer_expires   = icsk->icsk_timeout;
2540         } else if (timer_pending(&sk->sk_timer)) {
2541                 timer_active    = 2;
2542                 timer_expires   = sk->sk_timer.expires;
2543         } else {
2544                 timer_active    = 0;
2545                 timer_expires = jiffies;
2546         }
2547
2548         state = inet_sk_state_load(sk);
2549         if (state == TCP_LISTEN)
2550                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2551         else
2552                 /* Because we don't lock the socket,
2553                  * we might find a transient negative value.
2554                  */
2555                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2556                                       READ_ONCE(tp->copied_seq), 0);
2557
2558         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2559                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2560                 i, src, srcp, dest, destp, state,
2561                 READ_ONCE(tp->write_seq) - tp->snd_una,
2562                 rx_queue,
2563                 timer_active,
2564                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2565                 icsk->icsk_retransmits,
2566                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2567                 icsk->icsk_probes_out,
2568                 sock_i_ino(sk),
2569                 refcount_read(&sk->sk_refcnt), sk,
2570                 jiffies_to_clock_t(icsk->icsk_rto),
2571                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2572                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2573                 tp->snd_cwnd,
2574                 state == TCP_LISTEN ?
2575                     fastopenq->max_qlen :
2576                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2577 }
2578
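/* For reference, one /proc/net/tcp line produced by get_tcp4_sock() looks
 * roughly like this (values are illustrative; a listener on 127.0.0.1:22):
 *
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 1 0000000000000000 100 0 0 10 0
 *
 * Addresses are the raw __be32 values printed in hex (so 127.0.0.1 shows
 * up as 0100007F on little-endian hosts), ports are in host order and
 * 0A is TCP_LISTEN.  The trailing fields are refcount, socket pointer
 * (subject to %pK restrictions), rto, ato, quick/pingpong, snd_cwnd and
 * the slow-start threshold (or max_qlen for listeners).
 */
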
2579 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2580                                struct seq_file *f, int i)
2581 {
2582         long delta = tw->tw_timer.expires - jiffies;
2583         __be32 dest, src;
2584         __u16 destp, srcp;
2585
2586         dest  = tw->tw_daddr;
2587         src   = tw->tw_rcv_saddr;
2588         destp = ntohs(tw->tw_dport);
2589         srcp  = ntohs(tw->tw_sport);
2590
2591         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2592                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2593                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2594                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2595                 refcount_read(&tw->tw_refcnt), tw);
2596 }
2597
2598 #define TMPSZ 150
2599
2600 static int tcp4_seq_show(struct seq_file *seq, void *v)
2601 {
2602         struct tcp_iter_state *st;
2603         struct sock *sk = v;
2604
2605         seq_setwidth(seq, TMPSZ - 1);
2606         if (v == SEQ_START_TOKEN) {
2607                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2608                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2609                            "inode");
2610                 goto out;
2611         }
2612         st = seq->private;
2613
2614         if (sk->sk_state == TCP_TIME_WAIT)
2615                 get_timewait4_sock(v, seq, st->num);
2616         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2617                 get_openreq4(v, seq, st->num);
2618         else
2619                 get_tcp4_sock(v, seq, st->num);
2620 out:
2621         seq_pad(seq, '\n');
2622         return 0;
2623 }
2624
2625 #ifdef CONFIG_BPF_SYSCALL
2626 struct bpf_iter__tcp {
2627         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2628         __bpf_md_ptr(struct sock_common *, sk_common);
2629         uid_t uid __aligned(8);
2630 };
2631
2632 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2633                              struct sock_common *sk_common, uid_t uid)
2634 {
2635         struct bpf_iter__tcp ctx;
2636
2637         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2638         ctx.meta = meta;
2639         ctx.sk_common = sk_common;
2640         ctx.uid = uid;
2641         return bpf_iter_run_prog(prog, &ctx);
2642 }
2643
2644 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2645 {
2646         struct bpf_iter_meta meta;
2647         struct bpf_prog *prog;
2648         struct sock *sk = v;
2649         uid_t uid;
2650
2651         if (v == SEQ_START_TOKEN)
2652                 return 0;
2653
2654         if (sk->sk_state == TCP_TIME_WAIT) {
2655                 uid = 0;
2656         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2657                 const struct request_sock *req = v;
2658
2659                 uid = from_kuid_munged(seq_user_ns(seq),
2660                                        sock_i_uid(req->rsk_listener));
2661         } else {
2662                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2663         }
2664
2665         meta.seq = seq;
2666         prog = bpf_iter_get_info(&meta, false);
2667         return tcp_prog_seq_show(prog, &meta, v, uid);
2668 }
2669
2670 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2671 {
2672         struct bpf_iter_meta meta;
2673         struct bpf_prog *prog;
2674
2675         if (!v) {
2676                 meta.seq = seq;
2677                 prog = bpf_iter_get_info(&meta, true);
2678                 if (prog)
2679                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
2680         }
2681
2682         tcp_seq_stop(seq, v);
2683 }
2684
2685 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2686         .show           = bpf_iter_tcp_seq_show,
2687         .start          = tcp_seq_start,
2688         .next           = tcp_seq_next,
2689         .stop           = bpf_iter_tcp_seq_stop,
2690 };
2691 #endif
2692
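/* A BPF iterator program can attach here with SEC("iter/tcp") and is
 * invoked once per socket with the bpf_iter__tcp context defined above.
 * Rough sketch of the BPF side (assumes libbpf's bpf_helpers.h and
 * bpf_tracing.h for SEC() and BPF_SEQ_PRINTF, plus a header such as the
 * selftests' bpf_iter.h or vmlinux.h for the context types; the fields
 * printed are just an example):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family %u lport %u uid %u\n",
 *			       skc->skc_family, skc->skc_num, ctx->uid);
 *		return 0;
 *	}
 *
 * Reading the pinned iterator link (e.g. bpftool iter pin, then cat) then
 * emits one line per TCP socket.
 */
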
2693 static const struct seq_operations tcp4_seq_ops = {
2694         .show           = tcp4_seq_show,
2695         .start          = tcp_seq_start,
2696         .next           = tcp_seq_next,
2697         .stop           = tcp_seq_stop,
2698 };
2699
2700 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2701         .family         = AF_INET,
2702 };
2703
2704 static int __net_init tcp4_proc_init_net(struct net *net)
2705 {
2706         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2707                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2708                 return -ENOMEM;
2709         return 0;
2710 }
2711
2712 static void __net_exit tcp4_proc_exit_net(struct net *net)
2713 {
2714         remove_proc_entry("tcp", net->proc_net);
2715 }
2716
2717 static struct pernet_operations tcp4_net_ops = {
2718         .init = tcp4_proc_init_net,
2719         .exit = tcp4_proc_exit_net,
2720 };
2721
2722 int __init tcp4_proc_init(void)
2723 {
2724         return register_pernet_subsys(&tcp4_net_ops);
2725 }
2726
2727 void tcp4_proc_exit(void)
2728 {
2729         unregister_pernet_subsys(&tcp4_net_ops);
2730 }
2731 #endif /* CONFIG_PROC_FS */
2732
2733 struct proto tcp_prot = {
2734         .name                   = "TCP",
2735         .owner                  = THIS_MODULE,
2736         .close                  = tcp_close,
2737         .pre_connect            = tcp_v4_pre_connect,
2738         .connect                = tcp_v4_connect,
2739         .disconnect             = tcp_disconnect,
2740         .accept                 = inet_csk_accept,
2741         .ioctl                  = tcp_ioctl,
2742         .init                   = tcp_v4_init_sock,
2743         .destroy                = tcp_v4_destroy_sock,
2744         .shutdown               = tcp_shutdown,
2745         .setsockopt             = tcp_setsockopt,
2746         .getsockopt             = tcp_getsockopt,
2747         .keepalive              = tcp_set_keepalive,
2748         .recvmsg                = tcp_recvmsg,
2749         .sendmsg                = tcp_sendmsg,
2750         .sendpage               = tcp_sendpage,
2751         .backlog_rcv            = tcp_v4_do_rcv,
2752         .release_cb             = tcp_release_cb,
2753         .hash                   = inet_hash,
2754         .unhash                 = inet_unhash,
2755         .get_port               = inet_csk_get_port,
2756         .enter_memory_pressure  = tcp_enter_memory_pressure,
2757         .leave_memory_pressure  = tcp_leave_memory_pressure,
2758         .stream_memory_free     = tcp_stream_memory_free,
2759         .sockets_allocated      = &tcp_sockets_allocated,
2760         .orphan_count           = &tcp_orphan_count,
2761         .memory_allocated       = &tcp_memory_allocated,
2762         .memory_pressure        = &tcp_memory_pressure,
2763         .sysctl_mem             = sysctl_tcp_mem,
2764         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2765         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2766         .max_header             = MAX_TCP_HEADER,
2767         .obj_size               = sizeof(struct tcp_sock),
2768         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2769         .twsk_prot              = &tcp_timewait_sock_ops,
2770         .rsk_prot               = &tcp_request_sock_ops,
2771         .h.hashinfo             = &tcp_hashinfo,
2772         .no_autobind            = true,
2773         .diag_destroy           = tcp_abort,
2774 };
2775 EXPORT_SYMBOL(tcp_prot);
2776
2777 static void __net_exit tcp_sk_exit(struct net *net)
2778 {
2779         int cpu;
2780
2781         if (net->ipv4.tcp_congestion_control)
2782                 bpf_module_put(net->ipv4.tcp_congestion_control,
2783                                net->ipv4.tcp_congestion_control->owner);
2784
2785         for_each_possible_cpu(cpu)
2786                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2787         free_percpu(net->ipv4.tcp_sk);
2788 }
2789
2790 static int __net_init tcp_sk_init(struct net *net)
2791 {
2792         int res, cpu, cnt;
2793
2794         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2795         if (!net->ipv4.tcp_sk)
2796                 return -ENOMEM;
2797
2798         for_each_possible_cpu(cpu) {
2799                 struct sock *sk;
2800
2801                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2802                                            IPPROTO_TCP, net);
2803                 if (res)
2804                         goto fail;
2805                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2806
2807                 /* Please enforce IP_DF and IPID==0 for RST and
2808                  * ACK sent in SYN-RECV and TIME-WAIT state.
2809                  */
2810                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2811
2812                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2813         }
2814
2815         net->ipv4.sysctl_tcp_ecn = 2;
2816         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2817
2818         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2819         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2820         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2821         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2822         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2823
2824         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2825         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2826         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2827
2828         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2829         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2830         net->ipv4.sysctl_tcp_syncookies = 1;
2831         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2832         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2833         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2834         net->ipv4.sysctl_tcp_orphan_retries = 0;
2835         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2836         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2837         net->ipv4.sysctl_tcp_tw_reuse = 2;
2838         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2839
        cnt = tcp_hashinfo.ehash_mask + 1;
        net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
        net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
        net->ipv4.sysctl_tcp_sack = 1;
        net->ipv4.sysctl_tcp_window_scaling = 1;
        net->ipv4.sysctl_tcp_timestamps = 1;
        net->ipv4.sysctl_tcp_early_retrans = 3;
        net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
        net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
        net->ipv4.sysctl_tcp_retrans_collapse = 1;
        net->ipv4.sysctl_tcp_max_reordering = 300;
        net->ipv4.sysctl_tcp_dsack = 1;
        net->ipv4.sysctl_tcp_app_win = 31;
        net->ipv4.sysctl_tcp_adv_win_scale = 1;
        net->ipv4.sysctl_tcp_frto = 2;
        net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
        /* This limits the percentage of the congestion window which we
         * will allow a single TSO frame to consume.  Building TSO frames
         * which are too large can cause TCP streams to be bursty.
         */
        net->ipv4.sysctl_tcp_tso_win_divisor = 3;
        /* Default TSQ limit of 16 TSO segments */
        net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
        /* rfc5961 challenge ack rate limiting */
        net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
        net->ipv4.sysctl_tcp_min_tso_segs = 2;
        net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
        net->ipv4.sysctl_tcp_autocorking = 1;
        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
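        /* A new namespace starts from init_net's current tcp_{r,w}mem, i.e.
         * the values derived from available memory at boot plus any later
         * tuning by the administrator.
         */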
        if (net != &init_net) {
                memcpy(net->ipv4.sysctl_tcp_rmem,
                       init_net.ipv4.sysctl_tcp_rmem,
                       sizeof(init_net.ipv4.sysctl_tcp_rmem));
                memcpy(net->ipv4.sysctl_tcp_wmem,
                       init_net.ipv4.sysctl_tcp_wmem,
                       sizeof(init_net.ipv4.sysctl_tcp_wmem));
        }
        net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
        net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
        net->ipv4.sysctl_tcp_comp_sack_nr = 44;
        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
        spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
        atomic_set(&net->ipv4.tfo_active_disable_times, 0);

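        /* Inherit the parent's (init_net's) congestion control module when a
         * reference on it can be taken; otherwise fall back to Reno below.
         */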
        /* Reno is always built in */
        if (!net_eq(net, &init_net) &&
            bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
                               init_net.ipv4.tcp_congestion_control->owner))
                net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
        else
                net->ipv4.tcp_congestion_control = &tcp_reno;

        return 0;
fail:
        tcp_sk_exit(net);

        return res;
}

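/* Batched netns teardown: purge IPv4 TIME-WAIT sockets once for the whole
 * batch, then drop each exiting namespace's TCP Fast Open context.
 */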
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
        struct net *net;

        inet_twsk_purge(&tcp_hashinfo, AF_INET);

        list_for_each_entry(net, net_exit_list, exit_list)
                tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init       = tcp_sk_init,
       .exit       = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};

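/* With BPF and procfs enabled, expose TCP sockets to bpf_iter programs.
 * The iterator reuses the /proc/net/tcp seq_file machinery; AF_UNSPEC in the
 * private afinfo means both IPv4 and IPv6 sockets are walked.
 */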
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
                     struct sock_common *sk_common, uid_t uid)

static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
        struct tcp_iter_state *st = priv_data;
        struct tcp_seq_afinfo *afinfo;
        int ret;

        afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
        if (!afinfo)
                return -ENOMEM;

        afinfo->family = AF_UNSPEC;
        st->bpf_seq_afinfo = afinfo;
        ret = bpf_iter_init_seq_net(priv_data, aux);
        if (ret)
                kfree(afinfo);
        return ret;
}

static void bpf_iter_fini_tcp(void *priv_data)
{
        struct tcp_iter_state *st = priv_data;

        kfree(st->bpf_seq_afinfo);
        bpf_iter_fini_seq_net(priv_data);
}

static const struct bpf_iter_seq_info tcp_seq_info = {
        .seq_ops                = &bpf_iter_tcp_seq_ops,
        .init_seq_private       = bpf_iter_init_tcp,
        .fini_seq_private       = bpf_iter_fini_tcp,
        .seq_priv_size          = sizeof(struct tcp_iter_state),
};

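/* Registration record for the "tcp" iterator target: the sk_common context
 * argument is a BTF-typed pointer that may be NULL (it is on the final
 * invocation that signals the end of iteration).
 */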
static struct bpf_iter_reg tcp_reg_info = {
        .target                 = "tcp",
        .ctx_arg_info_size      = 1,
        .ctx_arg_info           = {
                { offsetof(struct bpf_iter__tcp, sk_common),
                  PTR_TO_BTF_ID_OR_NULL },
        },
        .seq_info               = &tcp_seq_info,
};

static void __init bpf_iter_register(void)
{
        tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
        if (bpf_iter_reg_target(&tcp_reg_info))
                pr_warn("Warning: could not register bpf iterator tcp\n");
}

#endif

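/* Boot-time entry point, called from inet_init(): register the per-netns
 * init/exit hooks above and, when configured, the "tcp" bpf_iter target.
 */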
void __init tcp_v4_init(void)
{
        if (register_pernet_subsys(&tcp_sk_ops))
                panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
        bpf_iter_register();
#endif
}