2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
53 #define pr_fmt(fmt) "TCP: " fmt
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
66 #include <net/net_namespace.h>
68 #include <net/inet_hashtables.h>
70 #include <net/transp_v6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
88 #include <trace/events/tcp.h>
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 __be32 daddr, __be32 saddr, const struct tcphdr *th);
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
100 return secure_tcp_seq(ip_hdr(skb)->daddr,
103 tcp_hdr(skb)->source);
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
108 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
113 const struct inet_timewait_sock *tw = inet_twsk(sktw);
114 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115 struct tcp_sock *tp = tcp_sk(sk);
116 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
119 /* Still does not detect *everything* that goes through
120 * lo, since we require a loopback src or dst address
121 * or direct binding to 'lo' interface.
123 bool loopback = false;
124 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
126 #if IS_ENABLED(CONFIG_IPV6)
127 if (tw->tw_family == AF_INET6) {
128 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129 (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
130 (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
131 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132 (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
133 (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
138 if (ipv4_is_loopback(tw->tw_daddr) ||
139 ipv4_is_loopback(tw->tw_rcv_saddr))
146 /* With PAWS, it is safe from the viewpoint
147 of data integrity. Even without PAWS it is safe provided sequence
148 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
150 Actually, the idea is close to VJ's, except that the timestamp cache is
151 held not per host but per port pair, and the TW bucket is used as state
154 If the TW bucket has already been destroyed we fall back to VJ's scheme
155 and use initial timestamp retrieved from peer table.
157 if (tcptw->tw_ts_recent_stamp &&
158 (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
159 /* In case of repair and re-using TIME-WAIT sockets we still
160 * want to be sure that it is safe as above but honor the
161 * sequence numbers and time stamps set as part of the repair
164 * Without this check re-using a TIME-WAIT socket with TCP
165 * repair would accumulate a -1 on the repair assigned
166 * sequence number. The first time it is reused the sequence
167 * is -1, the second time -2, etc. This fixes that issue
168 * without appearing to create any others.
170 if (likely(!tp->repair)) {
171 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
172 if (tp->write_seq == 0)
174 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
175 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
183 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
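/*
 * Illustrative sketch (not part of this file): tcp_twsk_unique() is what
 * lets the net.ipv4.tcp_tw_reuse sysctl take effect, allowing a new
 * outgoing connection to take over a port pair still held by a TIME-WAIT
 * socket when the timestamp checks above make that safe.  A minimal
 * userspace snippet toggling the knob (needs privilege, assumes procfs is
 * mounted at /proc):
 *
 *	int fd = open("/proc/sys/net/ipv4/tcp_tw_reuse", O_WRONLY);
 *	if (fd >= 0) {
 *		write(fd, "1", 1);	// 0 = off, 1 = on, 2 = loopback only
 *		close(fd);
 *	}
 *
 * The per-netns default of 2 (set in tcp_sk_init() below) restricts reuse
 * to the loopback case that the detection above computes.
 */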
185 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
188 /* This check is replicated from tcp_v4_connect() and intended to
189 * prevent the BPF program called below from accessing bytes that are
190 * outside the bound specified by the user in addr_len.
192 if (addr_len < sizeof(struct sockaddr_in))
195 sock_owned_by_me(sk);
197 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
200 /* This will initiate an outgoing connection. */
201 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
203 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204 struct inet_sock *inet = inet_sk(sk);
205 struct tcp_sock *tp = tcp_sk(sk);
206 __be16 orig_sport, orig_dport;
207 __be32 daddr, nexthop;
211 struct ip_options_rcu *inet_opt;
212 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
214 if (addr_len < sizeof(struct sockaddr_in))
217 if (usin->sin_family != AF_INET)
218 return -EAFNOSUPPORT;
220 nexthop = daddr = usin->sin_addr.s_addr;
221 inet_opt = rcu_dereference_protected(inet->inet_opt,
222 lockdep_sock_is_held(sk));
223 if (inet_opt && inet_opt->opt.srr) {
226 nexthop = inet_opt->opt.faddr;
229 orig_sport = inet->inet_sport;
230 orig_dport = usin->sin_port;
231 fl4 = &inet->cork.fl.u.ip4;
232 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
233 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
235 orig_sport, orig_dport, sk);
238 if (err == -ENETUNREACH)
239 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
243 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
248 if (!inet_opt || !inet_opt->opt.srr)
251 if (!inet->inet_saddr)
252 inet->inet_saddr = fl4->saddr;
253 sk_rcv_saddr_set(sk, inet->inet_saddr);
255 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
256 /* Reset inherited state */
257 tp->rx_opt.ts_recent = 0;
258 tp->rx_opt.ts_recent_stamp = 0;
259 if (likely(!tp->repair))
263 inet->inet_dport = usin->sin_port;
264 sk_daddr_set(sk, daddr);
266 inet_csk(sk)->icsk_ext_hdr_len = 0;
268 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
270 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
272 /* Socket identity is still unknown (sport may be zero).
273 * However we set state to SYN-SENT and, without releasing the socket
274 * lock, select a source port, enter ourselves into the hash tables and
275 * complete initialization after this.
277 tcp_set_state(sk, TCP_SYN_SENT);
278 err = inet_hash_connect(tcp_death_row, sk);
284 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
285 inet->inet_sport, inet->inet_dport, sk);
291 /* OK, now commit destination to socket. */
292 sk->sk_gso_type = SKB_GSO_TCPV4;
293 sk_setup_caps(sk, &rt->dst);
296 if (likely(!tp->repair)) {
298 tp->write_seq = secure_tcp_seq(inet->inet_saddr,
302 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
307 inet->inet_id = tp->write_seq ^ jiffies;
309 if (tcp_fastopen_defer_connect(sk, &err))
314 err = tcp_connect(sk);
323 * This unhashes the socket and releases the local port,
326 tcp_set_state(sk, TCP_CLOSE);
328 sk->sk_route_caps = 0;
329 inet->inet_dport = 0;
332 EXPORT_SYMBOL(tcp_v4_connect);
335 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
336 * It can be called through tcp_release_cb() if socket was owned by user
337 * at the time tcp_v4_err() was called to handle ICMP message.
339 void tcp_v4_mtu_reduced(struct sock *sk)
341 struct inet_sock *inet = inet_sk(sk);
342 struct dst_entry *dst;
345 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
347 mtu = tcp_sk(sk)->mtu_info;
348 dst = inet_csk_update_pmtu(sk, mtu);
352 /* Something is about to go wrong... Remember the soft error
353 * in case this connection is not able to recover.
355 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
356 sk->sk_err_soft = EMSGSIZE;
360 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
361 ip_sk_accept_pmtu(sk) &&
362 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
363 tcp_sync_mss(sk, mtu);
365 /* Resend the TCP packet because it's
366 * clear that the old packet has been
367 * dropped. This is the new "fast" path mtu
370 tcp_simple_retransmit(sk);
371 } /* else let the usual retransmit timer handle it */
373 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
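/*
 * Illustrative sketch (not part of this file): whether the code above is
 * allowed to shrink the MSS depends on the per-socket PMTU discovery mode
 * (inet->pmtudisc, consulted via ip_sk_accept_pmtu()).  From userspace that
 * mode is selected with the standard IP_MTU_DISCOVER socket option, e.g.:
 *
 *	int val = IP_PMTUDISC_DO;	// set DF, rely on ICMP_FRAG_NEEDED
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 *
 * With IP_PMTUDISC_DONT the condition above is false and tcp_sync_mss() is
 * never called for the incoming MTU indication.
 */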
375 static void do_redirect(struct sk_buff *skb, struct sock *sk)
377 struct dst_entry *dst = __sk_dst_check(sk, 0);
380 dst->ops->redirect(dst, sk, skb);
384 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
385 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
387 struct request_sock *req = inet_reqsk(sk);
388 struct net *net = sock_net(sk);
390 /* ICMPs are not backlogged, hence we cannot get
391 * an established socket here.
393 if (seq != tcp_rsk(req)->snt_isn) {
394 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
397 * Still in SYN_RECV, just remove it silently.
398 * There is no good way to pass the error to the newly
399 * created socket, and POSIX does not want network
400 * errors returned from accept().
402 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
403 tcp_listendrop(req->rsk_listener);
407 EXPORT_SYMBOL(tcp_req_err);
410 * This routine is called by the ICMP module when it gets some
411 * sort of error condition. If err < 0 then the socket should
412 * be closed and the error returned to the user. If err > 0
413 * it's just the icmp type << 8 | icmp code. After adjustment
414 * header points to the first 8 bytes of the tcp header. We need
415 * to find the appropriate port.
417 * The locking strategy used here is very "optimistic". When
418 * someone else accesses the socket the ICMP is just dropped
419 * and for some paths there is no check at all.
420 * A more general error queue to queue errors for later handling
421 * is probably better.
425 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
427 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
428 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
429 struct inet_connection_sock *icsk;
431 struct inet_sock *inet;
432 const int type = icmp_hdr(icmp_skb)->type;
433 const int code = icmp_hdr(icmp_skb)->code;
436 struct request_sock *fastopen;
441 struct net *net = dev_net(icmp_skb->dev);
443 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
444 th->dest, iph->saddr, ntohs(th->source),
445 inet_iif(icmp_skb), 0);
447 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
450 if (sk->sk_state == TCP_TIME_WAIT) {
451 inet_twsk_put(inet_twsk(sk));
454 seq = ntohl(th->seq);
455 if (sk->sk_state == TCP_NEW_SYN_RECV)
456 return tcp_req_err(sk, seq,
457 type == ICMP_PARAMETERPROB ||
458 type == ICMP_TIME_EXCEEDED ||
459 (type == ICMP_DEST_UNREACH &&
460 (code == ICMP_NET_UNREACH ||
461 code == ICMP_HOST_UNREACH)));
464 /* If too many ICMPs get dropped on busy
465 * servers this needs to be solved differently.
466 * We do take care of PMTU discovery (RFC1191) special case :
467 * we can receive locally generated ICMP messages while socket is held.
469 if (sock_owned_by_user(sk)) {
470 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
471 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
473 if (sk->sk_state == TCP_CLOSE)
476 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
477 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
483 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
484 fastopen = tp->fastopen_rsk;
485 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
486 if (sk->sk_state != TCP_LISTEN &&
487 !between(seq, snd_una, tp->snd_nxt)) {
488 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
494 if (!sock_owned_by_user(sk))
495 do_redirect(icmp_skb, sk);
497 case ICMP_SOURCE_QUENCH:
498 /* Just silently ignore these. */
500 case ICMP_PARAMETERPROB:
503 case ICMP_DEST_UNREACH:
504 if (code > NR_ICMP_UNREACH)
507 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
508 /* We are not interested in TCP_LISTEN and open_requests
509 * (SYN-ACKs sent out by Linux are always < 576 bytes so
510 * they should go through unfragmented).
512 if (sk->sk_state == TCP_LISTEN)
516 if (!sock_owned_by_user(sk)) {
517 tcp_v4_mtu_reduced(sk);
519 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
525 err = icmp_err_convert[code].errno;
526 /* check if icmp_skb allows revert of backoff
527 * (see draft-zimmermann-tcp-lcd) */
528 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
530 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
531 !icsk->icsk_backoff || fastopen)
534 if (sock_owned_by_user(sk))
537 icsk->icsk_backoff--;
538 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
540 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
542 skb = tcp_rtx_queue_head(sk);
545 tcp_mstamp_refresh(tp);
546 delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
547 remaining = icsk->icsk_rto -
548 usecs_to_jiffies(delta_us);
551 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
552 remaining, TCP_RTO_MAX);
554 /* RTO revert clocked out retransmission.
555 * Will retransmit now */
556 tcp_retransmit_timer(sk);
560 case ICMP_TIME_EXCEEDED:
567 switch (sk->sk_state) {
570 /* Only in fast or simultaneous open. If a fast open socket is
571 * already accepted it is treated as a connected one below.
573 if (fastopen && !fastopen->sk)
576 if (!sock_owned_by_user(sk)) {
579 sk->sk_error_report(sk);
583 sk->sk_err_soft = err;
588 /* If we've already connected we will keep trying
589 * until we time out, or the user gives up.
591 * rfc1122 4.2.3.9 allows us to consider as hard errors
592 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
593 * but it is obsoleted by pmtu discovery).
595 * Note that in the modern internet, where routing is unreliable
596 * and broken firewalls sit in every dark corner, sending random
597 * errors as ordered by their masters, even these two messages finally lose
598 * their original sense (even Linux sends invalid PORT_UNREACHs)
600 * Now we are in compliance with RFCs.
605 if (!sock_owned_by_user(sk) && inet->recverr) {
607 sk->sk_error_report(sk);
608 } else { /* Only an error on timeout */
609 sk->sk_err_soft = err;
617 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
619 struct tcphdr *th = tcp_hdr(skb);
621 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
622 skb->csum_start = skb_transport_header(skb) - skb->head;
623 skb->csum_offset = offsetof(struct tcphdr, check);
626 /* This routine computes an IPv4 TCP checksum. */
627 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
629 const struct inet_sock *inet = inet_sk(sk);
631 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
633 EXPORT_SYMBOL(tcp_v4_send_check);
636 * This routine will send an RST to the other tcp.
638 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
640 * Answer: if a packet caused an RST, it is not for a socket
641 * existing in our system; if it is matched to a socket,
642 * it is just a duplicate segment or a bug in the other side's TCP.
643 * So we build the reply based only on the parameters that
644 * arrived with the segment.
645 * Exception: precedence violation. We do not implement it in any case.
648 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
650 const struct tcphdr *th = tcp_hdr(skb);
653 #ifdef CONFIG_TCP_MD5SIG
654 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
657 struct ip_reply_arg arg;
658 #ifdef CONFIG_TCP_MD5SIG
659 struct tcp_md5sig_key *key = NULL;
660 const __u8 *hash_location = NULL;
661 unsigned char newhash[16];
663 struct sock *sk1 = NULL;
668 /* Never send a reset in response to a reset. */
672 /* If sk is not NULL, it means we did a successful lookup and the incoming
673 * route had to be correct. prequeue might have dropped our dst.
675 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
678 /* Swap the send and the receive. */
679 memset(&rep, 0, sizeof(rep));
680 rep.th.dest = th->source;
681 rep.th.source = th->dest;
682 rep.th.doff = sizeof(struct tcphdr) / 4;
686 rep.th.seq = th->ack_seq;
689 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
690 skb->len - (th->doff << 2));
693 memset(&arg, 0, sizeof(arg));
694 arg.iov[0].iov_base = (unsigned char *)&rep;
695 arg.iov[0].iov_len = sizeof(rep.th);
697 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
698 #ifdef CONFIG_TCP_MD5SIG
700 hash_location = tcp_parse_md5sig_option(th);
701 if (sk && sk_fullsock(sk)) {
702 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
703 &ip_hdr(skb)->saddr, AF_INET);
704 } else if (hash_location) {
706 * active side is lost. Try to find the listening socket through the
707 * source port, and then find the md5 key through the listening socket.
708 * We do not lose security here:
709 * the incoming packet is checked with the md5 hash of the found key,
710 * and no RST is generated if the md5 hash doesn't match.
712 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
714 th->source, ip_hdr(skb)->daddr,
715 ntohs(th->source), inet_iif(skb),
717 /* don't send rst if it can't find key */
721 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
722 &ip_hdr(skb)->saddr, AF_INET);
727 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
728 if (genhash || memcmp(hash_location, newhash, 16) != 0)
734 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
736 (TCPOPT_MD5SIG << 8) |
738 /* Update length and the length the header thinks exists */
739 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
740 rep.th.doff = arg.iov[0].iov_len / 4;
742 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
743 key, ip_hdr(skb)->saddr,
744 ip_hdr(skb)->daddr, &rep.th);
747 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
748 ip_hdr(skb)->saddr, /* XXX */
749 arg.iov[0].iov_len, IPPROTO_TCP, 0);
750 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
751 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
753 /* When the socket is gone, all binding information is lost and
754 * routing might fail in this case. No choice here: if we choose to force the
755 * input interface, we will misroute in case of an asymmetric route.
758 arg.bound_dev_if = sk->sk_bound_dev_if;
760 trace_tcp_send_reset(sk, skb);
763 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
764 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
766 arg.tos = ip_hdr(skb)->tos;
767 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
769 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
771 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
772 inet_twsk(sk)->tw_mark : sk->sk_mark;
773 ip_send_unicast_reply(ctl_sk,
774 skb, &TCP_SKB_CB(skb)->header.h4.opt,
775 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
776 &arg, arg.iov[0].iov_len);
779 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
780 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
783 #ifdef CONFIG_TCP_MD5SIG
789 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
790 outside socket context, is certainly ugly. What can I do?
793 static void tcp_v4_send_ack(const struct sock *sk,
794 struct sk_buff *skb, u32 seq, u32 ack,
795 u32 win, u32 tsval, u32 tsecr, int oif,
796 struct tcp_md5sig_key *key,
797 int reply_flags, u8 tos)
799 const struct tcphdr *th = tcp_hdr(skb);
802 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
803 #ifdef CONFIG_TCP_MD5SIG
804 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
808 struct net *net = sock_net(sk);
809 struct ip_reply_arg arg;
812 memset(&rep.th, 0, sizeof(struct tcphdr));
813 memset(&arg, 0, sizeof(arg));
815 arg.iov[0].iov_base = (unsigned char *)&rep;
816 arg.iov[0].iov_len = sizeof(rep.th);
818 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
819 (TCPOPT_TIMESTAMP << 8) |
821 rep.opt[1] = htonl(tsval);
822 rep.opt[2] = htonl(tsecr);
823 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
826 /* Swap the send and the receive. */
827 rep.th.dest = th->source;
828 rep.th.source = th->dest;
829 rep.th.doff = arg.iov[0].iov_len / 4;
830 rep.th.seq = htonl(seq);
831 rep.th.ack_seq = htonl(ack);
833 rep.th.window = htons(win);
835 #ifdef CONFIG_TCP_MD5SIG
837 int offset = (tsecr) ? 3 : 0;
839 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
841 (TCPOPT_MD5SIG << 8) |
843 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
844 rep.th.doff = arg.iov[0].iov_len/4;
846 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
847 key, ip_hdr(skb)->saddr,
848 ip_hdr(skb)->daddr, &rep.th);
851 arg.flags = reply_flags;
852 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
853 ip_hdr(skb)->saddr, /* XXX */
854 arg.iov[0].iov_len, IPPROTO_TCP, 0);
855 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
857 arg.bound_dev_if = oif;
859 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
861 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
863 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
864 inet_twsk(sk)->tw_mark : sk->sk_mark;
865 ip_send_unicast_reply(ctl_sk,
866 skb, &TCP_SKB_CB(skb)->header.h4.opt,
867 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
868 &arg, arg.iov[0].iov_len);
871 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
875 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
877 struct inet_timewait_sock *tw = inet_twsk(sk);
878 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
880 tcp_v4_send_ack(sk, skb,
881 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
882 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
883 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
886 tcp_twsk_md5_key(tcptw),
887 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
894 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
895 struct request_sock *req)
897 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
898 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
900 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
904 * The window field (SEG.WND) of every outgoing segment, with the
905 * exception of <SYN> segments, MUST be right-shifted by
906 * Rcv.Wind.Shift bits:
908 tcp_v4_send_ack(sk, skb, seq,
909 tcp_rsk(req)->rcv_nxt,
910 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
911 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
914 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
916 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
921 * Send a SYN-ACK after having received a SYN.
922 * This still operates on a request_sock only, not on a big
925 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
927 struct request_sock *req,
928 struct tcp_fastopen_cookie *foc,
929 enum tcp_synack_type synack_type)
931 const struct inet_request_sock *ireq = inet_rsk(req);
936 /* First, grab a route. */
937 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
940 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
943 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
945 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
947 ireq_opt_deref(ireq));
948 err = net_xmit_eval(err);
955 * IPv4 request_sock destructor.
957 static void tcp_v4_reqsk_destructor(struct request_sock *req)
959 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
962 #ifdef CONFIG_TCP_MD5SIG
964 * RFC2385 MD5 checksumming requires a mapping of
965 * IP address->MD5 Key.
966 * We need to maintain these in the sk structure.
969 /* Find the Key structure for an address. */
970 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
971 const union tcp_md5_addr *addr,
974 const struct tcp_sock *tp = tcp_sk(sk);
975 struct tcp_md5sig_key *key;
976 const struct tcp_md5sig_info *md5sig;
978 struct tcp_md5sig_key *best_match = NULL;
981 /* caller either holds rcu_read_lock() or socket lock */
982 md5sig = rcu_dereference_check(tp->md5sig_info,
983 lockdep_sock_is_held(sk));
987 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
988 if (key->family != family)
991 if (family == AF_INET) {
992 mask = inet_make_mask(key->prefixlen);
993 match = (key->addr.a4.s_addr & mask) ==
994 (addr->a4.s_addr & mask);
995 #if IS_ENABLED(CONFIG_IPV6)
996 } else if (family == AF_INET6) {
997 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1004 if (match && (!best_match ||
1005 key->prefixlen > best_match->prefixlen))
1010 EXPORT_SYMBOL(tcp_md5_do_lookup);
1012 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1013 const union tcp_md5_addr *addr,
1014 int family, u8 prefixlen)
1016 const struct tcp_sock *tp = tcp_sk(sk);
1017 struct tcp_md5sig_key *key;
1018 unsigned int size = sizeof(struct in_addr);
1019 const struct tcp_md5sig_info *md5sig;
1021 /* caller either holds rcu_read_lock() or socket lock */
1022 md5sig = rcu_dereference_check(tp->md5sig_info,
1023 lockdep_sock_is_held(sk));
1026 #if IS_ENABLED(CONFIG_IPV6)
1027 if (family == AF_INET6)
1028 size = sizeof(struct in6_addr);
1030 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1031 if (key->family != family)
1033 if (!memcmp(&key->addr, addr, size) &&
1034 key->prefixlen == prefixlen)
1040 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1041 const struct sock *addr_sk)
1043 const union tcp_md5_addr *addr;
1045 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1046 return tcp_md5_do_lookup(sk, addr, AF_INET);
1048 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1050 /* This can be called on a newly created socket, from other files */
1051 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1052 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1055 /* Add Key to the list */
1056 struct tcp_md5sig_key *key;
1057 struct tcp_sock *tp = tcp_sk(sk);
1058 struct tcp_md5sig_info *md5sig;
1060 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1062 /* Pre-existing entry - just update that one. */
1063 memcpy(key->key, newkey, newkeylen);
1064 key->keylen = newkeylen;
1068 md5sig = rcu_dereference_protected(tp->md5sig_info,
1069 lockdep_sock_is_held(sk));
1071 md5sig = kmalloc(sizeof(*md5sig), gfp);
1075 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1076 INIT_HLIST_HEAD(&md5sig->head);
1077 rcu_assign_pointer(tp->md5sig_info, md5sig);
1080 key = sock_kmalloc(sk, sizeof(*key), gfp);
1083 if (!tcp_alloc_md5sig_pool()) {
1084 sock_kfree_s(sk, key, sizeof(*key));
1088 memcpy(key->key, newkey, newkeylen);
1089 key->keylen = newkeylen;
1090 key->family = family;
1091 key->prefixlen = prefixlen;
1092 memcpy(&key->addr, addr,
1093 (family == AF_INET6) ? sizeof(struct in6_addr) :
1094 sizeof(struct in_addr));
1095 hlist_add_head_rcu(&key->node, &md5sig->head);
1098 EXPORT_SYMBOL(tcp_md5_do_add);
1100 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1103 struct tcp_md5sig_key *key;
1105 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1108 hlist_del_rcu(&key->node);
1109 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1110 kfree_rcu(key, rcu);
1113 EXPORT_SYMBOL(tcp_md5_do_del);
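/*
 * Illustrative sketch (not part of this file): the add/del helpers above
 * are reached from tcp_v4_parse_md5_keys() below, i.e. from the TCP_MD5SIG
 * and TCP_MD5SIG_EXT socket options.  A minimal userspace sketch installing
 * an RFC 2385 key for a peer, assuming the usual <netinet/tcp.h> and
 * <arpa/inet.h> definitions (error handling omitted):
 *
 *	struct tcp_md5sig md5 = { 0 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	md5.tcpm_keylen = strlen("secret");
 *	memcpy(md5.tcpm_key, "secret", md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 deletes the key, which is the tcp_md5_do_del()
 * path above.
 */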
1115 static void tcp_clear_md5_list(struct sock *sk)
1117 struct tcp_sock *tp = tcp_sk(sk);
1118 struct tcp_md5sig_key *key;
1119 struct hlist_node *n;
1120 struct tcp_md5sig_info *md5sig;
1122 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1124 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1125 hlist_del_rcu(&key->node);
1126 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1127 kfree_rcu(key, rcu);
1131 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1132 char __user *optval, int optlen)
1134 struct tcp_md5sig cmd;
1135 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1138 if (optlen < sizeof(cmd))
1141 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1144 if (sin->sin_family != AF_INET)
1147 if (optname == TCP_MD5SIG_EXT &&
1148 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1149 prefixlen = cmd.tcpm_prefixlen;
1154 if (!cmd.tcpm_keylen)
1155 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1156 AF_INET, prefixlen);
1158 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1161 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1162 AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1166 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1167 __be32 daddr, __be32 saddr,
1168 const struct tcphdr *th, int nbytes)
1170 struct tcp4_pseudohdr *bp;
1171 struct scatterlist sg;
1178 bp->protocol = IPPROTO_TCP;
1179 bp->len = cpu_to_be16(nbytes);
1181 _th = (struct tcphdr *)(bp + 1);
1182 memcpy(_th, th, sizeof(*th));
1185 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1186 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1187 sizeof(*bp) + sizeof(*th));
1188 return crypto_ahash_update(hp->md5_req);
1191 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1192 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1194 struct tcp_md5sig_pool *hp;
1195 struct ahash_request *req;
1197 hp = tcp_get_md5sig_pool();
1199 goto clear_hash_noput;
1202 if (crypto_ahash_init(req))
1204 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1206 if (tcp_md5_hash_key(hp, key))
1208 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1209 if (crypto_ahash_final(req))
1212 tcp_put_md5sig_pool();
1216 tcp_put_md5sig_pool();
1218 memset(md5_hash, 0, 16);
1222 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1223 const struct sock *sk,
1224 const struct sk_buff *skb)
1226 struct tcp_md5sig_pool *hp;
1227 struct ahash_request *req;
1228 const struct tcphdr *th = tcp_hdr(skb);
1229 __be32 saddr, daddr;
1231 if (sk) { /* valid for establish/request sockets */
1232 saddr = sk->sk_rcv_saddr;
1233 daddr = sk->sk_daddr;
1235 const struct iphdr *iph = ip_hdr(skb);
1240 hp = tcp_get_md5sig_pool();
1242 goto clear_hash_noput;
1245 if (crypto_ahash_init(req))
1248 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1250 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1252 if (tcp_md5_hash_key(hp, key))
1254 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1255 if (crypto_ahash_final(req))
1258 tcp_put_md5sig_pool();
1262 tcp_put_md5sig_pool();
1264 memset(md5_hash, 0, 16);
1267 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1271 /* Called with rcu_read_lock() */
1272 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1273 const struct sk_buff *skb)
1275 #ifdef CONFIG_TCP_MD5SIG
1277 * This gets called for each TCP segment that arrives
1278 * so we want to be efficient.
1279 * We have 3 drop cases:
1280 * o No MD5 hash and one expected.
1281 * o MD5 hash and we're not expecting one.
1282 * o MD5 hash and it's wrong.
1284 const __u8 *hash_location = NULL;
1285 struct tcp_md5sig_key *hash_expected;
1286 const struct iphdr *iph = ip_hdr(skb);
1287 const struct tcphdr *th = tcp_hdr(skb);
1289 unsigned char newhash[16];
1291 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1293 hash_location = tcp_parse_md5sig_option(th);
1295 /* We've parsed the options - do we have a hash? */
1296 if (!hash_expected && !hash_location)
1299 if (hash_expected && !hash_location) {
1300 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1304 if (!hash_expected && hash_location) {
1305 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1309 /* Okay, so this is hash_expected and hash_location -
1310 * so we need to calculate the checksum.
1312 genhash = tcp_v4_md5_hash_skb(newhash,
1316 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1317 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1318 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1319 &iph->saddr, ntohs(th->source),
1320 &iph->daddr, ntohs(th->dest),
1321 genhash ? " tcp_v4_calc_md5_hash failed"
1330 static void tcp_v4_init_req(struct request_sock *req,
1331 const struct sock *sk_listener,
1332 struct sk_buff *skb)
1334 struct inet_request_sock *ireq = inet_rsk(req);
1335 struct net *net = sock_net(sk_listener);
1337 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1338 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1339 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1342 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1344 const struct request_sock *req)
1346 return inet_csk_route_req(sk, &fl->u.ip4, req);
1349 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1351 .obj_size = sizeof(struct tcp_request_sock),
1352 .rtx_syn_ack = tcp_rtx_synack,
1353 .send_ack = tcp_v4_reqsk_send_ack,
1354 .destructor = tcp_v4_reqsk_destructor,
1355 .send_reset = tcp_v4_send_reset,
1356 .syn_ack_timeout = tcp_syn_ack_timeout,
1359 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1360 .mss_clamp = TCP_MSS_DEFAULT,
1361 #ifdef CONFIG_TCP_MD5SIG
1362 .req_md5_lookup = tcp_v4_md5_lookup,
1363 .calc_md5_hash = tcp_v4_md5_hash_skb,
1365 .init_req = tcp_v4_init_req,
1366 #ifdef CONFIG_SYN_COOKIES
1367 .cookie_init_seq = cookie_v4_init_sequence,
1369 .route_req = tcp_v4_route_req,
1370 .init_seq = tcp_v4_init_seq,
1371 .init_ts_off = tcp_v4_init_ts_off,
1372 .send_synack = tcp_v4_send_synack,
1375 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1377 /* Never answer SYNs sent to broadcast or multicast */
1378 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1381 return tcp_conn_request(&tcp_request_sock_ops,
1382 &tcp_request_sock_ipv4_ops, sk, skb);
1388 EXPORT_SYMBOL(tcp_v4_conn_request);
1392 * The three way handshake has completed - we got a valid ACK -
1393 * now create the new socket.
1395 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1396 struct request_sock *req,
1397 struct dst_entry *dst,
1398 struct request_sock *req_unhash,
1401 struct inet_request_sock *ireq;
1402 struct inet_sock *newinet;
1403 struct tcp_sock *newtp;
1405 #ifdef CONFIG_TCP_MD5SIG
1406 struct tcp_md5sig_key *key;
1408 struct ip_options_rcu *inet_opt;
1410 if (sk_acceptq_is_full(sk))
1413 newsk = tcp_create_openreq_child(sk, req, skb);
1417 newsk->sk_gso_type = SKB_GSO_TCPV4;
1418 inet_sk_rx_dst_set(newsk, skb);
1420 newtp = tcp_sk(newsk);
1421 newinet = inet_sk(newsk);
1422 ireq = inet_rsk(req);
1423 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1424 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1425 newsk->sk_bound_dev_if = ireq->ir_iif;
1426 newinet->inet_saddr = ireq->ir_loc_addr;
1427 inet_opt = rcu_dereference(ireq->ireq_opt);
1428 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1429 newinet->mc_index = inet_iif(skb);
1430 newinet->mc_ttl = ip_hdr(skb)->ttl;
1431 newinet->rcv_tos = ip_hdr(skb)->tos;
1432 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1434 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1435 newinet->inet_id = newtp->write_seq ^ jiffies;
1438 dst = inet_csk_route_child_sock(sk, newsk, req);
1442 /* syncookie case : see end of cookie_v4_check() */
1444 sk_setup_caps(newsk, dst);
1446 tcp_ca_openreq_child(newsk, dst);
1448 tcp_sync_mss(newsk, dst_mtu(dst));
1449 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1451 tcp_initialize_rcv_mss(newsk);
1453 #ifdef CONFIG_TCP_MD5SIG
1454 /* Copy over the MD5 key from the original socket */
1455 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1459 * We're using one, so create a matching key
1460 * on the newsk structure. If we fail to get
1461 * memory, then we end up not copying the key
1464 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1465 AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1466 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1470 if (__inet_inherit_port(sk, newsk) < 0)
1472 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1473 if (likely(*own_req)) {
1474 tcp_move_syn(newtp, req);
1475 ireq->ireq_opt = NULL;
1477 newinet->inet_opt = NULL;
1482 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1489 newinet->inet_opt = NULL;
1490 inet_csk_prepare_forced_close(newsk);
1494 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1496 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1498 #ifdef CONFIG_SYN_COOKIES
1499 const struct tcphdr *th = tcp_hdr(skb);
1502 sk = cookie_v4_check(sk, skb);
1507 /* The socket must have its spinlock held when we get
1508 * here, unless it is a TCP_LISTEN socket.
1510 * We have a potential double-lock case here, so even when
1511 * doing backlog processing we use the BH locking scheme.
1512 * This is because we cannot sleep with the original spinlock
1515 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1519 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1520 struct dst_entry *dst = sk->sk_rx_dst;
1522 sock_rps_save_rxhash(sk, skb);
1523 sk_mark_napi_id(sk, skb);
1525 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1526 !dst->ops->check(dst, 0)) {
1528 sk->sk_rx_dst = NULL;
1531 tcp_rcv_established(sk, skb);
1535 if (tcp_checksum_complete(skb))
1538 if (sk->sk_state == TCP_LISTEN) {
1539 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1544 if (tcp_child_process(sk, nsk, skb)) {
1551 sock_rps_save_rxhash(sk, skb);
1553 if (tcp_rcv_state_process(sk, skb)) {
1560 tcp_v4_send_reset(rsk, skb);
1563 /* Be careful here. If this function gets more complicated and
1564 * gcc suffers from register pressure on the x86, sk (in %ebx)
1565 * might be destroyed here. This current version compiles correctly,
1566 * but you have been warned.
1571 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1572 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1575 EXPORT_SYMBOL(tcp_v4_do_rcv);
1577 int tcp_v4_early_demux(struct sk_buff *skb)
1579 const struct iphdr *iph;
1580 const struct tcphdr *th;
1583 if (skb->pkt_type != PACKET_HOST)
1586 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1592 if (th->doff < sizeof(struct tcphdr) / 4)
1595 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1596 iph->saddr, th->source,
1597 iph->daddr, ntohs(th->dest),
1598 skb->skb_iif, inet_sdif(skb));
1601 skb->destructor = sock_edemux;
1602 if (sk_fullsock(sk)) {
1603 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1606 dst = dst_check(dst, 0);
1608 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1609 skb_dst_set_noref(skb, dst);
1615 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1617 u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1619 /* Only the socket owner can try to collapse/prune rx queues
1620 * to reduce memory overhead, so add a little headroom here.
1621 * Few socket backlogs are likely to be non-empty concurrently.
1625 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1626 * we can fix skb->truesize to its real value to avoid future drops.
1627 * This is valid because skb is not yet charged to the socket.
1628 * It has been noticed pure SACK packets were sometimes dropped
1629 * (if cooked by drivers without copybreak feature).
1633 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1635 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1640 EXPORT_SYMBOL(tcp_add_backlog);
1642 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1644 struct tcphdr *th = (struct tcphdr *)skb->data;
1645 unsigned int eaten = skb->len;
1648 err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1651 TCP_SKB_CB(skb)->end_seq -= eaten;
1655 EXPORT_SYMBOL(tcp_filter);
1657 static void tcp_v4_restore_cb(struct sk_buff *skb)
1659 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1660 sizeof(struct inet_skb_parm));
1663 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1664 const struct tcphdr *th)
1666 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1667 * barrier() makes sure the compiler won't play fool^Waliasing games.
1669 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1670 sizeof(struct inet_skb_parm));
1673 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1674 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1675 skb->len - th->doff * 4);
1676 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1677 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1678 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1679 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1680 TCP_SKB_CB(skb)->sacked = 0;
1681 TCP_SKB_CB(skb)->has_rxtstamp =
1682 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1689 int tcp_v4_rcv(struct sk_buff *skb)
1691 struct net *net = dev_net(skb->dev);
1692 int sdif = inet_sdif(skb);
1693 const struct iphdr *iph;
1694 const struct tcphdr *th;
1699 if (skb->pkt_type != PACKET_HOST)
1702 /* Count it even if it's bad */
1703 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1705 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1708 th = (const struct tcphdr *)skb->data;
1710 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1712 if (!pskb_may_pull(skb, th->doff * 4))
1715 /* An explanation is required here, I think.
1716 * Packet length and doff are validated by header prediction,
1717 * provided the case of th->doff==0 is eliminated.
1718 * So, we defer the checks. */
1720 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1723 th = (const struct tcphdr *)skb->data;
1726 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1727 th->dest, sdif, &refcounted);
1732 if (sk->sk_state == TCP_TIME_WAIT)
1735 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1736 struct request_sock *req = inet_reqsk(sk);
1737 bool req_stolen = false;
1740 sk = req->rsk_listener;
1741 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1742 sk_drops_add(sk, skb);
1746 if (tcp_checksum_complete(skb)) {
1750 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1751 inet_csk_reqsk_queue_drop_and_put(sk, req);
1754 /* We own a reference on the listener, increase it again
1755 * as we might lose it too soon.
1760 if (!tcp_filter(sk, skb)) {
1761 th = (const struct tcphdr *)skb->data;
1763 tcp_v4_fill_cb(skb, iph, th);
1764 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1769 /* Another cpu got exclusive access to req
1770 * and created a full blown socket.
1771 * Try to feed this packet to this socket
1772 * instead of discarding it.
1774 tcp_v4_restore_cb(skb);
1778 goto discard_and_relse;
1782 tcp_v4_restore_cb(skb);
1783 } else if (tcp_child_process(sk, nsk, skb)) {
1784 tcp_v4_send_reset(nsk, skb);
1785 goto discard_and_relse;
1791 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1792 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1793 goto discard_and_relse;
1796 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1797 goto discard_and_relse;
1799 if (tcp_v4_inbound_md5_hash(sk, skb))
1800 goto discard_and_relse;
1804 if (tcp_filter(sk, skb))
1805 goto discard_and_relse;
1806 th = (const struct tcphdr *)skb->data;
1808 tcp_v4_fill_cb(skb, iph, th);
1812 if (sk->sk_state == TCP_LISTEN) {
1813 ret = tcp_v4_do_rcv(sk, skb);
1814 goto put_and_return;
1817 sk_incoming_cpu_update(sk);
1819 bh_lock_sock_nested(sk);
1820 tcp_segs_in(tcp_sk(sk), skb);
1822 if (!sock_owned_by_user(sk)) {
1823 ret = tcp_v4_do_rcv(sk, skb);
1824 } else if (tcp_add_backlog(sk, skb)) {
1825 goto discard_and_relse;
1836 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1839 tcp_v4_fill_cb(skb, iph, th);
1841 if (tcp_checksum_complete(skb)) {
1843 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1845 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1847 tcp_v4_send_reset(NULL, skb);
1851 /* Discard frame. */
1856 sk_drops_add(sk, skb);
1862 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1863 inet_twsk_put(inet_twsk(sk));
1867 tcp_v4_fill_cb(skb, iph, th);
1869 if (tcp_checksum_complete(skb)) {
1870 inet_twsk_put(inet_twsk(sk));
1873 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1875 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1878 iph->saddr, th->source,
1879 iph->daddr, th->dest,
1883 inet_twsk_deschedule_put(inet_twsk(sk));
1885 tcp_v4_restore_cb(skb);
1893 tcp_v4_timewait_ack(sk, skb);
1896 tcp_v4_send_reset(sk, skb);
1897 inet_twsk_deschedule_put(inet_twsk(sk));
1899 case TCP_TW_SUCCESS:;
1904 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1905 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1906 .twsk_unique = tcp_twsk_unique,
1907 .twsk_destructor= tcp_twsk_destructor,
1910 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1912 struct dst_entry *dst = skb_dst(skb);
1914 if (dst && dst_hold_safe(dst)) {
1915 sk->sk_rx_dst = dst;
1916 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1919 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1921 const struct inet_connection_sock_af_ops ipv4_specific = {
1922 .queue_xmit = ip_queue_xmit,
1923 .send_check = tcp_v4_send_check,
1924 .rebuild_header = inet_sk_rebuild_header,
1925 .sk_rx_dst_set = inet_sk_rx_dst_set,
1926 .conn_request = tcp_v4_conn_request,
1927 .syn_recv_sock = tcp_v4_syn_recv_sock,
1928 .net_header_len = sizeof(struct iphdr),
1929 .setsockopt = ip_setsockopt,
1930 .getsockopt = ip_getsockopt,
1931 .addr2sockaddr = inet_csk_addr2sockaddr,
1932 .sockaddr_len = sizeof(struct sockaddr_in),
1933 #ifdef CONFIG_COMPAT
1934 .compat_setsockopt = compat_ip_setsockopt,
1935 .compat_getsockopt = compat_ip_getsockopt,
1937 .mtu_reduced = tcp_v4_mtu_reduced,
1939 EXPORT_SYMBOL(ipv4_specific);
1941 #ifdef CONFIG_TCP_MD5SIG
1942 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1943 .md5_lookup = tcp_v4_md5_lookup,
1944 .calc_md5_hash = tcp_v4_md5_hash_skb,
1945 .md5_parse = tcp_v4_parse_md5_keys,
1949 /* NOTE: A lot of things are set to zero explicitly by the call to
1950 * sk_alloc(), so they need not be done here.
1952 static int tcp_v4_init_sock(struct sock *sk)
1954 struct inet_connection_sock *icsk = inet_csk(sk);
1958 icsk->icsk_af_ops = &ipv4_specific;
1960 #ifdef CONFIG_TCP_MD5SIG
1961 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1967 void tcp_v4_destroy_sock(struct sock *sk)
1969 struct tcp_sock *tp = tcp_sk(sk);
1971 trace_tcp_destroy_sock(sk);
1973 tcp_clear_xmit_timers(sk);
1975 tcp_cleanup_congestion_control(sk);
1977 tcp_cleanup_ulp(sk);
1979 /* Clean up the write buffer. */
1980 tcp_write_queue_purge(sk);
1982 /* Check if we want to disable active TFO */
1983 tcp_fastopen_active_disable_ofo_check(sk);
1985 /* Cleans up our, hopefully empty, out_of_order_queue. */
1986 skb_rbtree_purge(&tp->out_of_order_queue);
1988 #ifdef CONFIG_TCP_MD5SIG
1989 /* Clean up the MD5 key list, if any */
1990 if (tp->md5sig_info) {
1991 tcp_clear_md5_list(sk);
1992 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1993 tp->md5sig_info = NULL;
1997 /* Clean up a referenced TCP bind bucket. */
1998 if (inet_csk(sk)->icsk_bind_hash)
2001 BUG_ON(tp->fastopen_rsk);
2003 /* If socket is aborted during connect operation */
2004 tcp_free_fastopen_req(tp);
2005 tcp_fastopen_destroy_cipher(sk);
2006 tcp_saved_syn_free(tp);
2008 sk_sockets_allocated_dec(sk);
2010 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2012 #ifdef CONFIG_PROC_FS
2013 /* Proc filesystem TCP sock list dumping. */
2016 * Get the next listener socket following cur. If cur is NULL, get the first socket
2017 * starting from bucket given in st->bucket; when st->bucket is zero the
2018 * very first socket in the hash table is returned.
2020 static void *listening_get_next(struct seq_file *seq, void *cur)
2022 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2023 struct tcp_iter_state *st = seq->private;
2024 struct net *net = seq_file_net(seq);
2025 struct inet_listen_hashbucket *ilb;
2026 struct sock *sk = cur;
2030 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2031 spin_lock(&ilb->lock);
2032 sk = sk_head(&ilb->head);
2036 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2042 sk_for_each_from(sk) {
2043 if (!net_eq(sock_net(sk), net))
2045 if (sk->sk_family == afinfo->family)
2048 spin_unlock(&ilb->lock);
2050 if (++st->bucket < INET_LHTABLE_SIZE)
2055 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2057 struct tcp_iter_state *st = seq->private;
2062 rc = listening_get_next(seq, NULL);
2064 while (rc && *pos) {
2065 rc = listening_get_next(seq, rc);
2071 static inline bool empty_bucket(const struct tcp_iter_state *st)
2073 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2077 * Get first established socket starting from bucket given in st->bucket.
2078 * If st->bucket is zero, the very first socket in the hash is returned.
2080 static void *established_get_first(struct seq_file *seq)
2082 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2083 struct tcp_iter_state *st = seq->private;
2084 struct net *net = seq_file_net(seq);
2088 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2090 struct hlist_nulls_node *node;
2091 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2093 /* Lockless fast path for the common case of empty buckets */
2094 if (empty_bucket(st))
2098 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2099 if (sk->sk_family != afinfo->family ||
2100 !net_eq(sock_net(sk), net)) {
2106 spin_unlock_bh(lock);
2112 static void *established_get_next(struct seq_file *seq, void *cur)
2114 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2115 struct sock *sk = cur;
2116 struct hlist_nulls_node *node;
2117 struct tcp_iter_state *st = seq->private;
2118 struct net *net = seq_file_net(seq);
2123 sk = sk_nulls_next(sk);
2125 sk_nulls_for_each_from(sk, node) {
2126 if (sk->sk_family == afinfo->family &&
2127 net_eq(sock_net(sk), net))
2131 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2133 return established_get_first(seq);
2136 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2138 struct tcp_iter_state *st = seq->private;
2142 rc = established_get_first(seq);
2145 rc = established_get_next(seq, rc);
2151 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2154 struct tcp_iter_state *st = seq->private;
2156 st->state = TCP_SEQ_STATE_LISTENING;
2157 rc = listening_get_idx(seq, &pos);
2160 st->state = TCP_SEQ_STATE_ESTABLISHED;
2161 rc = established_get_idx(seq, pos);
2167 static void *tcp_seek_last_pos(struct seq_file *seq)
2169 struct tcp_iter_state *st = seq->private;
2170 int offset = st->offset;
2171 int orig_num = st->num;
2174 switch (st->state) {
2175 case TCP_SEQ_STATE_LISTENING:
2176 if (st->bucket >= INET_LHTABLE_SIZE)
2178 st->state = TCP_SEQ_STATE_LISTENING;
2179 rc = listening_get_next(seq, NULL);
2180 while (offset-- && rc)
2181 rc = listening_get_next(seq, rc);
2185 st->state = TCP_SEQ_STATE_ESTABLISHED;
2187 case TCP_SEQ_STATE_ESTABLISHED:
2188 if (st->bucket > tcp_hashinfo.ehash_mask)
2190 rc = established_get_first(seq);
2191 while (offset-- && rc)
2192 rc = established_get_next(seq, rc);
2200 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2202 struct tcp_iter_state *st = seq->private;
2205 if (*pos && *pos == st->last_pos) {
2206 rc = tcp_seek_last_pos(seq);
2211 st->state = TCP_SEQ_STATE_LISTENING;
2215 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2218 st->last_pos = *pos;
2221 EXPORT_SYMBOL(tcp_seq_start);
2223 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2225 struct tcp_iter_state *st = seq->private;
2228 if (v == SEQ_START_TOKEN) {
2229 rc = tcp_get_idx(seq, 0);
2233 switch (st->state) {
2234 case TCP_SEQ_STATE_LISTENING:
2235 rc = listening_get_next(seq, v);
2237 st->state = TCP_SEQ_STATE_ESTABLISHED;
2240 rc = established_get_first(seq);
2243 case TCP_SEQ_STATE_ESTABLISHED:
2244 rc = established_get_next(seq, v);
2249 st->last_pos = *pos;
2252 EXPORT_SYMBOL(tcp_seq_next);
2254 void tcp_seq_stop(struct seq_file *seq, void *v)
2256 struct tcp_iter_state *st = seq->private;
2258 switch (st->state) {
2259 case TCP_SEQ_STATE_LISTENING:
2260 if (v != SEQ_START_TOKEN)
2261 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2263 case TCP_SEQ_STATE_ESTABLISHED:
2265 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2269 EXPORT_SYMBOL(tcp_seq_stop);
2271 static void get_openreq4(const struct request_sock *req,
2272 struct seq_file *f, int i)
2274 const struct inet_request_sock *ireq = inet_rsk(req);
2275 long delta = req->rsk_timer.expires - jiffies;
2277 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2278 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2283 ntohs(ireq->ir_rmt_port),
2285 0, 0, /* could print option size, but that is af dependent. */
2286 1, /* timers active (only the expire timer) */
2287 jiffies_delta_to_clock_t(delta),
2289 from_kuid_munged(seq_user_ns(f),
2290 sock_i_uid(req->rsk_listener)),
2291 0, /* non standard timer */
2292 0, /* open_requests have no inode */
2297 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2300 unsigned long timer_expires;
2301 const struct tcp_sock *tp = tcp_sk(sk);
2302 const struct inet_connection_sock *icsk = inet_csk(sk);
2303 const struct inet_sock *inet = inet_sk(sk);
2304 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2305 __be32 dest = inet->inet_daddr;
2306 __be32 src = inet->inet_rcv_saddr;
2307 __u16 destp = ntohs(inet->inet_dport);
2308 __u16 srcp = ntohs(inet->inet_sport);
2312 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2313 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2314 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2316 timer_expires = icsk->icsk_timeout;
2317 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2319 timer_expires = icsk->icsk_timeout;
2320 } else if (timer_pending(&sk->sk_timer)) {
2322 timer_expires = sk->sk_timer.expires;
2325 timer_expires = jiffies;
2328 state = inet_sk_state_load(sk);
2329 if (state == TCP_LISTEN)
2330 rx_queue = sk->sk_ack_backlog;
2332 /* Because we don't lock the socket,
2333 * we might find a transient negative value.
2335 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2337 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2338 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2339 i, src, srcp, dest, destp, state,
2340 tp->write_seq - tp->snd_una,
2343 jiffies_delta_to_clock_t(timer_expires - jiffies),
2344 icsk->icsk_retransmits,
2345 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2346 icsk->icsk_probes_out,
2348 refcount_read(&sk->sk_refcnt), sk,
2349 jiffies_to_clock_t(icsk->icsk_rto),
2350 jiffies_to_clock_t(icsk->icsk_ack.ato),
2351 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2353 state == TCP_LISTEN ?
2354 fastopenq->max_qlen :
2355 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2358 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2359 struct seq_file *f, int i)
2361 long delta = tw->tw_timer.expires - jiffies;
2365 dest = tw->tw_daddr;
2366 src = tw->tw_rcv_saddr;
2367 destp = ntohs(tw->tw_dport);
2368 srcp = ntohs(tw->tw_sport);
2370 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2371 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2372 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2373 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2374 refcount_read(&tw->tw_refcnt), tw);
2379 static int tcp4_seq_show(struct seq_file *seq, void *v)
2381 struct tcp_iter_state *st;
2382 struct sock *sk = v;
2384 seq_setwidth(seq, TMPSZ - 1);
2385 if (v == SEQ_START_TOKEN) {
2386 seq_puts(seq, " sl local_address rem_address st tx_queue "
2387 "rx_queue tr tm->when retrnsmt uid timeout "
2393 if (sk->sk_state == TCP_TIME_WAIT)
2394 get_timewait4_sock(v, seq, st->num);
2395 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2396 get_openreq4(v, seq, st->num);
2398 get_tcp4_sock(v, seq, st->num);
2404 static const struct seq_operations tcp4_seq_ops = {
2405 .show = tcp4_seq_show,
2406 .start = tcp_seq_start,
2407 .next = tcp_seq_next,
2408 .stop = tcp_seq_stop,
2411 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2415 static int __net_init tcp4_proc_init_net(struct net *net)
2417 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2418 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2423 static void __net_exit tcp4_proc_exit_net(struct net *net)
2425 remove_proc_entry("tcp", net->proc_net);
2428 static struct pernet_operations tcp4_net_ops = {
2429 .init = tcp4_proc_init_net,
2430 .exit = tcp4_proc_exit_net,
2433 int __init tcp4_proc_init(void)
2435 return register_pernet_subsys(&tcp4_net_ops);
2438 void tcp4_proc_exit(void)
2440 unregister_pernet_subsys(&tcp4_net_ops);
2442 #endif /* CONFIG_PROC_FS */
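/*
 * Illustrative sketch (not part of this file): the seq_file code above is
 * what produces /proc/net/tcp.  A minimal userspace reader that dumps the
 * table, starting with the header line emitted by tcp4_seq_show() above:
 *
 *	FILE *f = fopen("/proc/net/tcp", "r");
 *	char line[512];
 *
 *	if (f) {
 *		while (fgets(line, sizeof(line), f))
 *			fputs(line, stdout);	// "sl local_address rem_address st ..."
 *		fclose(f);
 *	}
 *
 * Addresses and ports are printed in hex, per the format strings in
 * get_tcp4_sock() above; tools such as ss(8) obtain the same information
 * via netlink instead of this file.
 */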
2444 struct proto tcp_prot = {
2446 .owner = THIS_MODULE,
2448 .pre_connect = tcp_v4_pre_connect,
2449 .connect = tcp_v4_connect,
2450 .disconnect = tcp_disconnect,
2451 .accept = inet_csk_accept,
2453 .init = tcp_v4_init_sock,
2454 .destroy = tcp_v4_destroy_sock,
2455 .shutdown = tcp_shutdown,
2456 .setsockopt = tcp_setsockopt,
2457 .getsockopt = tcp_getsockopt,
2458 .keepalive = tcp_set_keepalive,
2459 .recvmsg = tcp_recvmsg,
2460 .sendmsg = tcp_sendmsg,
2461 .sendpage = tcp_sendpage,
2462 .backlog_rcv = tcp_v4_do_rcv,
2463 .release_cb = tcp_release_cb,
2465 .unhash = inet_unhash,
2466 .get_port = inet_csk_get_port,
2467 .enter_memory_pressure = tcp_enter_memory_pressure,
2468 .leave_memory_pressure = tcp_leave_memory_pressure,
2469 .stream_memory_free = tcp_stream_memory_free,
2470 .sockets_allocated = &tcp_sockets_allocated,
2471 .orphan_count = &tcp_orphan_count,
2472 .memory_allocated = &tcp_memory_allocated,
2473 .memory_pressure = &tcp_memory_pressure,
2474 .sysctl_mem = sysctl_tcp_mem,
2475 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2476 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2477 .max_header = MAX_TCP_HEADER,
2478 .obj_size = sizeof(struct tcp_sock),
2479 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2480 .twsk_prot = &tcp_timewait_sock_ops,
2481 .rsk_prot = &tcp_request_sock_ops,
2482 .h.hashinfo = &tcp_hashinfo,
2483 .no_autobind = true,
2484 #ifdef CONFIG_COMPAT
2485 .compat_setsockopt = compat_tcp_setsockopt,
2486 .compat_getsockopt = compat_tcp_getsockopt,
2488 .diag_destroy = tcp_abort,
2490 EXPORT_SYMBOL(tcp_prot);
2492 static void __net_exit tcp_sk_exit(struct net *net)
2496 module_put(net->ipv4.tcp_congestion_control->owner);
2498 for_each_possible_cpu(cpu)
2499 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2500 free_percpu(net->ipv4.tcp_sk);
2503 static int __net_init tcp_sk_init(struct net *net)
2507 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2508 if (!net->ipv4.tcp_sk)
2511 for_each_possible_cpu(cpu) {
2514 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2518 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2519 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2522 net->ipv4.sysctl_tcp_ecn = 2;
2523 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2525 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2526 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2527 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2529 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2530 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2531 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2533 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2534 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2535 net->ipv4.sysctl_tcp_syncookies = 1;
2536 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2537 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2538 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2539 net->ipv4.sysctl_tcp_orphan_retries = 0;
2540 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2541 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2542 net->ipv4.sysctl_tcp_tw_reuse = 2;
2544 cnt = tcp_hashinfo.ehash_mask + 1;
2545 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2546 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2548 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2549 net->ipv4.sysctl_tcp_sack = 1;
2550 net->ipv4.sysctl_tcp_window_scaling = 1;
2551 net->ipv4.sysctl_tcp_timestamps = 1;
2552 net->ipv4.sysctl_tcp_early_retrans = 3;
2553 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2554 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2555 net->ipv4.sysctl_tcp_retrans_collapse = 1;
2556 net->ipv4.sysctl_tcp_max_reordering = 300;
2557 net->ipv4.sysctl_tcp_dsack = 1;
2558 net->ipv4.sysctl_tcp_app_win = 31;
2559 net->ipv4.sysctl_tcp_adv_win_scale = 1;
2560 net->ipv4.sysctl_tcp_frto = 2;
2561 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2562 /* This limits the percentage of the congestion window which we
2563 * will allow a single TSO frame to consume. Building TSO frames
2564 * which are too large can cause TCP streams to be bursty.
2566 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2567 /* Default TSQ limit of four TSO segments */
2568 net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2569 /* rfc5961 challenge ack rate limiting */
2570 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2571 net->ipv4.sysctl_tcp_min_tso_segs = 2;
2572 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2573 net->ipv4.sysctl_tcp_autocorking = 1;
2574 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2575 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2576 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2577 if (net != &init_net) {
2578 memcpy(net->ipv4.sysctl_tcp_rmem,
2579 init_net.ipv4.sysctl_tcp_rmem,
2580 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2581 memcpy(net->ipv4.sysctl_tcp_wmem,
2582 init_net.ipv4.sysctl_tcp_wmem,
2583 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2585 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2586 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2587 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2588 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2589 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2590 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2592 /* Reno is always built in */
2593 if (!net_eq(net, &init_net) &&
2594 try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2595 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2597 net->ipv4.tcp_congestion_control = &tcp_reno;
2606 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2610 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2612 list_for_each_entry(net, net_exit_list, exit_list)
2613 tcp_fastopen_ctx_destroy(net);
2616 static struct pernet_operations __net_initdata tcp_sk_ops = {
2617 .init = tcp_sk_init,
2618 .exit = tcp_sk_exit,
2619 .exit_batch = tcp_sk_exit_batch,
2622 void __init tcp_v4_init(void)
2624 if (register_pernet_subsys(&tcp_sk_ops))
2625 panic("Failed to create the TCP control socket.\n");