net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87
88 #include <trace/events/tcp.h>
89
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100         return secure_tcp_seq(ip_hdr(skb)->daddr,
101                               ip_hdr(skb)->saddr,
102                               tcp_hdr(skb)->dest,
103                               tcp_hdr(skb)->source);
104 }
105
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
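
/*
 * A minimal user-space sketch of the RFC 6528 shape behind secure_tcp_seq():
 * the ISN is a keyed hash of the connection 4-tuple plus a fine-grained
 * clock. keyed_hash_stub() is a stand-in invented for this sketch; the
 * kernel uses siphash with a boot-time secret.
 */
#include <stdint.h>
#include <time.h>

static uint32_t keyed_hash_stub(uint32_t saddr, uint32_t daddr,
                                uint16_t sport, uint16_t dport)
{
        /* Placeholder mixer only; not the kernel's hash. */
        uint32_t h = saddr * 2654435761u ^ daddr * 2246822519u ^
                     (((uint32_t)sport << 16) | dport) * 3266489917u;

        return h ^ (h >> 16);
}

static uint32_t isn_sketch(uint32_t saddr, uint32_t daddr,
                           uint16_t sport, uint16_t dport)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        /* RFC 6528: ISN = M + F(4-tuple, secret), where M is a clock; the
         * kernel's clock ticks roughly every 64 ns. */
        return keyed_hash_stub(saddr, daddr, sport, dport) +
               (uint32_t)((((uint64_t)ts.tv_sec * 1000000000ull) + ts.tv_nsec) >> 6);
}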
110
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114         struct tcp_sock *tp = tcp_sk(sk);
115
116         /* With PAWS, it is safe from the viewpoint
117            of data integrity. Even without PAWS it is safe provided sequence
118            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
119
120            Actually, the idea is close to VJ's one, only timestamp cache is
121            held not per host, but per port pair and TW bucket is used as state
122            holder.
123
124            If the TW bucket has already been destroyed we fall back to VJ's scheme
125            and use initial timestamp retrieved from peer table.
126          */
127         if (tcptw->tw_ts_recent_stamp &&
128             (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
129                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131                 if (tp->write_seq == 0)
132                         tp->write_seq = 1;
133                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
134                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135                 sock_hold(sktw);
136                 return 1;
137         }
138
139         return 0;
140 }
141 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
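
/*
 * A minimal user-space sketch of the reuse test in tcp_twsk_unique() above:
 * a TIME-WAIT port pair may be taken over when a timestamp was cached and
 * sysctl_tcp_tw_reuse is on. The struct and names are local to the sketch;
 * now_secs stands in for get_seconds().
 */
#include <stdbool.h>
#include <stdint.h>

struct tw_sketch {
        uint32_t ts_recent_stamp;       /* when the last timestamp was seen */
        uint32_t snd_nxt;               /* next seq the old connection would use */
};

static bool tw_reuse_ok(const struct tw_sketch *tw, uint32_t now_secs,
                        bool tw_reuse_sysctl)
{
        /* Mirrors: tw_ts_recent_stamp && (tw_reuse && now - stamp > 1) */
        return tw->ts_recent_stamp &&
               tw_reuse_sysctl && now_secs - tw->ts_recent_stamp > 1;
}

static uint32_t tw_reuse_isn(const struct tw_sketch *tw)
{
        /* The new write_seq jumps past anything the old connection sent;
         * 0 means "pick a fresh ISN", hence the fixup to 1. */
        uint32_t isn = tw->snd_nxt + 65535 + 2;

        return isn ? isn : 1;
}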
142
143 /* This will initiate an outgoing connection. */
144 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
145 {
146         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
147         struct inet_sock *inet = inet_sk(sk);
148         struct tcp_sock *tp = tcp_sk(sk);
149         __be16 orig_sport, orig_dport;
150         __be32 daddr, nexthop;
151         struct flowi4 *fl4;
152         struct rtable *rt;
153         int err;
154         struct ip_options_rcu *inet_opt;
155         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
156
157         if (addr_len < sizeof(struct sockaddr_in))
158                 return -EINVAL;
159
160         if (usin->sin_family != AF_INET)
161                 return -EAFNOSUPPORT;
162
163         nexthop = daddr = usin->sin_addr.s_addr;
164         inet_opt = rcu_dereference_protected(inet->inet_opt,
165                                              lockdep_sock_is_held(sk));
166         if (inet_opt && inet_opt->opt.srr) {
167                 if (!daddr)
168                         return -EINVAL;
169                 nexthop = inet_opt->opt.faddr;
170         }
171
172         orig_sport = inet->inet_sport;
173         orig_dport = usin->sin_port;
174         fl4 = &inet->cork.fl.u.ip4;
175         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
176                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
177                               IPPROTO_TCP,
178                               orig_sport, orig_dport, sk);
179         if (IS_ERR(rt)) {
180                 err = PTR_ERR(rt);
181                 if (err == -ENETUNREACH)
182                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
183                 return err;
184         }
185
186         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
187                 ip_rt_put(rt);
188                 return -ENETUNREACH;
189         }
190
191         if (!inet_opt || !inet_opt->opt.srr)
192                 daddr = fl4->daddr;
193
194         if (!inet->inet_saddr)
195                 inet->inet_saddr = fl4->saddr;
196         sk_rcv_saddr_set(sk, inet->inet_saddr);
197
198         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
199                 /* Reset inherited state */
200                 tp->rx_opt.ts_recent       = 0;
201                 tp->rx_opt.ts_recent_stamp = 0;
202                 if (likely(!tp->repair))
203                         tp->write_seq      = 0;
204         }
205
206         inet->inet_dport = usin->sin_port;
207         sk_daddr_set(sk, daddr);
208
209         inet_csk(sk)->icsk_ext_hdr_len = 0;
210         if (inet_opt)
211                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212
213         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214
215         /* Socket identity is still unknown (sport may be zero).
216          * However we set state to SYN-SENT and, without releasing the socket
217          * lock, select a source port, enter ourselves into the hash tables
218          * and complete initialization after this.
219          */
220         tcp_set_state(sk, TCP_SYN_SENT);
221         err = inet_hash_connect(tcp_death_row, sk);
222         if (err)
223                 goto failure;
224
225         sk_set_txhash(sk);
226
227         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228                                inet->inet_sport, inet->inet_dport, sk);
229         if (IS_ERR(rt)) {
230                 err = PTR_ERR(rt);
231                 rt = NULL;
232                 goto failure;
233         }
234         /* OK, now commit destination to socket.  */
235         sk->sk_gso_type = SKB_GSO_TCPV4;
236         sk_setup_caps(sk, &rt->dst);
237         rt = NULL;
238
239         if (likely(!tp->repair)) {
240                 if (!tp->write_seq)
241                         tp->write_seq = secure_tcp_seq(inet->inet_saddr,
242                                                        inet->inet_daddr,
243                                                        inet->inet_sport,
244                                                        usin->sin_port);
245                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
246                                                  inet->inet_saddr,
247                                                  inet->inet_daddr);
248         }
249
250         inet->inet_id = tp->write_seq ^ jiffies;
251
252         if (tcp_fastopen_defer_connect(sk, &err))
253                 return err;
254         if (err)
255                 goto failure;
256
257         err = tcp_connect(sk);
258
259         if (err)
260                 goto failure;
261
262         return 0;
263
264 failure:
265         /*
266          * This unhashes the socket and releases the local port,
267          * if necessary.
268          */
269         tcp_set_state(sk, TCP_CLOSE);
270         ip_rt_put(rt);
271         sk->sk_route_caps = 0;
272         inet->inet_dport = 0;
273         return err;
274 }
275 EXPORT_SYMBOL(tcp_v4_connect);
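
/*
 * The path above services an ordinary connect(2) on an AF_INET stream
 * socket. A minimal user-space sketch that exercises it; the address and
 * port are arbitrary examples (192.0.2.1 is a documentation prefix).
 */
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int connect_sketch(void)
{
        struct sockaddr_in dst;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;

        memset(&dst, 0, sizeof(dst));
        dst.sin_family = AF_INET;       /* anything else yields -EAFNOSUPPORT */
        dst.sin_port = htons(80);
        inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

        /* Enters tcp_v4_connect(): route lookup, source port selection,
         * ISN generation, then tcp_connect() sends the SYN. */
        if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
                perror("connect");

        close(fd);
        return 0;
}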
276
277 /*
278  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
279  * It can be called through tcp_release_cb() if the socket was owned by the
280  * user at the time tcp_v4_err() was called to handle the ICMP message.
281  */
282 void tcp_v4_mtu_reduced(struct sock *sk)
283 {
284         struct inet_sock *inet = inet_sk(sk);
285         struct dst_entry *dst;
286         u32 mtu;
287
288         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
289                 return;
290         mtu = tcp_sk(sk)->mtu_info;
291         dst = inet_csk_update_pmtu(sk, mtu);
292         if (!dst)
293                 return;
294
295         /* Something is about to go wrong... Remember the soft error
296          * in case this connection is not able to recover.
297          */
298         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
299                 sk->sk_err_soft = EMSGSIZE;
300
301         mtu = dst_mtu(dst);
302
303         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
304             ip_sk_accept_pmtu(sk) &&
305             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
306                 tcp_sync_mss(sk, mtu);
307
308                 /* Resend the TCP packet because it's
309                  * clear that the old packet has been
310                  * dropped. This is the new "fast" path mtu
311                  * discovery.
312                  */
313                 tcp_simple_retransmit(sk);
314         } /* else let the usual retransmit timer handle it */
315 }
316 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
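
/*
 * A small user-space sketch for observing the result of PMTU reduction:
 * on a connected socket, the IP_MTU option reports the path MTU cached on
 * the route that tcp_v4_mtu_reduced() updated. A sketch only; it does not
 * trigger the kernel path itself.
 */
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

static void print_path_mtu(int connected_fd)
{
        int mtu = 0;
        socklen_t len = sizeof(mtu);

        if (getsockopt(connected_fd, IPPROTO_IP, IP_MTU, &mtu, &len) == 0)
                printf("current path MTU: %d\n", mtu);
}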
317
318 static void do_redirect(struct sk_buff *skb, struct sock *sk)
319 {
320         struct dst_entry *dst = __sk_dst_check(sk, 0);
321
322         if (dst)
323                 dst->ops->redirect(dst, sk, skb);
324 }
325
326
327 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
328 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
329 {
330         struct request_sock *req = inet_reqsk(sk);
331         struct net *net = sock_net(sk);
332
333         /* ICMPs are not backlogged, hence we cannot get
334          * an established socket here.
335          */
336         if (seq != tcp_rsk(req)->snt_isn) {
337                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
338         } else if (abort) {
339                 /*
340                  * Still in SYN_RECV, just remove it silently.
341                  * There is no good way to pass the error to the newly
342                  * created socket, and POSIX does not want network
343                  * errors returned from accept().
344                  */
345                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
346                 tcp_listendrop(req->rsk_listener);
347         }
348         reqsk_put(req);
349 }
350 EXPORT_SYMBOL(tcp_req_err);
351
352 /*
353  * This routine is called by the ICMP module when it gets some
354  * sort of error condition.  If err < 0 then the socket should
355  * be closed and the error returned to the user.  If err > 0
356  * it's just the icmp type << 8 | icmp code.  After adjustment
357  * header points to the first 8 bytes of the tcp header.  We need
358  * to find the appropriate port.
359  *
360  * The locking strategy used here is very "optimistic". When
361  * someone else accesses the socket the ICMP is just dropped
362  * and for some paths there is no check at all.
363  * A more general error queue to queue errors for later handling
364  * is probably better.
365  *
366  */
367
368 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
369 {
370         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
371         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
372         struct inet_connection_sock *icsk;
373         struct tcp_sock *tp;
374         struct inet_sock *inet;
375         const int type = icmp_hdr(icmp_skb)->type;
376         const int code = icmp_hdr(icmp_skb)->code;
377         struct sock *sk;
378         struct sk_buff *skb;
379         struct request_sock *fastopen;
380         u32 seq, snd_una;
381         s32 remaining;
382         u32 delta_us;
383         int err;
384         struct net *net = dev_net(icmp_skb->dev);
385
386         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
387                                        th->dest, iph->saddr, ntohs(th->source),
388                                        inet_iif(icmp_skb), 0);
389         if (!sk) {
390                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
391                 return;
392         }
393         if (sk->sk_state == TCP_TIME_WAIT) {
394                 inet_twsk_put(inet_twsk(sk));
395                 return;
396         }
397         seq = ntohl(th->seq);
398         if (sk->sk_state == TCP_NEW_SYN_RECV)
399                 return tcp_req_err(sk, seq,
400                                   type == ICMP_PARAMETERPROB ||
401                                   type == ICMP_TIME_EXCEEDED ||
402                                   (type == ICMP_DEST_UNREACH &&
403                                    (code == ICMP_NET_UNREACH ||
404                                     code == ICMP_HOST_UNREACH)));
405
406         bh_lock_sock(sk);
407         /* If too many ICMPs get dropped on busy
408          * servers this needs to be solved differently.
409          * We do take care of the PMTU discovery (RFC1191) special case:
410          * we can receive locally generated ICMP messages while socket is held.
411          */
412         if (sock_owned_by_user(sk)) {
413                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
414                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
415         }
416         if (sk->sk_state == TCP_CLOSE)
417                 goto out;
418
419         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
420                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
421                 goto out;
422         }
423
424         icsk = inet_csk(sk);
425         tp = tcp_sk(sk);
426         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
427         fastopen = tp->fastopen_rsk;
428         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
429         if (sk->sk_state != TCP_LISTEN &&
430             !between(seq, snd_una, tp->snd_nxt)) {
431                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
432                 goto out;
433         }
434
435         switch (type) {
436         case ICMP_REDIRECT:
437                 if (!sock_owned_by_user(sk))
438                         do_redirect(icmp_skb, sk);
439                 goto out;
440         case ICMP_SOURCE_QUENCH:
441                 /* Just silently ignore these. */
442                 goto out;
443         case ICMP_PARAMETERPROB:
444                 err = EPROTO;
445                 break;
446         case ICMP_DEST_UNREACH:
447                 if (code > NR_ICMP_UNREACH)
448                         goto out;
449
450                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
451                         /* We are not interested in TCP_LISTEN and open_requests
452                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
453                          * they should go through unfragmented).
454                          */
455                         if (sk->sk_state == TCP_LISTEN)
456                                 goto out;
457
458                         tp->mtu_info = info;
459                         if (!sock_owned_by_user(sk)) {
460                                 tcp_v4_mtu_reduced(sk);
461                         } else {
462                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
463                                         sock_hold(sk);
464                         }
465                         goto out;
466                 }
467
468                 err = icmp_err_convert[code].errno;
469                 /* check if icmp_skb allows revert of backoff
470                  * (see draft-zimmermann-tcp-lcd) */
471                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
472                         break;
473                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
474                     !icsk->icsk_backoff || fastopen)
475                         break;
476
477                 if (sock_owned_by_user(sk))
478                         break;
479
480                 icsk->icsk_backoff--;
481                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
482                                                TCP_TIMEOUT_INIT;
483                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
484
485                 skb = tcp_rtx_queue_head(sk);
486                 BUG_ON(!skb);
487
488                 tcp_mstamp_refresh(tp);
489                 delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
490                 remaining = icsk->icsk_rto -
491                             usecs_to_jiffies(delta_us);
492
493                 if (remaining > 0) {
494                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
495                                                   remaining, TCP_RTO_MAX);
496                 } else {
497                         /* RTO revert clocked out the retransmission.
498                          * Will retransmit now. */
499                         tcp_retransmit_timer(sk);
500                 }
501
502                 break;
503         case ICMP_TIME_EXCEEDED:
504                 err = EHOSTUNREACH;
505                 break;
506         default:
507                 goto out;
508         }
509
510         switch (sk->sk_state) {
511         case TCP_SYN_SENT:
512         case TCP_SYN_RECV:
513                 /* Only in fast or simultaneous open. If a fast open socket
514                  * is already accepted it is treated as a connected one below.
515                  */
516                 if (fastopen && !fastopen->sk)
517                         break;
518
519                 if (!sock_owned_by_user(sk)) {
520                         sk->sk_err = err;
521
522                         sk->sk_error_report(sk);
523
524                         tcp_done(sk);
525                 } else {
526                         sk->sk_err_soft = err;
527                 }
528                 goto out;
529         }
530
531         /* If we've already connected we will keep trying
532          * until we time out, or the user gives up.
533          *
534          * rfc1122 4.2.3.9 allows us to consider as hard errors
535          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
536          * but it is obsoleted by pmtu discovery).
537          *
538          * Note that in the modern internet, where routing is unreliable
539          * and broken firewalls sit in each dark corner sending random
540          * errors ordered by their masters, even these two messages finally
541          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
542          *
543          * Now we are in compliance with RFCs.
544          *                                                      --ANK (980905)
545          */
546
547         inet = inet_sk(sk);
548         if (!sock_owned_by_user(sk) && inet->recverr) {
549                 sk->sk_err = err;
550                 sk->sk_error_report(sk);
551         } else  { /* Only an error on timeout */
552                 sk->sk_err_soft = err;
553         }
554
555 out:
556         bh_unlock_sock(sk);
557         sock_put(sk);
558 }
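
/*
 * With inet->recverr set (the IP_RECVERR socket option), the errors queued
 * by tcp_v4_err() above can be read from the socket's error queue. A
 * minimal sketch of draining one such report; error handling is elided.
 */
#include <linux/errqueue.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

static void drain_one_icmp_error(int fd)
{
        struct msghdr msg;
        struct cmsghdr *cm;
        char cbuf[512];
        int on = 1;

        setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));

        memset(&msg, 0, sizeof(msg));
        msg.msg_control = cbuf;
        msg.msg_controllen = sizeof(cbuf);

        if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
                return;

        for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
                if (cm->cmsg_level == IPPROTO_IP && cm->cmsg_type == IP_RECVERR) {
                        struct sock_extended_err *ee;

                        ee = (struct sock_extended_err *)CMSG_DATA(cm);
                        printf("icmp error: errno=%u type=%u code=%u\n",
                               ee->ee_errno, ee->ee_type, ee->ee_code);
                }
        }
}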
559
560 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
561 {
562         struct tcphdr *th = tcp_hdr(skb);
563
564         if (skb->ip_summed == CHECKSUM_PARTIAL) {
565                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
566                 skb->csum_start = skb_transport_header(skb) - skb->head;
567                 skb->csum_offset = offsetof(struct tcphdr, check);
568         } else {
569                 th->check = tcp_v4_check(skb->len, saddr, daddr,
570                                          csum_partial(th,
571                                                       th->doff << 2,
572                                                       skb->csum));
573         }
574 }
575
576 /* This routine computes an IPv4 TCP checksum. */
577 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
578 {
579         const struct inet_sock *inet = inet_sk(sk);
580
581         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
582 }
583 EXPORT_SYMBOL(tcp_v4_send_check);
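
/*
 * A user-space sketch of the software path above: the TCP checksum is the
 * RFC 1071 one's-complement sum over a pseudo-header and the segment. The
 * segment must have its check field zeroed first; the result here is in
 * host order and is stored big-endian in the header. Assumes len < 64 KiB.
 */
#include <stddef.h>
#include <stdint.h>

static uint32_t sum_be16(const uint8_t *p, size_t len, uint32_t sum)
{
        while (len > 1) {
                sum += (uint32_t)p[0] << 8 | p[1];
                p += 2;
                len -= 2;
        }
        if (len)        /* odd trailing byte is padded with zero */
                sum += (uint32_t)p[0] << 8;
        return sum;
}

static uint16_t tcp4_checksum(const uint8_t saddr[4], const uint8_t daddr[4],
                              const uint8_t *segment, size_t len)
{
        uint32_t sum = 0;

        /* Pseudo-header: saddr, daddr, zero pad, protocol 6, TCP length. */
        sum = sum_be16(saddr, 4, sum);
        sum = sum_be16(daddr, 4, sum);
        sum += 6;
        sum += (uint32_t)len;

        sum = sum_be16(segment, len, sum);

        while (sum >> 16)               /* fold the carries back in */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}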
584
585 /*
586  *      This routine will send an RST to the other tcp.
587  *
588  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
589  *                    for reset.
590  *      Answer: if a packet caused an RST, it is not for a socket
591  *              existing in our system; if it is matched to a socket,
592  *              it is just a duplicate segment or a bug in the other side's TCP.
593  *              So we build the reply based only on parameters that
594  *              arrived with the segment.
595  *      Exception: precedence violation. We do not implement it in any case.
596  */
597
598 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
599 {
600         const struct tcphdr *th = tcp_hdr(skb);
601         struct {
602                 struct tcphdr th;
603 #ifdef CONFIG_TCP_MD5SIG
604                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
605 #endif
606         } rep;
607         struct ip_reply_arg arg;
608 #ifdef CONFIG_TCP_MD5SIG
609         struct tcp_md5sig_key *key = NULL;
610         const __u8 *hash_location = NULL;
611         unsigned char newhash[16];
612         int genhash;
613         struct sock *sk1 = NULL;
614 #endif
615         struct net *net;
616
617         /* Never send a reset in response to a reset. */
618         if (th->rst)
619                 return;
620
621         /* If sk is not NULL, it means we did a successful lookup and the incoming
622          * route had to be correct. prequeue might have dropped our dst.
623          */
624         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
625                 return;
626
627         /* Swap the send and the receive. */
628         memset(&rep, 0, sizeof(rep));
629         rep.th.dest   = th->source;
630         rep.th.source = th->dest;
631         rep.th.doff   = sizeof(struct tcphdr) / 4;
632         rep.th.rst    = 1;
633
634         if (th->ack) {
635                 rep.th.seq = th->ack_seq;
636         } else {
637                 rep.th.ack = 1;
638                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
639                                        skb->len - (th->doff << 2));
640         }
641
642         memset(&arg, 0, sizeof(arg));
643         arg.iov[0].iov_base = (unsigned char *)&rep;
644         arg.iov[0].iov_len  = sizeof(rep.th);
645
646         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
647 #ifdef CONFIG_TCP_MD5SIG
648         rcu_read_lock();
649         hash_location = tcp_parse_md5sig_option(th);
650         if (sk && sk_fullsock(sk)) {
651                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
652                                         &ip_hdr(skb)->saddr, AF_INET);
653         } else if (hash_location) {
654                 /*
655                  * The active side is lost. Try to find the listening socket
656                  * through the source port, and then find the md5 key through
657                  * that listening socket. We do not loosen security here:
658                  * the incoming packet is checked with the md5 hash of the
659                  * key we find; no RST is generated if the hash doesn't match.
660                  */
661                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
662                                              ip_hdr(skb)->saddr,
663                                              th->source, ip_hdr(skb)->daddr,
664                                              ntohs(th->source), inet_iif(skb),
665                                              tcp_v4_sdif(skb));
666                 /* don't send an rst if we can't find the key */
667                 if (!sk1)
668                         goto out;
669
670                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
671                                         &ip_hdr(skb)->saddr, AF_INET);
672                 if (!key)
673                         goto out;
674
675
676                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
677                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
678                         goto out;
679
680         }
681
682         if (key) {
683                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
684                                    (TCPOPT_NOP << 16) |
685                                    (TCPOPT_MD5SIG << 8) |
686                                    TCPOLEN_MD5SIG);
687                 /* Update length and the length the header thinks exists */
688                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
689                 rep.th.doff = arg.iov[0].iov_len / 4;
690
691                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
692                                      key, ip_hdr(skb)->saddr,
693                                      ip_hdr(skb)->daddr, &rep.th);
694         }
695 #endif
696         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
697                                       ip_hdr(skb)->saddr, /* XXX */
698                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
699         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
700         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
701
702         /* When the socket is gone, all binding information is lost.
703          * Routing might fail in this case. No choice here: if we choose to force
704          * the input interface, we will misroute in case of an asymmetric route.
705          */
706         if (sk) {
707                 arg.bound_dev_if = sk->sk_bound_dev_if;
708                 if (sk_fullsock(sk))
709                         trace_tcp_send_reset(sk, skb);
710         }
711
712         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
713                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
714
715         arg.tos = ip_hdr(skb)->tos;
716         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
717         local_bh_disable();
718         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
719                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
720                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
721                               &arg, arg.iov[0].iov_len);
722
723         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
724         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
725         local_bh_enable();
726
727 #ifdef CONFIG_TCP_MD5SIG
728 out:
729         rcu_read_unlock();
730 #endif
731 }
732
733 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
734    outside socket context, is certainly ugly. What can I do?
735  */
736
737 static void tcp_v4_send_ack(const struct sock *sk,
738                             struct sk_buff *skb, u32 seq, u32 ack,
739                             u32 win, u32 tsval, u32 tsecr, int oif,
740                             struct tcp_md5sig_key *key,
741                             int reply_flags, u8 tos)
742 {
743         const struct tcphdr *th = tcp_hdr(skb);
744         struct {
745                 struct tcphdr th;
746                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
747 #ifdef CONFIG_TCP_MD5SIG
748                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
749 #endif
750                         ];
751         } rep;
752         struct net *net = sock_net(sk);
753         struct ip_reply_arg arg;
754
755         memset(&rep.th, 0, sizeof(struct tcphdr));
756         memset(&arg, 0, sizeof(arg));
757
758         arg.iov[0].iov_base = (unsigned char *)&rep;
759         arg.iov[0].iov_len  = sizeof(rep.th);
760         if (tsecr) {
761                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
762                                    (TCPOPT_TIMESTAMP << 8) |
763                                    TCPOLEN_TIMESTAMP);
764                 rep.opt[1] = htonl(tsval);
765                 rep.opt[2] = htonl(tsecr);
766                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
767         }
768
769         /* Swap the send and the receive. */
770         rep.th.dest    = th->source;
771         rep.th.source  = th->dest;
772         rep.th.doff    = arg.iov[0].iov_len / 4;
773         rep.th.seq     = htonl(seq);
774         rep.th.ack_seq = htonl(ack);
775         rep.th.ack     = 1;
776         rep.th.window  = htons(win);
777
778 #ifdef CONFIG_TCP_MD5SIG
779         if (key) {
780                 int offset = (tsecr) ? 3 : 0;
781
782                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
783                                           (TCPOPT_NOP << 16) |
784                                           (TCPOPT_MD5SIG << 8) |
785                                           TCPOLEN_MD5SIG);
786                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
787                 rep.th.doff = arg.iov[0].iov_len/4;
788
789                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
790                                     key, ip_hdr(skb)->saddr,
791                                     ip_hdr(skb)->daddr, &rep.th);
792         }
793 #endif
794         arg.flags = reply_flags;
795         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
796                                       ip_hdr(skb)->saddr, /* XXX */
797                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
798         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
799         if (oif)
800                 arg.bound_dev_if = oif;
801         arg.tos = tos;
802         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
803         local_bh_disable();
804         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
805                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
806                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
807                               &arg, arg.iov[0].iov_len);
808
809         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
810         local_bh_enable();
811 }
812
813 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
814 {
815         struct inet_timewait_sock *tw = inet_twsk(sk);
816         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
817
818         tcp_v4_send_ack(sk, skb,
819                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
820                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
821                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
822                         tcptw->tw_ts_recent,
823                         tw->tw_bound_dev_if,
824                         tcp_twsk_md5_key(tcptw),
825                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
826                         tw->tw_tos
827                         );
828
829         inet_twsk_put(tw);
830 }
831
832 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
833                                   struct request_sock *req)
834 {
835         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
836          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
837          */
838         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
839                                              tcp_sk(sk)->snd_nxt;
840
841         /* RFC 7323 2.3
842          * The window field (SEG.WND) of every outgoing segment, with the
843          * exception of <SYN> segments, MUST be right-shifted by
844          * Rcv.Wind.Shift bits:
845          */
846         tcp_v4_send_ack(sk, skb, seq,
847                         tcp_rsk(req)->rcv_nxt,
848                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
849                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
850                         req->ts_recent,
851                         0,
852                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
853                                           AF_INET),
854                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
855                         ip_hdr(skb)->tos);
856 }
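
/*
 * The RFC 7323 shift applied above, in isolation: a sketch of what goes
 * into the 16-bit window field and how the peer undoes it.
 */
#include <stdint.h>

static uint16_t advertised_window(uint32_t rcv_wnd, uint8_t rcv_wscale)
{
        /* Every non-SYN segment carries rcv_wnd >> rcv_wscale. */
        return (uint16_t)(rcv_wnd >> rcv_wscale);
}

static uint32_t effective_window(uint16_t seg_wnd, uint8_t snd_wscale)
{
        /* The receiver reconstructs the usable window. */
        return (uint32_t)seg_wnd << snd_wscale;
}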
857
858 /*
859  *      Send a SYN-ACK after having received a SYN.
860  *      This still operates on a request_sock only, not on a big
861  *      socket.
862  */
863 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
864                               struct flowi *fl,
865                               struct request_sock *req,
866                               struct tcp_fastopen_cookie *foc,
867                               enum tcp_synack_type synack_type)
868 {
869         const struct inet_request_sock *ireq = inet_rsk(req);
870         struct flowi4 fl4;
871         int err = -1;
872         struct sk_buff *skb;
873
874         /* First, grab a route. */
875         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
876                 return -1;
877
878         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
879
880         if (skb) {
881                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
882
883                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
884                                             ireq->ir_rmt_addr,
885                                             ireq_opt_deref(ireq));
886                 err = net_xmit_eval(err);
887         }
888
889         return err;
890 }
891
892 /*
893  *      IPv4 request_sock destructor.
894  */
895 static void tcp_v4_reqsk_destructor(struct request_sock *req)
896 {
897         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
898 }
899
900 #ifdef CONFIG_TCP_MD5SIG
901 /*
902  * RFC2385 MD5 checksumming requires a mapping of
903  * IP address->MD5 Key.
904  * We need to maintain these in the sk structure.
905  */
906
907 /* Find the Key structure for an address.  */
908 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
909                                          const union tcp_md5_addr *addr,
910                                          int family)
911 {
912         const struct tcp_sock *tp = tcp_sk(sk);
913         struct tcp_md5sig_key *key;
914         const struct tcp_md5sig_info *md5sig;
915         __be32 mask;
916         struct tcp_md5sig_key *best_match = NULL;
917         bool match;
918
919         /* caller either holds rcu_read_lock() or socket lock */
920         md5sig = rcu_dereference_check(tp->md5sig_info,
921                                        lockdep_sock_is_held(sk));
922         if (!md5sig)
923                 return NULL;
924
925         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
926                 if (key->family != family)
927                         continue;
928
929                 if (family == AF_INET) {
930                         mask = inet_make_mask(key->prefixlen);
931                         match = (key->addr.a4.s_addr & mask) ==
932                                 (addr->a4.s_addr & mask);
933 #if IS_ENABLED(CONFIG_IPV6)
934                 } else if (family == AF_INET6) {
935                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
936                                                   key->prefixlen);
937 #endif
938                 } else {
939                         match = false;
940                 }
941
942                 if (match && (!best_match ||
943                               key->prefixlen > best_match->prefixlen))
944                         best_match = key;
945         }
946         return best_match;
947 }
948 EXPORT_SYMBOL(tcp_md5_do_lookup);
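
/*
 * The AF_INET branch above depends on inet_make_mask(); an equivalent
 * user-space sketch of the prefix comparison, including the prefixlen == 0
 * case where the mask must be all-zero.
 */
#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>

static uint32_t make_mask_be(int prefixlen)
{
        return prefixlen ? htonl(~0u << (32 - prefixlen)) : 0;
}

static bool prefix_match(uint32_t a_be, uint32_t b_be, int prefixlen)
{
        uint32_t mask = make_mask_be(prefixlen);

        return (a_be & mask) == (b_be & mask);
}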
949
950 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
951                                                       const union tcp_md5_addr *addr,
952                                                       int family, u8 prefixlen)
953 {
954         const struct tcp_sock *tp = tcp_sk(sk);
955         struct tcp_md5sig_key *key;
956         unsigned int size = sizeof(struct in_addr);
957         const struct tcp_md5sig_info *md5sig;
958
959         /* caller either holds rcu_read_lock() or socket lock */
960         md5sig = rcu_dereference_check(tp->md5sig_info,
961                                        lockdep_sock_is_held(sk));
962         if (!md5sig)
963                 return NULL;
964 #if IS_ENABLED(CONFIG_IPV6)
965         if (family == AF_INET6)
966                 size = sizeof(struct in6_addr);
967 #endif
968         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
969                 if (key->family != family)
970                         continue;
971                 if (!memcmp(&key->addr, addr, size) &&
972                     key->prefixlen == prefixlen)
973                         return key;
974         }
975         return NULL;
976 }
977
978 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
979                                          const struct sock *addr_sk)
980 {
981         const union tcp_md5_addr *addr;
982
983         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
984         return tcp_md5_do_lookup(sk, addr, AF_INET);
985 }
986 EXPORT_SYMBOL(tcp_v4_md5_lookup);
987
988 /* This can be called on a newly created socket, from other files */
989 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
990                    int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
991                    gfp_t gfp)
992 {
993         /* Add Key to the list */
994         struct tcp_md5sig_key *key;
995         struct tcp_sock *tp = tcp_sk(sk);
996         struct tcp_md5sig_info *md5sig;
997
998         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
999         if (key) {
1000                 /* Pre-existing entry - just update that one. */
1001                 memcpy(key->key, newkey, newkeylen);
1002                 key->keylen = newkeylen;
1003                 return 0;
1004         }
1005
1006         md5sig = rcu_dereference_protected(tp->md5sig_info,
1007                                            lockdep_sock_is_held(sk));
1008         if (!md5sig) {
1009                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1010                 if (!md5sig)
1011                         return -ENOMEM;
1012
1013                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1014                 INIT_HLIST_HEAD(&md5sig->head);
1015                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1016         }
1017
1018         key = sock_kmalloc(sk, sizeof(*key), gfp);
1019         if (!key)
1020                 return -ENOMEM;
1021         if (!tcp_alloc_md5sig_pool()) {
1022                 sock_kfree_s(sk, key, sizeof(*key));
1023                 return -ENOMEM;
1024         }
1025
1026         memcpy(key->key, newkey, newkeylen);
1027         key->keylen = newkeylen;
1028         key->family = family;
1029         key->prefixlen = prefixlen;
1030         memcpy(&key->addr, addr,
1031                (family == AF_INET6) ? sizeof(struct in6_addr) :
1032                                       sizeof(struct in_addr));
1033         hlist_add_head_rcu(&key->node, &md5sig->head);
1034         return 0;
1035 }
1036 EXPORT_SYMBOL(tcp_md5_do_add);
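
/*
 * The usual route into tcp_md5_do_add() is the TCP_MD5SIG socket option.
 * A user-space sketch that installs a key for one IPv4 peer; the helper
 * name and its arguments are examples, not an existing API.
 */
#include <arpa/inet.h>
#include <linux/tcp.h>
#include <string.h>
#include <sys/socket.h>

static int install_md5_key(int fd, const char *peer_ip,
                           const void *key, int keylen)
{
        struct tcp_md5sig md5;
        struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;

        if (keylen > TCP_MD5SIG_MAXKEYLEN)
                return -1;

        memset(&md5, 0, sizeof(md5));
        sin->sin_family = AF_INET;
        inet_pton(AF_INET, peer_ip, &sin->sin_addr);
        md5.tcpm_keylen = keylen;
        memcpy(md5.tcpm_key, key, keylen);

        return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}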
1037
1038 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1039                    u8 prefixlen)
1040 {
1041         struct tcp_md5sig_key *key;
1042
1043         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1044         if (!key)
1045                 return -ENOENT;
1046         hlist_del_rcu(&key->node);
1047         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1048         kfree_rcu(key, rcu);
1049         return 0;
1050 }
1051 EXPORT_SYMBOL(tcp_md5_do_del);
1052
1053 static void tcp_clear_md5_list(struct sock *sk)
1054 {
1055         struct tcp_sock *tp = tcp_sk(sk);
1056         struct tcp_md5sig_key *key;
1057         struct hlist_node *n;
1058         struct tcp_md5sig_info *md5sig;
1059
1060         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1061
1062         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1063                 hlist_del_rcu(&key->node);
1064                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1065                 kfree_rcu(key, rcu);
1066         }
1067 }
1068
1069 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1070                                  char __user *optval, int optlen)
1071 {
1072         struct tcp_md5sig cmd;
1073         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1074         u8 prefixlen = 32;
1075
1076         if (optlen < sizeof(cmd))
1077                 return -EINVAL;
1078
1079         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1080                 return -EFAULT;
1081
1082         if (sin->sin_family != AF_INET)
1083                 return -EINVAL;
1084
1085         if (optname == TCP_MD5SIG_EXT &&
1086             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1087                 prefixlen = cmd.tcpm_prefixlen;
1088                 if (prefixlen > 32)
1089                         return -EINVAL;
1090         }
1091
1092         if (!cmd.tcpm_keylen)
1093                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1094                                       AF_INET, prefixlen);
1095
1096         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1097                 return -EINVAL;
1098
1099         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1100                               AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1101                               GFP_KERNEL);
1102 }
1103
1104 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1105                                    __be32 daddr, __be32 saddr,
1106                                    const struct tcphdr *th, int nbytes)
1107 {
1108         struct tcp4_pseudohdr *bp;
1109         struct scatterlist sg;
1110         struct tcphdr *_th;
1111
1112         bp = hp->scratch;
1113         bp->saddr = saddr;
1114         bp->daddr = daddr;
1115         bp->pad = 0;
1116         bp->protocol = IPPROTO_TCP;
1117         bp->len = cpu_to_be16(nbytes);
1118
1119         _th = (struct tcphdr *)(bp + 1);
1120         memcpy(_th, th, sizeof(*th));
1121         _th->check = 0;
1122
1123         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1124         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1125                                 sizeof(*bp) + sizeof(*th));
1126         return crypto_ahash_update(hp->md5_req);
1127 }
1128
1129 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1130                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1131 {
1132         struct tcp_md5sig_pool *hp;
1133         struct ahash_request *req;
1134
1135         hp = tcp_get_md5sig_pool();
1136         if (!hp)
1137                 goto clear_hash_noput;
1138         req = hp->md5_req;
1139
1140         if (crypto_ahash_init(req))
1141                 goto clear_hash;
1142         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1143                 goto clear_hash;
1144         if (tcp_md5_hash_key(hp, key))
1145                 goto clear_hash;
1146         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1147         if (crypto_ahash_final(req))
1148                 goto clear_hash;
1149
1150         tcp_put_md5sig_pool();
1151         return 0;
1152
1153 clear_hash:
1154         tcp_put_md5sig_pool();
1155 clear_hash_noput:
1156         memset(md5_hash, 0, 16);
1157         return 1;
1158 }
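
/*
 * A user-space sketch of the digest computed above, using OpenSSL's
 * (deprecated but still available) MD5 interface. Per RFC 2385 the hash
 * covers the pseudo-header, the TCP header with check zeroed and options
 * excluded, the payload, and finally the key.
 */
#include <openssl/md5.h>
#include <stddef.h>
#include <stdint.h>

struct tcp4_phdr {
        uint32_t saddr;         /* network byte order */
        uint32_t daddr;         /* network byte order */
        uint8_t  pad;           /* always 0 */
        uint8_t  protocol;      /* IPPROTO_TCP */
        uint16_t len;           /* TCP length, network byte order */
};

static void tcp_md5_digest(uint8_t out[16], const struct tcp4_phdr *ph,
                           const uint8_t *th20,         /* header, check = 0 */
                           const uint8_t *data, size_t dlen,
                           const uint8_t *key, size_t keylen)
{
        MD5_CTX ctx;

        MD5_Init(&ctx);
        MD5_Update(&ctx, ph, sizeof(*ph));
        MD5_Update(&ctx, th20, 20);
        MD5_Update(&ctx, data, dlen);
        MD5_Update(&ctx, key, keylen);
        MD5_Final(out, &ctx);
}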
1159
1160 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1161                         const struct sock *sk,
1162                         const struct sk_buff *skb)
1163 {
1164         struct tcp_md5sig_pool *hp;
1165         struct ahash_request *req;
1166         const struct tcphdr *th = tcp_hdr(skb);
1167         __be32 saddr, daddr;
1168
1169         if (sk) { /* valid for establish/request sockets */
1170                 saddr = sk->sk_rcv_saddr;
1171                 daddr = sk->sk_daddr;
1172         } else {
1173                 const struct iphdr *iph = ip_hdr(skb);
1174                 saddr = iph->saddr;
1175                 daddr = iph->daddr;
1176         }
1177
1178         hp = tcp_get_md5sig_pool();
1179         if (!hp)
1180                 goto clear_hash_noput;
1181         req = hp->md5_req;
1182
1183         if (crypto_ahash_init(req))
1184                 goto clear_hash;
1185
1186         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1187                 goto clear_hash;
1188         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1189                 goto clear_hash;
1190         if (tcp_md5_hash_key(hp, key))
1191                 goto clear_hash;
1192         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1193         if (crypto_ahash_final(req))
1194                 goto clear_hash;
1195
1196         tcp_put_md5sig_pool();
1197         return 0;
1198
1199 clear_hash:
1200         tcp_put_md5sig_pool();
1201 clear_hash_noput:
1202         memset(md5_hash, 0, 16);
1203         return 1;
1204 }
1205 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1206
1207 #endif
1208
1209 /* Called with rcu_read_lock() */
1210 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1211                                     const struct sk_buff *skb)
1212 {
1213 #ifdef CONFIG_TCP_MD5SIG
1214         /*
1215          * This gets called for each TCP segment that arrives
1216          * so we want to be efficient.
1217          * We have 3 drop cases:
1218          * o No MD5 hash and one expected.
1219          * o MD5 hash and we're not expecting one.
1220          * o MD5 hash and it's wrong.
1221          */
1222         const __u8 *hash_location = NULL;
1223         struct tcp_md5sig_key *hash_expected;
1224         const struct iphdr *iph = ip_hdr(skb);
1225         const struct tcphdr *th = tcp_hdr(skb);
1226         int genhash;
1227         unsigned char newhash[16];
1228
1229         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1230                                           AF_INET);
1231         hash_location = tcp_parse_md5sig_option(th);
1232
1233         /* We've parsed the options - do we have a hash? */
1234         if (!hash_expected && !hash_location)
1235                 return false;
1236
1237         if (hash_expected && !hash_location) {
1238                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1239                 return true;
1240         }
1241
1242         if (!hash_expected && hash_location) {
1243                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1244                 return true;
1245         }
1246
1247         /* Okay, so this is hash_expected and hash_location -
1248          * so we need to calculate the checksum.
1249          */
1250         genhash = tcp_v4_md5_hash_skb(newhash,
1251                                       hash_expected,
1252                                       NULL, skb);
1253
1254         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1255                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1256                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1257                                      &iph->saddr, ntohs(th->source),
1258                                      &iph->daddr, ntohs(th->dest),
1259                                      genhash ? " tcp_v4_calc_md5_hash failed"
1260                                      : "");
1261                 return true;
1262         }
1263         return false;
1264 #endif
1265         return false;
1266 }
1267
1268 static void tcp_v4_init_req(struct request_sock *req,
1269                             const struct sock *sk_listener,
1270                             struct sk_buff *skb)
1271 {
1272         struct inet_request_sock *ireq = inet_rsk(req);
1273         struct net *net = sock_net(sk_listener);
1274
1275         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1276         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1277         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1278 }
1279
1280 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1281                                           struct flowi *fl,
1282                                           const struct request_sock *req)
1283 {
1284         return inet_csk_route_req(sk, &fl->u.ip4, req);
1285 }
1286
1287 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1288         .family         =       PF_INET,
1289         .obj_size       =       sizeof(struct tcp_request_sock),
1290         .rtx_syn_ack    =       tcp_rtx_synack,
1291         .send_ack       =       tcp_v4_reqsk_send_ack,
1292         .destructor     =       tcp_v4_reqsk_destructor,
1293         .send_reset     =       tcp_v4_send_reset,
1294         .syn_ack_timeout =      tcp_syn_ack_timeout,
1295 };
1296
1297 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1298         .mss_clamp      =       TCP_MSS_DEFAULT,
1299 #ifdef CONFIG_TCP_MD5SIG
1300         .req_md5_lookup =       tcp_v4_md5_lookup,
1301         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1302 #endif
1303         .init_req       =       tcp_v4_init_req,
1304 #ifdef CONFIG_SYN_COOKIES
1305         .cookie_init_seq =      cookie_v4_init_sequence,
1306 #endif
1307         .route_req      =       tcp_v4_route_req,
1308         .init_seq       =       tcp_v4_init_seq,
1309         .init_ts_off    =       tcp_v4_init_ts_off,
1310         .send_synack    =       tcp_v4_send_synack,
1311 };
1312
1313 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1314 {
1315         /* Never answer SYNs sent to broadcast or multicast */
1316         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1317                 goto drop;
1318
1319         return tcp_conn_request(&tcp_request_sock_ops,
1320                                 &tcp_request_sock_ipv4_ops, sk, skb);
1321
1322 drop:
1323         tcp_listendrop(sk);
1324         return 0;
1325 }
1326 EXPORT_SYMBOL(tcp_v4_conn_request);
1327
1328
1329 /*
1330  * The three way handshake has completed - we got a valid synack -
1331  * now create the new socket.
1332  */
1333 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1334                                   struct request_sock *req,
1335                                   struct dst_entry *dst,
1336                                   struct request_sock *req_unhash,
1337                                   bool *own_req)
1338 {
1339         struct inet_request_sock *ireq;
1340         struct inet_sock *newinet;
1341         struct tcp_sock *newtp;
1342         struct sock *newsk;
1343 #ifdef CONFIG_TCP_MD5SIG
1344         struct tcp_md5sig_key *key;
1345 #endif
1346         struct ip_options_rcu *inet_opt;
1347
1348         if (sk_acceptq_is_full(sk))
1349                 goto exit_overflow;
1350
1351         newsk = tcp_create_openreq_child(sk, req, skb);
1352         if (!newsk)
1353                 goto exit_nonewsk;
1354
1355         newsk->sk_gso_type = SKB_GSO_TCPV4;
1356         inet_sk_rx_dst_set(newsk, skb);
1357
1358         newtp                 = tcp_sk(newsk);
1359         newinet               = inet_sk(newsk);
1360         ireq                  = inet_rsk(req);
1361         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1362         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1363         newsk->sk_bound_dev_if = ireq->ir_iif;
1364         newinet->inet_saddr   = ireq->ir_loc_addr;
1365         inet_opt              = rcu_dereference(ireq->ireq_opt);
1366         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1367         newinet->mc_index     = inet_iif(skb);
1368         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1369         newinet->rcv_tos      = ip_hdr(skb)->tos;
1370         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1371         if (inet_opt)
1372                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1373         newinet->inet_id = newtp->write_seq ^ jiffies;
1374
1375         if (!dst) {
1376                 dst = inet_csk_route_child_sock(sk, newsk, req);
1377                 if (!dst)
1378                         goto put_and_exit;
1379         } else {
1380                 /* syncookie case : see end of cookie_v4_check() */
1381         }
1382         sk_setup_caps(newsk, dst);
1383
1384         tcp_ca_openreq_child(newsk, dst);
1385
1386         tcp_sync_mss(newsk, dst_mtu(dst));
1387         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1388
1389         tcp_initialize_rcv_mss(newsk);
1390
1391 #ifdef CONFIG_TCP_MD5SIG
1392         /* Copy over the MD5 key from the original socket */
1393         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1394                                 AF_INET);
1395         if (key) {
1396                 /*
1397                  * We're using one, so create a matching key
1398                  * on the newsk structure. If we fail to get
1399                  * memory, then we end up not copying the key
1400                  * across. Shucks.
1401                  */
1402                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1403                                AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1404                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1405         }
1406 #endif
1407
1408         if (__inet_inherit_port(sk, newsk) < 0)
1409                 goto put_and_exit;
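        /* Add the child to the established hash.  *own_req reports whether
         * this invocation won the race to answer the request; the caller
         * handles the case where it did not.
         */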
1410         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1411         if (likely(*own_req)) {
1412                 tcp_move_syn(newtp, req);
1413                 ireq->ireq_opt = NULL;
1414         } else {
1415                 newinet->inet_opt = NULL;
1416         }
1417         return newsk;
1418
1419 exit_overflow:
1420         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1421 exit_nonewsk:
1422         dst_release(dst);
1423 exit:
1424         tcp_listendrop(sk);
1425         return NULL;
1426 put_and_exit:
1427         newinet->inet_opt = NULL;
1428         inet_csk_prepare_forced_close(newsk);
1429         tcp_done(newsk);
1430         goto exit;
1431 }
1432 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1433
1434 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1435 {
1436 #ifdef CONFIG_SYN_COOKIES
1437         const struct tcphdr *th = tcp_hdr(skb);
1438
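        /* Syncookies are validated on the bare ACK that completes the
         * three-way handshake, never on the SYN itself, hence the !th->syn
         * test before calling cookie_v4_check().
         */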
1439         if (!th->syn)
1440                 sk = cookie_v4_check(sk, skb);
1441 #endif
1442         return sk;
1443 }
1444
1445 /* The socket must have its spinlock held when we get
1446  * here, unless it is a TCP_LISTEN socket.
1447  *
1448  * We have a potential double-lock case here, so even when
1449  * doing backlog processing we use the BH locking scheme.
1450  * This is because we cannot sleep with the original spinlock
1451  * held.
1452  */
1453 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1454 {
1455         struct sock *rsk;
1456
1457         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1458                 struct dst_entry *dst = sk->sk_rx_dst;
1459
1460                 sock_rps_save_rxhash(sk, skb);
1461                 sk_mark_napi_id(sk, skb);
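                /* Revalidate the cached input route: discard it if the
                 * segment arrived on a different interface or the dst has
                 * been invalidated since it was cached.
                 */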
1462                 if (dst) {
1463                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1464                             !dst->ops->check(dst, 0)) {
1465                                 dst_release(dst);
1466                                 sk->sk_rx_dst = NULL;
1467                         }
1468                 }
1469                 tcp_rcv_established(sk, skb, tcp_hdr(skb));
1470                 return 0;
1471         }
1472
1473         if (tcp_checksum_complete(skb))
1474                 goto csum_err;
1475
1476         if (sk->sk_state == TCP_LISTEN) {
1477                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1478
1479                 if (!nsk)
1480                         goto discard;
1481                 if (nsk != sk) {
1482                         if (tcp_child_process(sk, nsk, skb)) {
1483                                 rsk = nsk;
1484                                 goto reset;
1485                         }
1486                         return 0;
1487                 }
1488         } else
1489                 sock_rps_save_rxhash(sk, skb);
1490
1491         if (tcp_rcv_state_process(sk, skb)) {
1492                 rsk = sk;
1493                 goto reset;
1494         }
1495         return 0;
1496
1497 reset:
1498         tcp_v4_send_reset(rsk, skb);
1499 discard:
1500         kfree_skb(skb);
1501         /* Be careful here. If this function gets more complicated and
1502          * gcc suffers from register pressure on the x86, sk (in %ebx)
1503          * might be destroyed here. This current version compiles correctly,
1504          * but you have been warned.
1505          */
1506         return 0;
1507
1508 csum_err:
1509         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1510         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1511         goto discard;
1512 }
1513 EXPORT_SYMBOL(tcp_v4_do_rcv);
1514
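/* Early demux: opportunistically match an incoming skb to an established
 * socket while still in the IP receive path, so the socket's cached rx dst
 * can be reused and the regular lookup in tcp_v4_rcv() is avoided.
 */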
1515 int tcp_v4_early_demux(struct sk_buff *skb)
1516 {
1517         const struct iphdr *iph;
1518         const struct tcphdr *th;
1519         struct sock *sk;
1520
1521         if (skb->pkt_type != PACKET_HOST)
1522                 return 0;
1523
1524         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1525                 return 0;
1526
1527         iph = ip_hdr(skb);
1528         th = tcp_hdr(skb);
1529
1530         if (th->doff < sizeof(struct tcphdr) / 4)
1531                 return 0;
1532
1533         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1534                                        iph->saddr, th->source,
1535                                        iph->daddr, ntohs(th->dest),
1536                                        skb->skb_iif, inet_sdif(skb));
1537         if (sk) {
1538                 skb->sk = sk;
1539                 skb->destructor = sock_edemux;
1540                 if (sk_fullsock(sk)) {
1541                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1542
1543                         if (dst)
1544                                 dst = dst_check(dst, 0);
1545                         if (dst &&
1546                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1547                                 skb_dst_set_noref(skb, dst);
1548                 }
1549         }
1550         return 0;
1551 }
1552
1553 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1554 {
1555         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1556
1557         /* Only the socket owner can try to collapse/prune rx queues
1558          * to reduce memory overhead, so add a little headroom here.
1559          * Only a few socket backlogs are likely non-empty concurrently.
1560          */
1561         limit += 64*1024;
1562
1563         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1564          * we can fix skb->truesize to its real value to avoid future drops.
1565          * This is valid because skb is not yet charged to the socket.
1566          * It has been noticed that pure SACK packets were sometimes dropped
1567          * (when cooked by drivers without the copybreak feature).
1568          */
1569         skb_condense(skb);
1570
1571         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1572                 bh_unlock_sock(sk);
1573                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1574                 return true;
1575         }
1576         return false;
1577 }
1578 EXPORT_SYMBOL(tcp_add_backlog);
1579
1580 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1581 {
1582         struct tcphdr *th = (struct tcphdr *)skb->data;
1583         unsigned int eaten = skb->len;
1584         int err;
1585
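        /* A socket filter may trim payload, but never below the TCP header
         * (the th->doff * 4 cap).  Any bytes it removes must also be taken
         * off end_seq so sequence accounting stays consistent.
         */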
1586         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1587         if (!err) {
1588                 eaten -= skb->len;
1589                 TCP_SKB_CB(skb)->end_seq -= eaten;
1590         }
1591         return err;
1592 }
1593 EXPORT_SYMBOL(tcp_filter);
1594
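/* tcp_v4_fill_cb() below clobbers IPCB() with TCP's private control block;
 * tcp_v4_restore_cb() undoes that for paths which hand the skb back to code
 * expecting a valid IPCB.
 */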
1595 static void tcp_v4_restore_cb(struct sk_buff *skb)
1596 {
1597         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1598                 sizeof(struct inet_skb_parm));
1599 }
1600
1601 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1602                            const struct tcphdr *th)
1603 {
1604         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1605          * barrier() makes sure the compiler won't play fool^Waliasing games.
1606          */
1607         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1608                 sizeof(struct inet_skb_parm));
1609         barrier();
1610
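        /* SYN and FIN each consume one unit of sequence space, hence the
         * th->syn + th->fin terms in the end_seq computation below.
         */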
1611         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1612         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1613                                     skb->len - th->doff * 4);
1614         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1615         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1616         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1617         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1618         TCP_SKB_CB(skb)->sacked  = 0;
1619         TCP_SKB_CB(skb)->has_rxtstamp =
1620                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1621 }
1622
1623 /*
1624  *      From tcp_input.c
1625  */
1626
1627 int tcp_v4_rcv(struct sk_buff *skb)
1628 {
1629         struct net *net = dev_net(skb->dev);
1630         int sdif = inet_sdif(skb);
1631         const struct iphdr *iph;
1632         const struct tcphdr *th;
1633         bool refcounted;
1634         struct sock *sk;
1635         int ret;
1636
1637         if (skb->pkt_type != PACKET_HOST)
1638                 goto discard_it;
1639
1640         /* Count it even if it's bad */
1641         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1642
1643         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1644                 goto discard_it;
1645
1646         th = (const struct tcphdr *)skb->data;
1647
1648         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1649                 goto bad_packet;
1650         if (!pskb_may_pull(skb, th->doff * 4))
1651                 goto discard_it;
1652
1653         /* An explanation is required here, I think.
1654          * Packet length and doff are validated by header prediction,
1655          * provided the case of th->doff == 0 is eliminated.
1656          * So, we defer the checks. */
1657
1658         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1659                 goto csum_error;
1660
1661         th = (const struct tcphdr *)skb->data;
1662         iph = ip_hdr(skb);
1663 lookup:
1664         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1665                                th->dest, sdif, &refcounted);
1666         if (!sk)
1667                 goto no_tcp_socket;
1668
1669 process:
1670         if (sk->sk_state == TCP_TIME_WAIT)
1671                 goto do_time_wait;
1672
1673         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1674                 struct request_sock *req = inet_reqsk(sk);
1675                 struct sock *nsk;
1676
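                /* sk is a request_sock here: this segment is normally the
                 * ACK that completes the three-way handshake, and it is
                 * processed in the context of the parent listener.
                 */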
1677                 sk = req->rsk_listener;
1678                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1679                         sk_drops_add(sk, skb);
1680                         reqsk_put(req);
1681                         goto discard_it;
1682                 }
1683                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1684                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1685                         goto lookup;
1686                 }
1687                 /* We own a reference on the listener; increase it again
1688                  * as we might lose it too soon.
1689                  */
1690                 sock_hold(sk);
1691                 refcounted = true;
1692                 nsk = NULL;
1693                 if (!tcp_filter(sk, skb)) {
1694                         th = (const struct tcphdr *)skb->data;
1695                         iph = ip_hdr(skb);
1696                         tcp_v4_fill_cb(skb, iph, th);
1697                         nsk = tcp_check_req(sk, skb, req, false);
1698                 }
1699                 if (!nsk) {
1700                         reqsk_put(req);
1701                         goto discard_and_relse;
1702                 }
1703                 if (nsk == sk) {
1704                         reqsk_put(req);
1705                         tcp_v4_restore_cb(skb);
1706                 } else if (tcp_child_process(sk, nsk, skb)) {
1707                         tcp_v4_send_reset(nsk, skb);
1708                         goto discard_and_relse;
1709                 } else {
1710                         sock_put(sk);
1711                         return 0;
1712                 }
1713         }
1714         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1715                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1716                 goto discard_and_relse;
1717         }
1718
1719         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1720                 goto discard_and_relse;
1721
1722         if (tcp_v4_inbound_md5_hash(sk, skb))
1723                 goto discard_and_relse;
1724
1725         nf_reset(skb);
1726
1727         if (tcp_filter(sk, skb))
1728                 goto discard_and_relse;
1729         th = (const struct tcphdr *)skb->data;
1730         iph = ip_hdr(skb);
1731         tcp_v4_fill_cb(skb, iph, th);
1732
1733         skb->dev = NULL;
1734
1735         if (sk->sk_state == TCP_LISTEN) {
1736                 ret = tcp_v4_do_rcv(sk, skb);
1737                 goto put_and_return;
1738         }
1739
1740         sk_incoming_cpu_update(sk);
1741
1742         bh_lock_sock_nested(sk);
1743         tcp_segs_in(tcp_sk(sk), skb);
1744         ret = 0;
1745         if (!sock_owned_by_user(sk)) {
1746                 ret = tcp_v4_do_rcv(sk, skb);
1747         } else if (tcp_add_backlog(sk, skb)) {
1748                 goto discard_and_relse;
1749         }
1750         bh_unlock_sock(sk);
1751
1752 put_and_return:
1753         if (refcounted)
1754                 sock_put(sk);
1755
1756         return ret;
1757
1758 no_tcp_socket:
1759         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1760                 goto discard_it;
1761
1762         tcp_v4_fill_cb(skb, iph, th);
1763
1764         if (tcp_checksum_complete(skb)) {
1765 csum_error:
1766                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1767 bad_packet:
1768                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1769         } else {
1770                 tcp_v4_send_reset(NULL, skb);
1771         }
1772
1773 discard_it:
1774         /* Discard frame. */
1775         kfree_skb(skb);
1776         return 0;
1777
1778 discard_and_relse:
1779         sk_drops_add(sk, skb);
1780         if (refcounted)
1781                 sock_put(sk);
1782         goto discard_it;
1783
1784 do_time_wait:
1785         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1786                 inet_twsk_put(inet_twsk(sk));
1787                 goto discard_it;
1788         }
1789
1790         tcp_v4_fill_cb(skb, iph, th);
1791
1792         if (tcp_checksum_complete(skb)) {
1793                 inet_twsk_put(inet_twsk(sk));
1794                 goto csum_error;
1795         }
1796         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1797         case TCP_TW_SYN: {
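                /* A new SYN directed at a time-wait socket: if a matching
                 * listener exists, retire the time-wait entry and let the
                 * listener handle the SYN as a fresh connection attempt.
                 */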
1798                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1799                                                         &tcp_hashinfo, skb,
1800                                                         __tcp_hdrlen(th),
1801                                                         iph->saddr, th->source,
1802                                                         iph->daddr, th->dest,
1803                                                         inet_iif(skb),
1804                                                         sdif);
1805                 if (sk2) {
1806                         inet_twsk_deschedule_put(inet_twsk(sk));
1807                         sk = sk2;
1808                         tcp_v4_restore_cb(skb);
1809                         refcounted = false;
1810                         goto process;
1811                 }
1812         }
1813                 /* to ACK */
1814                 /* fall through */
1815         case TCP_TW_ACK:
1816                 tcp_v4_timewait_ack(sk, skb);
1817                 break;
1818         case TCP_TW_RST:
1819                 tcp_v4_send_reset(sk, skb);
1820                 inet_twsk_deschedule_put(inet_twsk(sk));
1821                 goto discard_it;
1822         case TCP_TW_SUCCESS:;
1823         }
1824         goto discard_it;
1825 }
1826
1827 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1828         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1829         .twsk_unique    = tcp_twsk_unique,
1830         .twsk_destructor = tcp_twsk_destructor,
1831 };
1832
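/* Cache the input route on the socket so the established fast path in
 * tcp_v4_do_rcv() can skip a route lookup; dst_hold_safe() protects against
 * a dst that is concurrently being released.
 */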
1833 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1834 {
1835         struct dst_entry *dst = skb_dst(skb);
1836
1837         if (dst && dst_hold_safe(dst)) {
1838                 sk->sk_rx_dst = dst;
1839                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1840         }
1841 }
1842 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1843
1844 const struct inet_connection_sock_af_ops ipv4_specific = {
1845         .queue_xmit        = ip_queue_xmit,
1846         .send_check        = tcp_v4_send_check,
1847         .rebuild_header    = inet_sk_rebuild_header,
1848         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1849         .conn_request      = tcp_v4_conn_request,
1850         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1851         .net_header_len    = sizeof(struct iphdr),
1852         .setsockopt        = ip_setsockopt,
1853         .getsockopt        = ip_getsockopt,
1854         .addr2sockaddr     = inet_csk_addr2sockaddr,
1855         .sockaddr_len      = sizeof(struct sockaddr_in),
1856 #ifdef CONFIG_COMPAT
1857         .compat_setsockopt = compat_ip_setsockopt,
1858         .compat_getsockopt = compat_ip_getsockopt,
1859 #endif
1860         .mtu_reduced       = tcp_v4_mtu_reduced,
1861 };
1862 EXPORT_SYMBOL(ipv4_specific);
1863
1864 #ifdef CONFIG_TCP_MD5SIG
1865 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1866         .md5_lookup             = tcp_v4_md5_lookup,
1867         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1868         .md5_parse              = tcp_v4_parse_md5_keys,
1869 };
1870 #endif
1871
1872 /* NOTE: A lot of things are set to zero explicitly by the call to
1873  *       sk_alloc(), so they need not be done here.
1874  */
1875 static int tcp_v4_init_sock(struct sock *sk)
1876 {
1877         struct inet_connection_sock *icsk = inet_csk(sk);
1878
1879         tcp_init_sock(sk);
1880
1881         icsk->icsk_af_ops = &ipv4_specific;
1882
1883 #ifdef CONFIG_TCP_MD5SIG
1884         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1885 #endif
1886
1887         return 0;
1888 }
1889
1890 void tcp_v4_destroy_sock(struct sock *sk)
1891 {
1892         struct tcp_sock *tp = tcp_sk(sk);
1893
1894         trace_tcp_destroy_sock(sk);
1895
1896         tcp_clear_xmit_timers(sk);
1897
1898         tcp_cleanup_congestion_control(sk);
1899
1900         tcp_cleanup_ulp(sk);
1901
1902         /* Clean up the write buffer. */
1903         tcp_write_queue_purge(sk);
1904
1905         /* Check if we want to disable active TFO */
1906         tcp_fastopen_active_disable_ofo_check(sk);
1907
1908         /* Cleans up our, hopefully empty, out_of_order_queue. */
1909         skb_rbtree_purge(&tp->out_of_order_queue);
1910
1911 #ifdef CONFIG_TCP_MD5SIG
1912         /* Clean up the MD5 key list, if any */
1913         if (tp->md5sig_info) {
1914                 tcp_clear_md5_list(sk);
1915                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1916                 tp->md5sig_info = NULL;
1917         }
1918 #endif
1919
1920         /* Clean up a referenced TCP bind bucket. */
1921         if (inet_csk(sk)->icsk_bind_hash)
1922                 inet_put_port(sk);
1923
1924         BUG_ON(tp->fastopen_rsk);
1925
1926         /* If socket is aborted during connect operation */
1927         tcp_free_fastopen_req(tp);
1928         tcp_fastopen_destroy_cipher(sk);
1929         tcp_saved_syn_free(tp);
1930
1931         sk_sockets_allocated_dec(sk);
1932 }
1933 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1934
1935 #ifdef CONFIG_PROC_FS
1936 /* Proc filesystem TCP sock list dumping. */
1937
1938 /*
1939  * Get the next listener socket following cur.  If cur is NULL, get the first
1940  * socket starting from the bucket given in st->bucket; when st->bucket is
1941  * zero, the very first socket in the hash table is returned.
1942  */
1943 static void *listening_get_next(struct seq_file *seq, void *cur)
1944 {
1945         struct tcp_iter_state *st = seq->private;
1946         struct net *net = seq_file_net(seq);
1947         struct inet_listen_hashbucket *ilb;
1948         struct sock *sk = cur;
1949
1950         if (!sk) {
1951 get_head:
1952                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1953                 spin_lock(&ilb->lock);
1954                 sk = sk_head(&ilb->head);
1955                 st->offset = 0;
1956                 goto get_sk;
1957         }
1958         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1959         ++st->num;
1960         ++st->offset;
1961
1962         sk = sk_next(sk);
1963 get_sk:
1964         sk_for_each_from(sk) {
1965                 if (!net_eq(sock_net(sk), net))
1966                         continue;
1967                 if (sk->sk_family == st->family)
1968                         return sk;
1969         }
1970         spin_unlock(&ilb->lock);
1971         st->offset = 0;
1972         if (++st->bucket < INET_LHTABLE_SIZE)
1973                 goto get_head;
1974         return NULL;
1975 }
1976
1977 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1978 {
1979         struct tcp_iter_state *st = seq->private;
1980         void *rc;
1981
1982         st->bucket = 0;
1983         st->offset = 0;
1984         rc = listening_get_next(seq, NULL);
1985
1986         while (rc && *pos) {
1987                 rc = listening_get_next(seq, rc);
1988                 --*pos;
1989         }
1990         return rc;
1991 }
1992
1993 static inline bool empty_bucket(const struct tcp_iter_state *st)
1994 {
1995         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1996 }
1997
1998 /*
1999  * Get first established socket starting from bucket given in st->bucket.
2000  * If st->bucket is zero, the very first socket in the hash is returned.
2001  */
2002 static void *established_get_first(struct seq_file *seq)
2003 {
2004         struct tcp_iter_state *st = seq->private;
2005         struct net *net = seq_file_net(seq);
2006         void *rc = NULL;
2007
2008         st->offset = 0;
2009         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2010                 struct sock *sk;
2011                 struct hlist_nulls_node *node;
2012                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2013
2014                 /* Lockless fast path for the common case of empty buckets */
2015                 if (empty_bucket(st))
2016                         continue;
2017
2018                 spin_lock_bh(lock);
2019                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2020                         if (sk->sk_family != st->family ||
2021                             !net_eq(sock_net(sk), net)) {
2022                                 continue;
2023                         }
2024                         rc = sk;
2025                         goto out;
2026                 }
2027                 spin_unlock_bh(lock);
2028         }
2029 out:
2030         return rc;
2031 }
2032
2033 static void *established_get_next(struct seq_file *seq, void *cur)
2034 {
2035         struct sock *sk = cur;
2036         struct hlist_nulls_node *node;
2037         struct tcp_iter_state *st = seq->private;
2038         struct net *net = seq_file_net(seq);
2039
2040         ++st->num;
2041         ++st->offset;
2042
2043         sk = sk_nulls_next(sk);
2044
2045         sk_nulls_for_each_from(sk, node) {
2046                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2047                         return sk;
2048         }
2049
2050         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2051         ++st->bucket;
2052         return established_get_first(seq);
2053 }
2054
2055 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2056 {
2057         struct tcp_iter_state *st = seq->private;
2058         void *rc;
2059
2060         st->bucket = 0;
2061         rc = established_get_first(seq);
2062
2063         while (rc && pos) {
2064                 rc = established_get_next(seq, rc);
2065                 --pos;
2066         }
2067         return rc;
2068 }
2069
2070 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2071 {
2072         void *rc;
2073         struct tcp_iter_state *st = seq->private;
2074
2075         st->state = TCP_SEQ_STATE_LISTENING;
2076         rc        = listening_get_idx(seq, &pos);
2077
2078         if (!rc) {
2079                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2080                 rc        = established_get_idx(seq, pos);
2081         }
2082
2083         return rc;
2084 }
2085
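/* Resume a /proc/net/tcp walk at the bucket and in-bucket offset reached by
 * the previous read, so large socket tables are not rescanned from the start
 * on every read() call.
 */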
2086 static void *tcp_seek_last_pos(struct seq_file *seq)
2087 {
2088         struct tcp_iter_state *st = seq->private;
2089         int offset = st->offset;
2090         int orig_num = st->num;
2091         void *rc = NULL;
2092
2093         switch (st->state) {
2094         case TCP_SEQ_STATE_LISTENING:
2095                 if (st->bucket >= INET_LHTABLE_SIZE)
2096                         break;
2097                 st->state = TCP_SEQ_STATE_LISTENING;
2098                 rc = listening_get_next(seq, NULL);
2099                 while (offset-- && rc)
2100                         rc = listening_get_next(seq, rc);
2101                 if (rc)
2102                         break;
2103                 st->bucket = 0;
2104                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2105                 /* Fallthrough */
2106         case TCP_SEQ_STATE_ESTABLISHED:
2107                 if (st->bucket > tcp_hashinfo.ehash_mask)
2108                         break;
2109                 rc = established_get_first(seq);
2110                 while (offset-- && rc)
2111                         rc = established_get_next(seq, rc);
2112         }
2113
2114         st->num = orig_num;
2115
2116         return rc;
2117 }
2118
2119 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2120 {
2121         struct tcp_iter_state *st = seq->private;
2122         void *rc;
2123
2124         if (*pos && *pos == st->last_pos) {
2125                 rc = tcp_seek_last_pos(seq);
2126                 if (rc)
2127                         goto out;
2128         }
2129
2130         st->state = TCP_SEQ_STATE_LISTENING;
2131         st->num = 0;
2132         st->bucket = 0;
2133         st->offset = 0;
2134         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2135
2136 out:
2137         st->last_pos = *pos;
2138         return rc;
2139 }
2140
2141 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2142 {
2143         struct tcp_iter_state *st = seq->private;
2144         void *rc = NULL;
2145
2146         if (v == SEQ_START_TOKEN) {
2147                 rc = tcp_get_idx(seq, 0);
2148                 goto out;
2149         }
2150
2151         switch (st->state) {
2152         case TCP_SEQ_STATE_LISTENING:
2153                 rc = listening_get_next(seq, v);
2154                 if (!rc) {
2155                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2156                         st->bucket = 0;
2157                         st->offset = 0;
2158                         rc        = established_get_first(seq);
2159                 }
2160                 break;
2161         case TCP_SEQ_STATE_ESTABLISHED:
2162                 rc = established_get_next(seq, v);
2163                 break;
2164         }
2165 out:
2166         ++*pos;
2167         st->last_pos = *pos;
2168         return rc;
2169 }
2170
2171 static void tcp_seq_stop(struct seq_file *seq, void *v)
2172 {
2173         struct tcp_iter_state *st = seq->private;
2174
2175         switch (st->state) {
2176         case TCP_SEQ_STATE_LISTENING:
2177                 if (v != SEQ_START_TOKEN)
2178                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2179                 break;
2180         case TCP_SEQ_STATE_ESTABLISHED:
2181                 if (v)
2182                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2183                 break;
2184         }
2185 }
2186
2187 int tcp_seq_open(struct inode *inode, struct file *file)
2188 {
2189         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2190         struct tcp_iter_state *s;
2191         int err;
2192
2193         err = seq_open_net(inode, file, &afinfo->seq_ops,
2194                           sizeof(struct tcp_iter_state));
2195         if (err < 0)
2196                 return err;
2197
2198         s = ((struct seq_file *)file->private_data)->private;
2199         s->family               = afinfo->family;
2200         s->last_pos             = 0;
2201         return 0;
2202 }
2203 EXPORT_SYMBOL(tcp_seq_open);
2204
2205 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2206 {
2207         int rc = 0;
2208         struct proc_dir_entry *p;
2209
2210         afinfo->seq_ops.start           = tcp_seq_start;
2211         afinfo->seq_ops.next            = tcp_seq_next;
2212         afinfo->seq_ops.stop            = tcp_seq_stop;
2213
2214         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2215                              afinfo->seq_fops, afinfo);
2216         if (!p)
2217                 rc = -ENOMEM;
2218         return rc;
2219 }
2220 EXPORT_SYMBOL(tcp_proc_register);
2221
2222 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2223 {
2224         remove_proc_entry(afinfo->name, net->proc_net);
2225 }
2226 EXPORT_SYMBOL(tcp_proc_unregister);
2227
2228 static void get_openreq4(const struct request_sock *req,
2229                          struct seq_file *f, int i)
2230 {
2231         const struct inet_request_sock *ireq = inet_rsk(req);
2232         long delta = req->rsk_timer.expires - jiffies;
2233
2234         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2235                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2236                 i,
2237                 ireq->ir_loc_addr,
2238                 ireq->ir_num,
2239                 ireq->ir_rmt_addr,
2240                 ntohs(ireq->ir_rmt_port),
2241                 TCP_SYN_RECV,
2242                 0, 0, /* could print option size, but that is af dependent. */
2243                 1,    /* timers active (only the expire timer) */
2244                 jiffies_delta_to_clock_t(delta),
2245                 req->num_timeout,
2246                 from_kuid_munged(seq_user_ns(f),
2247                                  sock_i_uid(req->rsk_listener)),
2248                 0,  /* non standard timer */
2249                 0, /* open_requests have no inode */
2250                 0,
2251                 req);
2252 }
2253
2254 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2255 {
2256         int timer_active;
2257         unsigned long timer_expires;
2258         const struct tcp_sock *tp = tcp_sk(sk);
2259         const struct inet_connection_sock *icsk = inet_csk(sk);
2260         const struct inet_sock *inet = inet_sk(sk);
2261         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2262         __be32 dest = inet->inet_daddr;
2263         __be32 src = inet->inet_rcv_saddr;
2264         __u16 destp = ntohs(inet->inet_dport);
2265         __u16 srcp = ntohs(inet->inet_sport);
2266         int rx_queue;
2267         int state;
2268
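        /* timer_active encodes which timer is reported in /proc/net/tcp:
         * 1 = retransmit (also loss probe / RACK reordering timer),
         * 4 = zero-window probe, 2 = keepalive (sk_timer), 0 = none.
         */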
2269         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2270             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2271             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2272                 timer_active    = 1;
2273                 timer_expires   = icsk->icsk_timeout;
2274         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2275                 timer_active    = 4;
2276                 timer_expires   = icsk->icsk_timeout;
2277         } else if (timer_pending(&sk->sk_timer)) {
2278                 timer_active    = 2;
2279                 timer_expires   = sk->sk_timer.expires;
2280         } else {
2281                 timer_active    = 0;
2282                 timer_expires = jiffies;
2283         }
2284
2285         state = inet_sk_state_load(sk);
2286         if (state == TCP_LISTEN)
2287                 rx_queue = sk->sk_ack_backlog;
2288         else
2289                 /* Because we don't lock the socket,
2290                  * we might find a transient negative value.
2291                  */
2292                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2293
2294         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2295                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2296                 i, src, srcp, dest, destp, state,
2297                 tp->write_seq - tp->snd_una,
2298                 rx_queue,
2299                 timer_active,
2300                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2301                 icsk->icsk_retransmits,
2302                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2303                 icsk->icsk_probes_out,
2304                 sock_i_ino(sk),
2305                 refcount_read(&sk->sk_refcnt), sk,
2306                 jiffies_to_clock_t(icsk->icsk_rto),
2307                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2308                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2309                 tp->snd_cwnd,
2310                 state == TCP_LISTEN ?
2311                     fastopenq->max_qlen :
2312                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2313 }
2314
2315 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2316                                struct seq_file *f, int i)
2317 {
2318         long delta = tw->tw_timer.expires - jiffies;
2319         __be32 dest, src;
2320         __u16 destp, srcp;
2321
2322         dest  = tw->tw_daddr;
2323         src   = tw->tw_rcv_saddr;
2324         destp = ntohs(tw->tw_dport);
2325         srcp  = ntohs(tw->tw_sport);
2326
2327         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2328                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2329                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2330                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2331                 refcount_read(&tw->tw_refcnt), tw);
2332 }
2333
2334 #define TMPSZ 150
2335
2336 static int tcp4_seq_show(struct seq_file *seq, void *v)
2337 {
2338         struct tcp_iter_state *st;
2339         struct sock *sk = v;
2340
2341         seq_setwidth(seq, TMPSZ - 1);
2342         if (v == SEQ_START_TOKEN) {
2343                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2344                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2345                            "inode");
2346                 goto out;
2347         }
2348         st = seq->private;
2349
2350         if (sk->sk_state == TCP_TIME_WAIT)
2351                 get_timewait4_sock(v, seq, st->num);
2352         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2353                 get_openreq4(v, seq, st->num);
2354         else
2355                 get_tcp4_sock(v, seq, st->num);
2356 out:
2357         seq_pad(seq, '\n');
2358         return 0;
2359 }
2360
2361 static const struct file_operations tcp_afinfo_seq_fops = {
2362         .open    = tcp_seq_open,
2363         .read    = seq_read,
2364         .llseek  = seq_lseek,
2365         .release = seq_release_net
2366 };
2367
2368 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2369         .name           = "tcp",
2370         .family         = AF_INET,
2371         .seq_fops       = &tcp_afinfo_seq_fops,
2372         .seq_ops        = {
2373                 .show           = tcp4_seq_show,
2374         },
2375 };
2376
2377 static int __net_init tcp4_proc_init_net(struct net *net)
2378 {
2379         return tcp_proc_register(net, &tcp4_seq_afinfo);
2380 }
2381
2382 static void __net_exit tcp4_proc_exit_net(struct net *net)
2383 {
2384         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2385 }
2386
2387 static struct pernet_operations tcp4_net_ops = {
2388         .init = tcp4_proc_init_net,
2389         .exit = tcp4_proc_exit_net,
2390         .async = true,
2391 };
2392
2393 int __init tcp4_proc_init(void)
2394 {
2395         return register_pernet_subsys(&tcp4_net_ops);
2396 }
2397
2398 void tcp4_proc_exit(void)
2399 {
2400         unregister_pernet_subsys(&tcp4_net_ops);
2401 }
2402 #endif /* CONFIG_PROC_FS */
2403
2404 struct proto tcp_prot = {
2405         .name                   = "TCP",
2406         .owner                  = THIS_MODULE,
2407         .close                  = tcp_close,
2408         .connect                = tcp_v4_connect,
2409         .disconnect             = tcp_disconnect,
2410         .accept                 = inet_csk_accept,
2411         .ioctl                  = tcp_ioctl,
2412         .init                   = tcp_v4_init_sock,
2413         .destroy                = tcp_v4_destroy_sock,
2414         .shutdown               = tcp_shutdown,
2415         .setsockopt             = tcp_setsockopt,
2416         .getsockopt             = tcp_getsockopt,
2417         .keepalive              = tcp_set_keepalive,
2418         .recvmsg                = tcp_recvmsg,
2419         .sendmsg                = tcp_sendmsg,
2420         .sendpage               = tcp_sendpage,
2421         .backlog_rcv            = tcp_v4_do_rcv,
2422         .release_cb             = tcp_release_cb,
2423         .hash                   = inet_hash,
2424         .unhash                 = inet_unhash,
2425         .get_port               = inet_csk_get_port,
2426         .enter_memory_pressure  = tcp_enter_memory_pressure,
2427         .leave_memory_pressure  = tcp_leave_memory_pressure,
2428         .stream_memory_free     = tcp_stream_memory_free,
2429         .sockets_allocated      = &tcp_sockets_allocated,
2430         .orphan_count           = &tcp_orphan_count,
2431         .memory_allocated       = &tcp_memory_allocated,
2432         .memory_pressure        = &tcp_memory_pressure,
2433         .sysctl_mem             = sysctl_tcp_mem,
2434         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2435         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2436         .max_header             = MAX_TCP_HEADER,
2437         .obj_size               = sizeof(struct tcp_sock),
2438         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2439         .twsk_prot              = &tcp_timewait_sock_ops,
2440         .rsk_prot               = &tcp_request_sock_ops,
2441         .h.hashinfo             = &tcp_hashinfo,
2442         .no_autobind            = true,
2443 #ifdef CONFIG_COMPAT
2444         .compat_setsockopt      = compat_tcp_setsockopt,
2445         .compat_getsockopt      = compat_tcp_getsockopt,
2446 #endif
2447         .diag_destroy           = tcp_abort,
2448 };
2449 EXPORT_SYMBOL(tcp_prot);
2450
2451 static void __net_exit tcp_sk_exit(struct net *net)
2452 {
2453         int cpu;
2454
2455         module_put(net->ipv4.tcp_congestion_control->owner);
2456
2457         for_each_possible_cpu(cpu)
2458                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2459         free_percpu(net->ipv4.tcp_sk);
2460 }
2461
2462 static int __net_init tcp_sk_init(struct net *net)
2463 {
2464         int res, cpu, cnt;
2465
2466         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2467         if (!net->ipv4.tcp_sk)
2468                 return -ENOMEM;
2469
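        /* One control socket per possible CPU: used for replies such as
         * RSTs and ACKs that are not associated with a full user socket.
         */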
2470         for_each_possible_cpu(cpu) {
2471                 struct sock *sk;
2472
2473                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2474                                            IPPROTO_TCP, net);
2475                 if (res)
2476                         goto fail;
2477                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2478                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2479         }
2480
2481         net->ipv4.sysctl_tcp_ecn = 2;
2482         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2483
2484         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2485         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2486         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2487
2488         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2489         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2490         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2491
2492         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2493         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2494         net->ipv4.sysctl_tcp_syncookies = 1;
2495         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2496         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2497         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2498         net->ipv4.sysctl_tcp_orphan_retries = 0;
2499         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2500         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2501         net->ipv4.sysctl_tcp_tw_reuse = 0;
2502
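        /* Scale per-netns limits with the global established hash size:
         * time-wait buckets are capped at roughly half the ehash capacity
         * and the SYN backlog at 1/256th of it (but at least 128).
         */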
2503         cnt = tcp_hashinfo.ehash_mask + 1;
2504         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2505         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2506
2507         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2508         net->ipv4.sysctl_tcp_sack = 1;
2509         net->ipv4.sysctl_tcp_window_scaling = 1;
2510         net->ipv4.sysctl_tcp_timestamps = 1;
2511         net->ipv4.sysctl_tcp_early_retrans = 3;
2512         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2513         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2514         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2515         net->ipv4.sysctl_tcp_max_reordering = 300;
2516         net->ipv4.sysctl_tcp_dsack = 1;
2517         net->ipv4.sysctl_tcp_app_win = 31;
2518         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2519         net->ipv4.sysctl_tcp_frto = 2;
2520         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2521         /* This limits the percentage of the congestion window which we
2522          * will allow a single TSO frame to consume.  Building TSO frames
2523          * which are too large can cause TCP streams to be bursty.
2524          */
2525         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2526         /* Default TSQ limit of four TSO segments */
2527         net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2528         /* rfc5961 challenge ack rate limiting */
2529         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2530         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2531         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2532         net->ipv4.sysctl_tcp_autocorking = 1;
2533         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2534         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2535         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2536         if (net != &init_net) {
2537                 memcpy(net->ipv4.sysctl_tcp_rmem,
2538                        init_net.ipv4.sysctl_tcp_rmem,
2539                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2540                 memcpy(net->ipv4.sysctl_tcp_wmem,
2541                        init_net.ipv4.sysctl_tcp_wmem,
2542                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2543         }
2544         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2545         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2546         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2547         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2548
2549         /* Reno is always built in */
2550         if (!net_eq(net, &init_net) &&
2551             try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2552                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2553         else
2554                 net->ipv4.tcp_congestion_control = &tcp_reno;
2555
2556         return 0;
2557 fail:
2558         tcp_sk_exit(net);
2559
2560         return res;
2561 }
2562
2563 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2564 {
2565         struct net *net;
2566
2567         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2568
2569         list_for_each_entry(net, net_exit_list, exit_list)
2570                 tcp_fastopen_ctx_destroy(net);
2571 }
2572
2573 static struct pernet_operations __net_initdata tcp_sk_ops = {
2574        .init       = tcp_sk_init,
2575        .exit       = tcp_sk_exit,
2576        .exit_batch = tcp_sk_exit_batch,
2577        .async      = true,
2578 };
2579
2580 void __init tcp_v4_init(void)
2581 {
2582         if (register_pernet_subsys(&tcp_sk_ops))
2583                 panic("Failed to create the TCP control socket.\n");
2584 }