net/ipv4/tcp_ipv4.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol (TCP).
 *
 *              IPv4 specific functions
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 *                                      coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to
 *                                      bind to a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

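/*
 * Initial sequence numbers and timestamp offsets are derived from a
 * keyed hash over the connection's addresses and ports (in the spirit
 * of RFC 6528), so they are hard for an off-path attacker to predict
 * yet stable for a given flow.
 */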
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
        return secure_tcp_seq(ip_hdr(skb)->daddr,
                              ip_hdr(skb)->saddr,
                              tcp_hdr(skb)->dest,
                              tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
        return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct inet_timewait_sock *tw = inet_twsk(sktw);
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);
        int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

        if (reuse == 2) {
                /* Still does not detect *everything* that goes through
                 * lo, since we require a loopback src or dst address
                 * or direct binding to 'lo' interface.
                 */
                bool loopback = false;
                if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
                        loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
                if (tw->tw_family == AF_INET6) {
                        if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
                            ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
                            ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
                            ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
                                loopback = true;
                } else
#endif
                {
                        if (ipv4_is_loopback(tw->tw_daddr) ||
                            ipv4_is_loopback(tw->tw_rcv_saddr))
                                loopback = true;
                }
                if (!loopback)
                        reuse = 0;
        }

        /* With PAWS, it is safe from the viewpoint of data integrity.
         * Even without PAWS it is safe provided sequence spaces do not
         * overlap i.e. at data rates <= 80Mbit/sec.
         *
         * Actually, the idea is close to VJ's one, only timestamp cache
         * is held not per host, but per port pair and TW bucket is used
         * as state holder.
         *
         * If TW bucket has been already destroyed we fall back to VJ's
         * scheme and use initial timestamp retrieved from peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (!twp || (reuse && time_after32(ktime_get_seconds(),
                                            tcptw->tw_ts_recent_stamp)))) {
                /* In case of repair and re-using TIME-WAIT sockets we still
                 * want to be sure that it is safe as above but honor the
                 * sequence numbers and time stamps set as part of the repair
                 * process.
                 *
                 * Without this check re-using a TIME-WAIT socket with TCP
                 * repair would accumulate a -1 on the repair assigned
                 * sequence number. The first time it is reused the sequence
                 * is -1, the second time -2, etc. This fixes that issue
                 * without appearing to create any others.
                 */
                if (likely(!tp->repair)) {
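                        /* Pick a write_seq comfortably above anything the
                         * old TIME-WAIT connection could still have in
                         * flight: its snd_nxt plus the maximum unscaled
                         * window, plus 2. A write_seq of 0 would mean
                         * "choose a fresh ISN" in tcp_v4_connect(), so it
                         * is bumped to 1.
                         */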
                        u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

                        if (!seq)
                                seq = 1;
                        WRITE_ONCE(tp->write_seq, seq);
                        tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                        tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                }
                sock_hold(sktw);
                return 1;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
                              int addr_len)
{
        /* This check is replicated from tcp_v4_connect() and intended to
         * prevent the BPF program called below from accessing bytes that
         * are outside of the bounds specified by the user in addr_len.
         */
        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        sock_owned_by_me(sk);

        return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        __be16 orig_sport, orig_dport;
        __be32 daddr, nexthop;
        struct flowi4 *fl4;
        struct rtable *rt;
        int err;
        struct ip_options_rcu *inet_opt;
        struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        inet_opt = rcu_dereference_protected(inet->inet_opt,
                                             lockdep_sock_is_held(sk));
        if (inet_opt && inet_opt->opt.srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet_opt->opt.faddr;
        }

        orig_sport = inet->inet_sport;
        orig_dport = usin->sin_port;
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                              IPPROTO_TCP,
                              orig_sport, orig_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                if (err == -ENETUNREACH)
                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
                return err;
        }

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet_opt || !inet_opt->opt.srr)
                daddr = fl4->daddr;

        if (!inet->inet_saddr)
                inet->inet_saddr = fl4->saddr;
        sk_rcv_saddr_set(sk, inet->inet_saddr);

        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                if (likely(!tp->repair))
                        WRITE_ONCE(tp->write_seq, 0);
        }

        inet->inet_dport = usin->sin_port;
        sk_daddr_set(sk, daddr);

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and, without releasing the
         * socket lock, select a source port, enter ourselves into the
         * hash tables and complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(tcp_death_row, sk);
        if (err)
                goto failure;

        sk_set_txhash(sk);

        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                               inet->inet_sport, inet->inet_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                rt = NULL;
                goto failure;
        }
        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->dst);
        rt = NULL;

        if (likely(!tp->repair)) {
                if (!tp->write_seq)
                        WRITE_ONCE(tp->write_seq,
                                   secure_tcp_seq(inet->inet_saddr,
                                                  inet->inet_daddr,
                                                  inet->inet_sport,
                                                  usin->sin_port));
                tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
                                                 inet->inet_saddr,
                                                 inet->inet_daddr);
        }

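        /* Randomize the IP ID base so kernel-generated IDs on this
         * socket do not expose a predictable counter.
         */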
        inet->inet_id = prandom_u32();

        if (tcp_fastopen_defer_connect(sk, &err))
                return err;
        if (err)
                goto failure;

        err = tcp_connect(sk);

        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
        return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct dst_entry *dst;
        u32 mtu;

        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
                return;
        mtu = tcp_sk(sk)->mtu_info;
        dst = inet_csk_update_pmtu(sk, mtu);
        if (!dst)
                return;

        /* Something is about to go wrong... Remember the soft error
         * for the case this connection will not be able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            ip_sk_accept_pmtu(sk) &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_check(sk, 0);

        if (dst)
                dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
        struct request_sock *req = inet_reqsk(sk);
        struct net *net = sock_net(sk);

        /* ICMPs are not backlogged, hence we cannot get
         * an established socket here.
         */
        if (seq != tcp_rsk(req)->snt_isn) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
        } else if (abort) {
                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
                tcp_listendrop(req->rsk_listener);
        }
        reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
        struct inet_connection_sock *icsk;
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(icmp_skb)->type;
        const int code = icmp_hdr(icmp_skb)->code;
        struct sock *sk;
        struct sk_buff *skb;
        struct request_sock *fastopen;
        u32 seq, snd_una;
        s32 remaining;
        u32 delta_us;
        int err;
        struct net *net = dev_net(icmp_skb->dev);

        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
                                       th->dest, iph->saddr, ntohs(th->source),
                                       inet_iif(icmp_skb), 0);
        if (!sk) {
                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
                return -ENOENT;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return 0;
        }
        seq = ntohl(th->seq);
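        /* A request socket is dropped only for errors that make the
         * handshake hopeless: parameter problems, TTL expiry, or
         * net/host unreachable. Other errors are ignored, since
         * accept() has no way to report them (see tcp_req_err()).
         */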
        if (sk->sk_state == TCP_NEW_SYN_RECV) {
                tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
                                     type == ICMP_TIME_EXCEEDED ||
                                     (type == ICMP_DEST_UNREACH &&
                                      (code == ICMP_NET_UNREACH ||
                                       code == ICMP_HOST_UNREACH)));
                return 0;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         * We do take care of PMTU discovery (RFC1191) special case :
         * we can receive locally generated ICMP messages while socket is held.
         */
        if (sock_owned_by_user(sk)) {
                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
        }
        if (sk->sk_state == TCP_CLOSE)
                goto out;

        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
                goto out;
        }

        icsk = inet_csk(sk);
        tp = tcp_sk(sk);
        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
        fastopen = rcu_dereference(tp->fastopen_rsk);
        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, snd_una, tp->snd_nxt)) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_REDIRECT:
                if (!sock_owned_by_user(sk))
                        do_redirect(icmp_skb, sk);
                goto out;
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        /* We are not interested in TCP_LISTEN and open_requests
                         * (SYN-ACKs sent out by Linux are always < 576 bytes, so
                         * they should go through unfragmented).
                         */
                        if (sk->sk_state == TCP_LISTEN)
                                goto out;

                        tp->mtu_info = info;
                        if (!sock_owned_by_user(sk)) {
                                tcp_v4_mtu_reduced(sk);
                        } else {
                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
                                        sock_hold(sk);
                        }
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                /* Check whether this ICMP message allows reverting the RTO
                 * backoff (see draft-zimmermann-tcp-lcd).
                 */
                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
                        break;
                if (seq != tp->snd_una || !icsk->icsk_retransmits ||
                    !icsk->icsk_backoff || fastopen)
                        break;

                if (sock_owned_by_user(sk))
                        break;

                skb = tcp_rtx_queue_head(sk);
                if (WARN_ON_ONCE(!skb))
                        break;

                icsk->icsk_backoff--;
                icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
                                               TCP_TIMEOUT_INIT;
                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

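                /* One backoff step was undone above; re-arm the timer with
                 * whatever remains of the reverted RTO, measured from the
                 * transmit time of the oldest unacked skb.
                 */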
                tcp_mstamp_refresh(tp);
                delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
                remaining = icsk->icsk_rto -
                            usecs_to_jiffies(delta_us);

                if (remaining > 0) {
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  remaining, TCP_RTO_MAX);
                } else {
                        /* RTO revert clocked out retransmission.
                         * Will retransmit now.
                         */
                        tcp_retransmit_timer(sk);
                }

                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
        case TCP_SYN_SENT:
        case TCP_SYN_RECV:
                /* Only in fast or simultaneous open. If a fast open socket
                 * is already accepted, it is treated as a connected one below.
                 */
                if (fastopen && !fastopen->sk)
                        break;

                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows considering as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note that in the modern internet, where routing is unreliable
         * and broken firewalls sit in every dark corner sending random
         * errors ordered by their masters, even these two messages have
         * finally lost their original sense (even Linux sends invalid
         * PORT_UNREACHs).
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
        return 0;
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct tcphdr *th = tcp_hdr(skb);

        th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
        skb->csum_start = skb_transport_header(skb) - skb->head;
        skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
        const struct inet_sock *inet = inet_sk(sk);

        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *                    for reset.
 *      Answer: if a packet caused an RST, it is not for a socket
 *              existing in our system; if it is matched to a socket,
 *              it is just a duplicate segment or a bug in the other
 *              side's TCP. So we build the reply based only on the
 *              parameters that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key = NULL;
        const __u8 *hash_location = NULL;
        unsigned char newhash[16];
        int genhash;
        struct sock *sk1 = NULL;
#endif
        u64 transmit_time = 0;
        struct sock *ctl_sk;
        struct net *net;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        /* If sk not NULL, it means we did a successful lookup and incoming
         * route had to be correct. prequeue might have dropped our dst.
         */
        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

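        /* Per RFC 793: if the incoming segment carries an ACK, the RST
         * reuses its ack number as our sequence; otherwise we ACK every
         * sequence number the segment occupied (data plus SYN/FIN).
         */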
        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
        rcu_read_lock();
        hash_location = tcp_parse_md5sig_option(th);
        if (sk && sk_fullsock(sk)) {
                key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
        } else if (hash_location) {
                /*
                 * The active side is gone. Try to find the listening socket
                 * via the source port, then find the md5 key through that
                 * listening socket. We are not loosening security here:
                 * the incoming packet is checked against the md5 hash with
                 * the key we find, and no RST is generated if the md5 hash
                 * doesn't match.
                 */
                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
                                             ip_hdr(skb)->saddr,
                                             th->source, ip_hdr(skb)->daddr,
                                             ntohs(th->source), inet_iif(skb),
                                             tcp_v4_sdif(skb));
                /* don't send rst if it can't find key */
                if (!sk1)
                        goto out;

                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
                if (!key)
                        goto out;

                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
                if (genhash || memcmp(hash_location, newhash, 16) != 0)
                        goto out;
        }

        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
                                     key, ip_hdr(skb)->saddr,
                                     ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

        /* When the socket is gone, all binding information is lost.
         * Routing might fail in this case. No choice here: if we choose
         * to force the input interface, we will misroute in case of
         * asymmetric routes.
         */
        if (sk) {
                arg.bound_dev_if = sk->sk_bound_dev_if;
                if (sk_fullsock(sk))
                        trace_tcp_send_reset(sk, skb);
        }

        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

        arg.tos = ip_hdr(skb)->tos;
        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
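        /* Replies go out through a per-cpu control socket, so mark,
         * priority and transmit time must be copied over for each reply.
         */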
        ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
        if (sk) {
                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
                ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
                                   inet_twsk(sk)->tw_priority : sk->sk_priority;
                transmit_time = tcp_transmit_time(sk);
        }
        ip_send_unicast_reply(ctl_sk,
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len,
                              transmit_time);

        ctl_sk->sk_mark = 0;
        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
        local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
        rcu_read_unlock();
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside socket context, is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags, u8 tos)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct net *net = sock_net(sk);
        struct ip_reply_arg arg;
        struct sock *ctl_sk;
        u64 transmit_time;

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (tsecr) {
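                /* Timestamp option, preceded by two NOPs so the option
                 * block stays 32-bit aligned: NOP NOP kind len val ecr.
                 */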
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tsval);
                rep.opt[2] = htonl(tsecr);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        if (key) {
                int offset = (tsecr) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
                                    key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.flags = reply_flags;
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (oif)
                arg.bound_dev_if = oif;
        arg.tos = tos;
        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
        ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
        ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
                           inet_twsk(sk)->tw_mark : sk->sk_mark;
        ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
                           inet_twsk(sk)->tw_priority : sk->sk_priority;
        transmit_time = tcp_transmit_time(sk);
        ip_send_unicast_reply(ctl_sk,
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len,
                              transmit_time);

        ctl_sk->sk_mark = 0;
        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(sk, skb,
                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcp_time_stamp_raw() + tcptw->tw_ts_offset,
                        tcptw->tw_ts_recent,
                        tw->tw_bound_dev_if,
                        tcp_twsk_md5_key(tcptw),
                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        tw->tw_tos
                        );

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req)
{
        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
         */
        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
                                             tcp_sk(sk)->snd_nxt;

        /* RFC 7323 2.3
         * The window field (SEG.WND) of every outgoing segment, with the
         * exception of <SYN> segments, MUST be right-shifted by
         * Rcv.Wind.Shift bits:
         */
        tcp_v4_send_ack(sk, skb, seq,
                        tcp_rsk(req)->rcv_nxt,
                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
                        tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
                        req->ts_recent,
                        0,
                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
                                          AF_INET),
                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        ip_hdr(skb)->tos);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
                              struct flowi *fl,
                              struct request_sock *req,
                              struct tcp_fastopen_cookie *foc,
                              enum tcp_synack_type synack_type)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct flowi4 fl4;
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
                return -1;

        skb = tcp_make_synack(sk, dst, req, foc, synack_type);

        if (skb) {
                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

                rcu_read_lock();
                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
                                            ireq->ir_rmt_addr,
                                            rcu_dereference(ireq->ireq_opt));
                rcu_read_unlock();
                err = net_xmit_eval(err);
        }

        return err;
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
                                           const union tcp_md5_addr *addr,
                                           int family)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        const struct tcp_md5sig_info *md5sig;
        __be32 mask;
        struct tcp_md5sig_key *best_match = NULL;
        bool match;

        /* caller either holds rcu_read_lock() or socket lock */
        md5sig = rcu_dereference_check(tp->md5sig_info,
                                       lockdep_sock_is_held(sk));
        if (!md5sig)
                return NULL;

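        /* Scan the whole list and prefer the most specific match,
         * i.e. the matching key with the longest prefix.
         */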
        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
                if (key->family != family)
                        continue;

                if (family == AF_INET) {
                        mask = inet_make_mask(key->prefixlen);
                        match = (key->addr.a4.s_addr & mask) ==
                                (addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
                } else if (family == AF_INET6) {
                        match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
                                                  key->prefixlen);
#endif
                } else {
                        match = false;
                }

                if (match && (!best_match ||
                              key->prefixlen > best_match->prefixlen))
                        best_match = key;
        }
        return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
                                                      const union tcp_md5_addr *addr,
                                                      int family, u8 prefixlen)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        unsigned int size = sizeof(struct in_addr);
        const struct tcp_md5sig_info *md5sig;

        /* caller either holds rcu_read_lock() or socket lock */
        md5sig = rcu_dereference_check(tp->md5sig_info,
                                       lockdep_sock_is_held(sk));
        if (!md5sig)
                return NULL;
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                size = sizeof(struct in6_addr);
#endif
        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
                if (key->family != family)
                        continue;
                if (!memcmp(&key->addr, addr, size) &&
                    key->prefixlen == prefixlen)
                        return key;
        }
        return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
                                         const struct sock *addr_sk)
{
        const union tcp_md5_addr *addr;

        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
        return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
                   gfp_t gfp)
{
        /* Add Key to the list */
        struct tcp_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_info *md5sig;

        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
        if (key) {
                /* Pre-existing entry - just update that one. */
                memcpy(key->key, newkey, newkeylen);
                key->keylen = newkeylen;
                return 0;
        }

        md5sig = rcu_dereference_protected(tp->md5sig_info,
                                           lockdep_sock_is_held(sk));
        if (!md5sig) {
                md5sig = kmalloc(sizeof(*md5sig), gfp);
                if (!md5sig)
                        return -ENOMEM;

                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                INIT_HLIST_HEAD(&md5sig->head);
                rcu_assign_pointer(tp->md5sig_info, md5sig);
        }

        key = sock_kmalloc(sk, sizeof(*key), gfp);
        if (!key)
                return -ENOMEM;
        if (!tcp_alloc_md5sig_pool()) {
                sock_kfree_s(sk, key, sizeof(*key));
                return -ENOMEM;
        }

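        /* Fill in the key completely before publishing it; readers
         * walk this list under RCU without the socket lock.
         */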
        memcpy(key->key, newkey, newkeylen);
        key->keylen = newkeylen;
        key->family = family;
        key->prefixlen = prefixlen;
        memcpy(&key->addr, addr,
               (family == AF_INET6) ? sizeof(struct in6_addr) :
                                      sizeof(struct in_addr));
        hlist_add_head_rcu(&key->node, &md5sig->head);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
                   u8 prefixlen)
{
        struct tcp_md5sig_key *key;

        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
        if (!key)
                return -ENOENT;
        hlist_del_rcu(&key->node);
        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
        kfree_rcu(key, rcu);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        struct hlist_node *n;
        struct tcp_md5sig_info *md5sig;

        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
                hlist_del_rcu(&key->node);
                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
                kfree_rcu(key, rcu);
        }
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
                                 char __user *optval, int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
        u8 prefixlen = 32;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (optname == TCP_MD5SIG_EXT &&
            cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
                prefixlen = cmd.tcpm_prefixlen;
                if (prefixlen > 32)
                        return -EINVAL;
        }

        if (!cmd.tcpm_keylen)
                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                                      AF_INET, prefixlen);

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                              AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
                              GFP_KERNEL);
}

static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
                                   __be32 daddr, __be32 saddr,
                                   const struct tcphdr *th, int nbytes)
{
        struct tcp4_pseudohdr *bp;
        struct scatterlist sg;
        struct tcphdr *_th;

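        /* RFC 2385 digests a pseudo-header (addresses, protocol, segment
         * length) followed by the TCP header with its checksum zeroed;
         * callers then feed in the payload and finally the key.
         */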
        bp = hp->scratch;
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = IPPROTO_TCP;
        bp->len = cpu_to_be16(nbytes);

        _th = (struct tcphdr *)(bp + 1);
        memcpy(_th, th, sizeof(*th));
        _th->check = 0;

        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
                                sizeof(*bp) + sizeof(*th));
        return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;
        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
                        const struct sock *sk,
                        const struct sk_buff *skb)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;
        const struct tcphdr *th = tcp_hdr(skb);
        __be32 saddr, daddr;
        if (sk) { /* valid for established/request sockets */
                saddr = sk->sk_rcv_saddr;
                daddr = sk->sk_daddr;
        } else {
                const struct iphdr *iph = ip_hdr(skb);
                saddr = iph->saddr;
                daddr = iph->daddr;
        }

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;

        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
                goto clear_hash;
        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
                                    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
        /*
         * This gets called for each TCP segment that arrives,
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        const __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        const struct iphdr *iph = ip_hdr(skb);
        const struct tcphdr *th = tcp_hdr(skb);
        int genhash;
        unsigned char newhash[16];

        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
                                          AF_INET);
        hash_location = tcp_parse_md5sig_option(th);

        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return false;

        if (hash_expected && !hash_location) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
                return true;
        }

        if (!hash_expected && hash_location) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
                return true;
        }

        /* Okay, so this is hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_md5_hash_skb(newhash,
                                      hash_expected,
                                      NULL, skb);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
                                     &iph->saddr, ntohs(th->source),
                                     &iph->daddr, ntohs(th->dest),
                                     genhash ? " tcp_v4_calc_md5_hash failed"
                                     : "");
                return true;
        }
        return false;
#endif
        return false;
}

static void tcp_v4_init_req(struct request_sock *req,
                            const struct sock *sk_listener,
                            struct sk_buff *skb)
{
        struct inet_request_sock *ireq = inet_rsk(req);
        struct net *net = sock_net(sk_listener);

        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
        RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
                                          struct flowi *fl,
                                          const struct request_sock *req)
{
        return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_rtx_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
        .syn_ack_timeout =      tcp_syn_ack_timeout,
};

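/* AF_INET-specific hooks used by the protocol-independent
 * tcp_conn_request() path.
 */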
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .mss_clamp      =       TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
        .req_md5_lookup =       tcp_v4_md5_lookup,
        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
#endif
        .init_req       =       tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
        .cookie_init_seq =      cookie_v4_init_sequence,
#endif
        .route_req      =       tcp_v4_route_req,
        .init_seq       =       tcp_v4_init_seq,
        .init_ts_off    =       tcp_v4_init_ts_off,
        .send_synack    =       tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
1393         /* Never answer SYNs sent to broadcast or multicast addresses */
1394         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1395                 goto drop;
1396
1397         return tcp_conn_request(&tcp_request_sock_ops,
1398                                 &tcp_request_sock_ipv4_ops, sk, skb);
1399
1400 drop:
1401         tcp_listendrop(sk);
1402         return 0;
1403 }
1404 EXPORT_SYMBOL(tcp_v4_conn_request);
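
#if 0	/* Editor's example, not part of the kernel build. */
/*
 * Userspace counterpart of the passive open handled above: listen()
 * drives incoming SYNs into tcp_v4_conn_request(), and accept() later
 * returns the child socket created by tcp_v4_syn_recv_sock() below.
 * Minimal sketch; error handling is trimmed.
 */
#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

static int passive_open(unsigned short port)
{
        struct sockaddr_in addr;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_addr.s_addr = htonl(INADDR_ANY);
        addr.sin_port = htons(port);
        bind(fd, (struct sockaddr *)&addr, sizeof(addr));
        listen(fd, 128);		/* backlog vs. sk_acceptq_is_full() */
        return accept(fd, NULL, NULL);	/* hands back the child socket */
}
#endif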
1405
1407 /*
1408  * The three-way handshake has completed - we received the final
1409  * valid ACK - now create the new socket.
1410  */
1411 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1412                                   struct request_sock *req,
1413                                   struct dst_entry *dst,
1414                                   struct request_sock *req_unhash,
1415                                   bool *own_req)
1416 {
1417         struct inet_request_sock *ireq;
1418         struct inet_sock *newinet;
1419         struct tcp_sock *newtp;
1420         struct sock *newsk;
1421 #ifdef CONFIG_TCP_MD5SIG
1422         struct tcp_md5sig_key *key;
1423 #endif
1424         struct ip_options_rcu *inet_opt;
1425
1426         if (sk_acceptq_is_full(sk))
1427                 goto exit_overflow;
1428
1429         newsk = tcp_create_openreq_child(sk, req, skb);
1430         if (!newsk)
1431                 goto exit_nonewsk;
1432
1433         newsk->sk_gso_type = SKB_GSO_TCPV4;
1434         inet_sk_rx_dst_set(newsk, skb);
1435
1436         newtp                 = tcp_sk(newsk);
1437         newinet               = inet_sk(newsk);
1438         ireq                  = inet_rsk(req);
1439         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1440         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1441         newsk->sk_bound_dev_if = ireq->ir_iif;
1442         newinet->inet_saddr   = ireq->ir_loc_addr;
1443         inet_opt              = rcu_dereference(ireq->ireq_opt);
1444         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1445         newinet->mc_index     = inet_iif(skb);
1446         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1447         newinet->rcv_tos      = ip_hdr(skb)->tos;
1448         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1449         if (inet_opt)
1450                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1451         newinet->inet_id = prandom_u32();
1452
1453         if (!dst) {
1454                 dst = inet_csk_route_child_sock(sk, newsk, req);
1455                 if (!dst)
1456                         goto put_and_exit;
1457         } else {
1458                 /* syncookie case: see end of cookie_v4_check() */
1459         }
1460         sk_setup_caps(newsk, dst);
1461
1462         tcp_ca_openreq_child(newsk, dst);
1463
1464         tcp_sync_mss(newsk, dst_mtu(dst));
1465         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1466
1467         tcp_initialize_rcv_mss(newsk);
1468
1469 #ifdef CONFIG_TCP_MD5SIG
1470         /* Copy over the MD5 key from the original socket */
1471         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1472                                 AF_INET);
1473         if (key) {
1474                 /*
1475                  * We're using one, so create a matching key
1476                  * on the newsk structure. If we fail to get
1477                  * memory, then we end up not copying the key
1478                  * across. Shucks.
1479                  */
1480                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1481                                AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1482                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1483         }
1484 #endif
1485
1486         if (__inet_inherit_port(sk, newsk) < 0)
1487                 goto put_and_exit;
1488         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1489         if (likely(*own_req)) {
1490                 tcp_move_syn(newtp, req);
1491                 ireq->ireq_opt = NULL;
1492         } else {
1493                 newinet->inet_opt = NULL;
1494         }
1495         return newsk;
1496
1497 exit_overflow:
1498         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1499 exit_nonewsk:
1500         dst_release(dst);
1501 exit:
1502         tcp_listendrop(sk);
1503         return NULL;
1504 put_and_exit:
1505         newinet->inet_opt = NULL;
1506         inet_csk_prepare_forced_close(newsk);
1507         tcp_done(newsk);
1508         goto exit;
1509 }
1510 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1511
1512 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1513 {
1514 #ifdef CONFIG_SYN_COOKIES
1515         const struct tcphdr *th = tcp_hdr(skb);
1516
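        /* Under SYN flood the listener answered with a cookie-encoded
         * sequence number and kept no state; only the returning ACK
         * (hence !th->syn) is decoded here to rebuild the request.
         */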
1517         if (!th->syn)
1518                 sk = cookie_v4_check(sk, skb);
1519 #endif
1520         return sk;
1521 }
1522
1523 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1524                          struct tcphdr *th, u32 *cookie)
1525 {
1526         u16 mss = 0;
1527 #ifdef CONFIG_SYN_COOKIES
1528         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1529                                     &tcp_request_sock_ipv4_ops, sk, th);
1530         if (mss) {
1531                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1532                 tcp_synq_overflow(sk);
1533         }
1534 #endif
1535         return mss;
1536 }
1537
1538 /* The socket must have its spinlock held when we get
1539  * here, unless it is a TCP_LISTEN socket.
1540  *
1541  * We have a potential double-lock case here, so even when
1542  * doing backlog processing we use the BH locking scheme.
1543  * This is because we cannot sleep with the original spinlock
1544  * held.
1545  */
1546 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1547 {
1548         struct sock *rsk;
1549
1550         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1551                 struct dst_entry *dst = sk->sk_rx_dst;
1552
1553                 sock_rps_save_rxhash(sk, skb);
1554                 sk_mark_napi_id(sk, skb);
1555                 if (dst) {
1556                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1557                             !dst->ops->check(dst, 0)) {
1558                                 dst_release(dst);
1559                                 sk->sk_rx_dst = NULL;
1560                         }
1561                 }
1562                 tcp_rcv_established(sk, skb);
1563                 return 0;
1564         }
1565
1566         if (tcp_checksum_complete(skb))
1567                 goto csum_err;
1568
1569         if (sk->sk_state == TCP_LISTEN) {
1570                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1571
1572                 if (!nsk)
1573                         goto discard;
1574                 if (nsk != sk) {
1575                         if (tcp_child_process(sk, nsk, skb)) {
1576                                 rsk = nsk;
1577                                 goto reset;
1578                         }
1579                         return 0;
1580                 }
1581         } else
1582                 sock_rps_save_rxhash(sk, skb);
1583
1584         if (tcp_rcv_state_process(sk, skb)) {
1585                 rsk = sk;
1586                 goto reset;
1587         }
1588         return 0;
1589
1590 reset:
1591         tcp_v4_send_reset(rsk, skb);
1592 discard:
1593         kfree_skb(skb);
1594         /* Be careful here. If this function gets more complicated and
1595          * gcc suffers from register pressure on the x86, sk (in %ebx)
1596          * might be destroyed here. This current version compiles correctly,
1597          * but you have been warned.
1598          */
1599         return 0;
1600
1601 csum_err:
1602         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1603         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1604         goto discard;
1605 }
1606 EXPORT_SYMBOL(tcp_v4_do_rcv);
1607
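/* Early demux: on the receive fast path, look up the established socket
 * before routing, so that a dst cached on the socket can be reused and
 * the full lookup in tcp_v4_rcv() can be skipped via skb->sk.
 */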
1608 int tcp_v4_early_demux(struct sk_buff *skb)
1609 {
1610         const struct iphdr *iph;
1611         const struct tcphdr *th;
1612         struct sock *sk;
1613
1614         if (skb->pkt_type != PACKET_HOST)
1615                 return 0;
1616
1617         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1618                 return 0;
1619
1620         iph = ip_hdr(skb);
1621         th = tcp_hdr(skb);
1622
1623         if (th->doff < sizeof(struct tcphdr) / 4)
1624                 return 0;
1625
1626         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1627                                        iph->saddr, th->source,
1628                                        iph->daddr, ntohs(th->dest),
1629                                        skb->skb_iif, inet_sdif(skb));
1630         if (sk) {
1631                 skb->sk = sk;
1632                 skb->destructor = sock_edemux;
1633                 if (sk_fullsock(sk)) {
1634                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1635
1636                         if (dst)
1637                                 dst = dst_check(dst, 0);
1638                         if (dst &&
1639                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1640                                 skb_dst_set_noref(skb, dst);
1641                 }
1642         }
1643         return 0;
1644 }
1645
1646 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1647 {
1648         u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1649         struct skb_shared_info *shinfo;
1650         const struct tcphdr *th;
1651         struct tcphdr *thtail;
1652         struct sk_buff *tail;
1653         unsigned int hdrlen;
1654         bool fragstolen;
1655         u32 gso_segs;
1656         int delta;
1657
1658         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1659          * we can fix skb->truesize to its real value to avoid future drops.
1660          * This is valid because skb is not yet charged to the socket.
1661          * It has been observed that pure SACK packets were sometimes dropped
1662          * (when built by drivers lacking the copybreak feature).
1663          */
1664         skb_condense(skb);
1665
1666         skb_dst_drop(skb);
1667
1668         if (unlikely(tcp_checksum_complete(skb))) {
1669                 bh_unlock_sock(sk);
1670                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1671                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1672                 return true;
1673         }
1674
1675         /* Attempt coalescing to last skb in backlog, even if we are
1676          * above the limits.
1677          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1678          */
1679         th = (const struct tcphdr *)skb->data;
1680         hdrlen = th->doff * 4;
1681         shinfo = skb_shinfo(skb);
1682
1683         if (!shinfo->gso_size)
1684                 shinfo->gso_size = skb->len - hdrlen;
1685
1686         if (!shinfo->gso_segs)
1687                 shinfo->gso_segs = 1;
1688
1689         tail = sk->sk_backlog.tail;
1690         if (!tail)
1691                 goto no_coalesce;
1692         thtail = (struct tcphdr *)tail->data;
1693
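        /* Coalescing is only safe for pure in-sequence segments that look
         * alike: contiguous sequence space, same DSCP/ECN bits and TCP
         * option block, ACK set on both, and no SYN/RST/URG anywhere.
         */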
1694         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1695             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1696             ((TCP_SKB_CB(tail)->tcp_flags |
1697               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1698             !((TCP_SKB_CB(tail)->tcp_flags &
1699               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1700             ((TCP_SKB_CB(tail)->tcp_flags ^
1701               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1702 #ifdef CONFIG_TLS_DEVICE
1703             tail->decrypted != skb->decrypted ||
1704 #endif
1705             thtail->doff != th->doff ||
1706             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1707                 goto no_coalesce;
1708
1709         __skb_pull(skb, hdrlen);
1710         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1711                 thtail->window = th->window;
1712
1713                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1714
1715                 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1716                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1717
1718                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1719                  * thtail->fin, so that the fast path in tcp_rcv_established()
1720                  * is not entered if we append a packet with a FIN.
1721                  * SYN, RST, URG are not present.
1722                  * ACK is set on both packets.
1723                  * PSH : we do not really care in TCP stack,
1724                  *       at least for 'GRO' packets.
1725                  */
1726                 thtail->fin |= th->fin;
1727                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1728
1729                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1730                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1731                         tail->tstamp = skb->tstamp;
1732                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1733                 }
1734
1735                 /* Not as strict as GRO. We only need to carry mss max value */
1736                 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1737                                                  skb_shinfo(tail)->gso_size);
1738
1739                 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1740                 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1741
1742                 sk->sk_backlog.len += delta;
1743                 __NET_INC_STATS(sock_net(sk),
1744                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1745                 kfree_skb_partial(skb, fragstolen);
1746                 return false;
1747         }
1748         __skb_push(skb, hdrlen);
1749
1750 no_coalesce:
1751         /* Only the socket owner can try to collapse/prune rx queues
1752          * to reduce memory overhead, so add a little headroom here.
1753          * Only a few socket backlogs are likely to be non-empty at once.
1754          */
1755         limit += 64*1024;
1756
1757         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1758                 bh_unlock_sock(sk);
1759                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1760                 return true;
1761         }
1762         return false;
1763 }
1764 EXPORT_SYMBOL(tcp_add_backlog);
1765
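/* Run the socket's attached classic/eBPF filter; the filter may trim
 * payload, but never below the TCP header itself (th->doff * 4 bytes).
 */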
1766 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1767 {
1768         struct tcphdr *th = (struct tcphdr *)skb->data;
1769
1770         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1771 }
1772 EXPORT_SYMBOL(tcp_filter);
1773
1774 static void tcp_v4_restore_cb(struct sk_buff *skb)
1775 {
1776         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1777                 sizeof(struct inet_skb_parm));
1778 }
1779
1780 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1781                            const struct tcphdr *th)
1782 {
1783         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1784          * barrier() makes sure the compiler won't play fool^Waliasing games.
1785          */
1786         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1787                 sizeof(struct inet_skb_parm));
1788         barrier();
1789
1790         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1791         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1792                                     skb->len - th->doff * 4);
1793         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1794         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1795         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1796         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1797         TCP_SKB_CB(skb)->sacked  = 0;
1798         TCP_SKB_CB(skb)->has_rxtstamp =
1799                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1800 }
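
/* Worked example for the end_seq arithmetic above: a segment with
 * seq = 1000, a 20-byte header (th->doff = 5), 100 bytes of payload
 * (skb->len = 120) and FIN set occupies sequence numbers 1000..1100,
 * so end_seq = 1000 + 0 + 1 + 120 - 20 = 1101; SYN and FIN each
 * consume one unit of sequence space.
 */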
1801
1802 /*
1803  *      From tcp_input.c
1804  */
1805
1806 int tcp_v4_rcv(struct sk_buff *skb)
1807 {
1808         struct net *net = dev_net(skb->dev);
1809         struct sk_buff *skb_to_free;
1810         int sdif = inet_sdif(skb);
1811         const struct iphdr *iph;
1812         const struct tcphdr *th;
1813         bool refcounted;
1814         struct sock *sk;
1815         int ret;
1816
1817         if (skb->pkt_type != PACKET_HOST)
1818                 goto discard_it;
1819
1820         /* Count it even if it's bad */
1821         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1822
1823         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1824                 goto discard_it;
1825
1826         th = (const struct tcphdr *)skb->data;
1827
1828         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1829                 goto bad_packet;
1830         if (!pskb_may_pull(skb, th->doff * 4))
1831                 goto discard_it;
1832
1833         /* An explanation is required here, I think.
1834          * Packet length and doff are validated by header prediction,
1835          * provided the th->doff == 0 case has been eliminated.
1836          * So, we defer the checks. */
1837
1838         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1839                 goto csum_error;
1840
1841         th = (const struct tcphdr *)skb->data;
1842         iph = ip_hdr(skb);
1843 lookup:
1844         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1845                                th->dest, sdif, &refcounted);
1846         if (!sk)
1847                 goto no_tcp_socket;
1848
1849 process:
1850         if (sk->sk_state == TCP_TIME_WAIT)
1851                 goto do_time_wait;
1852
1853         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1854                 struct request_sock *req = inet_reqsk(sk);
1855                 bool req_stolen = false;
1856                 struct sock *nsk;
1857
1858                 sk = req->rsk_listener;
1859                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1860                         sk_drops_add(sk, skb);
1861                         reqsk_put(req);
1862                         goto discard_it;
1863                 }
1864                 if (tcp_checksum_complete(skb)) {
1865                         reqsk_put(req);
1866                         goto csum_error;
1867                 }
1868                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1869                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1870                         goto lookup;
1871                 }
1872                 /* We own a reference on the listener; increase it again,
1873                  * as we might lose it too soon.
1874                  */
1875                 sock_hold(sk);
1876                 refcounted = true;
1877                 nsk = NULL;
1878                 if (!tcp_filter(sk, skb)) {
1879                         th = (const struct tcphdr *)skb->data;
1880                         iph = ip_hdr(skb);
1881                         tcp_v4_fill_cb(skb, iph, th);
1882                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1883                 }
1884                 if (!nsk) {
1885                         reqsk_put(req);
1886                         if (req_stolen) {
1887                         /* Another CPU got exclusive access to req
1888                          * and created a full-blown socket.
1889                                  * Try to feed this packet to this socket
1890                                  * instead of discarding it.
1891                                  */
1892                                 tcp_v4_restore_cb(skb);
1893                                 sock_put(sk);
1894                                 goto lookup;
1895                         }
1896                         goto discard_and_relse;
1897                 }
1898                 if (nsk == sk) {
1899                         reqsk_put(req);
1900                         tcp_v4_restore_cb(skb);
1901                 } else if (tcp_child_process(sk, nsk, skb)) {
1902                         tcp_v4_send_reset(nsk, skb);
1903                         goto discard_and_relse;
1904                 } else {
1905                         sock_put(sk);
1906                         return 0;
1907                 }
1908         }
1909         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1910                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1911                 goto discard_and_relse;
1912         }
1913
1914         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1915                 goto discard_and_relse;
1916
1917         if (tcp_v4_inbound_md5_hash(sk, skb))
1918                 goto discard_and_relse;
1919
1920         nf_reset_ct(skb);
1921
1922         if (tcp_filter(sk, skb))
1923                 goto discard_and_relse;
1924         th = (const struct tcphdr *)skb->data;
1925         iph = ip_hdr(skb);
1926         tcp_v4_fill_cb(skb, iph, th);
1927
1928         skb->dev = NULL;
1929
1930         if (sk->sk_state == TCP_LISTEN) {
1931                 ret = tcp_v4_do_rcv(sk, skb);
1932                 goto put_and_return;
1933         }
1934
1935         sk_incoming_cpu_update(sk);
1936
1937         bh_lock_sock_nested(sk);
1938         tcp_segs_in(tcp_sk(sk), skb);
1939         ret = 0;
1940         if (!sock_owned_by_user(sk)) {
1941                 skb_to_free = sk->sk_rx_skb_cache;
1942                 sk->sk_rx_skb_cache = NULL;
1943                 ret = tcp_v4_do_rcv(sk, skb);
1944         } else {
1945                 if (tcp_add_backlog(sk, skb))
1946                         goto discard_and_relse;
1947                 skb_to_free = NULL;
1948         }
1949         bh_unlock_sock(sk);
1950         if (skb_to_free)
1951                 __kfree_skb(skb_to_free);
1952
1953 put_and_return:
1954         if (refcounted)
1955                 sock_put(sk);
1956
1957         return ret;
1958
1959 no_tcp_socket:
1960         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1961                 goto discard_it;
1962
1963         tcp_v4_fill_cb(skb, iph, th);
1964
1965         if (tcp_checksum_complete(skb)) {
1966 csum_error:
1967                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1968 bad_packet:
1969                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1970         } else {
1971                 tcp_v4_send_reset(NULL, skb);
1972         }
1973
1974 discard_it:
1975         /* Discard frame. */
1976         kfree_skb(skb);
1977         return 0;
1978
1979 discard_and_relse:
1980         sk_drops_add(sk, skb);
1981         if (refcounted)
1982                 sock_put(sk);
1983         goto discard_it;
1984
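/* TIME_WAIT: let tcp_timewait_state_process() decide whether the
 * segment legitimately reopens the connection (TCP_TW_SYN), deserves
 * an ACK or a RST, or should be silently ignored (TCP_TW_SUCCESS).
 */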
1985 do_time_wait:
1986         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1987                 inet_twsk_put(inet_twsk(sk));
1988                 goto discard_it;
1989         }
1990
1991         tcp_v4_fill_cb(skb, iph, th);
1992
1993         if (tcp_checksum_complete(skb)) {
1994                 inet_twsk_put(inet_twsk(sk));
1995                 goto csum_error;
1996         }
1997         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1998         case TCP_TW_SYN: {
1999                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2000                                                         &tcp_hashinfo, skb,
2001                                                         __tcp_hdrlen(th),
2002                                                         iph->saddr, th->source,
2003                                                         iph->daddr, th->dest,
2004                                                         inet_iif(skb),
2005                                                         sdif);
2006                 if (sk2) {
2007                         inet_twsk_deschedule_put(inet_twsk(sk));
2008                         sk = sk2;
2009                         tcp_v4_restore_cb(skb);
2010                         refcounted = false;
2011                         goto process;
2012                 }
2013         }
2014                 /* to ACK */
2015                 /* fall through */
2016         case TCP_TW_ACK:
2017                 tcp_v4_timewait_ack(sk, skb);
2018                 break;
2019         case TCP_TW_RST:
2020                 tcp_v4_send_reset(sk, skb);
2021                 inet_twsk_deschedule_put(inet_twsk(sk));
2022                 goto discard_it;
2023         case TCP_TW_SUCCESS:;
2024         }
2025         goto discard_it;
2026 }
2027
2028 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2029         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2030         .twsk_unique    = tcp_twsk_unique,
2031         .twsk_destructor = tcp_twsk_destructor,
2032 };
2033
2034 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2035 {
2036         struct dst_entry *dst = skb_dst(skb);
2037
2038         if (dst && dst_hold_safe(dst)) {
2039                 sk->sk_rx_dst = dst;
2040                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2041         }
2042 }
2043 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2044
2045 const struct inet_connection_sock_af_ops ipv4_specific = {
2046         .queue_xmit        = ip_queue_xmit,
2047         .send_check        = tcp_v4_send_check,
2048         .rebuild_header    = inet_sk_rebuild_header,
2049         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2050         .conn_request      = tcp_v4_conn_request,
2051         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2052         .net_header_len    = sizeof(struct iphdr),
2053         .setsockopt        = ip_setsockopt,
2054         .getsockopt        = ip_getsockopt,
2055         .addr2sockaddr     = inet_csk_addr2sockaddr,
2056         .sockaddr_len      = sizeof(struct sockaddr_in),
2057 #ifdef CONFIG_COMPAT
2058         .compat_setsockopt = compat_ip_setsockopt,
2059         .compat_getsockopt = compat_ip_getsockopt,
2060 #endif
2061         .mtu_reduced       = tcp_v4_mtu_reduced,
2062 };
2063 EXPORT_SYMBOL(ipv4_specific);
2064
2065 #ifdef CONFIG_TCP_MD5SIG
2066 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2067         .md5_lookup             = tcp_v4_md5_lookup,
2068         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2069         .md5_parse              = tcp_v4_parse_md5_keys,
2070 };
2071 #endif
2072
2073 /* NOTE: a lot of things are set to zero explicitly by the call to
2074  *       sk_alloc(), so they need not be done here.
2075  */
2076 static int tcp_v4_init_sock(struct sock *sk)
2077 {
2078         struct inet_connection_sock *icsk = inet_csk(sk);
2079
2080         tcp_init_sock(sk);
2081
2082         icsk->icsk_af_ops = &ipv4_specific;
2083
2084 #ifdef CONFIG_TCP_MD5SIG
2085         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2086 #endif
2087
2088         return 0;
2089 }
2090
2091 void tcp_v4_destroy_sock(struct sock *sk)
2092 {
2093         struct tcp_sock *tp = tcp_sk(sk);
2094
2095         trace_tcp_destroy_sock(sk);
2096
2097         tcp_clear_xmit_timers(sk);
2098
2099         tcp_cleanup_congestion_control(sk);
2100
2101         tcp_cleanup_ulp(sk);
2102
2103         /* Clean up the write buffer. */
2104         tcp_write_queue_purge(sk);
2105
2106         /* Check if we want to disable active TFO */
2107         tcp_fastopen_active_disable_ofo_check(sk);
2108
2109         /* Cleans up our, hopefully empty, out_of_order_queue. */
2110         skb_rbtree_purge(&tp->out_of_order_queue);
2111
2112 #ifdef CONFIG_TCP_MD5SIG
2113         /* Clean up the MD5 key list, if any */
2114         if (tp->md5sig_info) {
2115                 tcp_clear_md5_list(sk);
2116                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2117                 tp->md5sig_info = NULL;
2118         }
2119 #endif
2120
2121         /* Clean up a referenced TCP bind bucket. */
2122         if (inet_csk(sk)->icsk_bind_hash)
2123                 inet_put_port(sk);
2124
2125         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2126
2127         /* If socket is aborted during connect operation */
2128         tcp_free_fastopen_req(tp);
2129         tcp_fastopen_destroy_cipher(sk);
2130         tcp_saved_syn_free(tp);
2131
2132         sk_sockets_allocated_dec(sk);
2133 }
2134 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2135
2136 #ifdef CONFIG_PROC_FS
2137 /* Proc filesystem TCP sock list dumping. */
2138
2139 /*
2140  * Get the next listener socket following cur.  If cur is NULL, get the
2141  * first socket, starting from the bucket given in st->bucket; when
2142  * st->bucket is zero, the very first socket in the hash table is returned.
2143  */
2144 static void *listening_get_next(struct seq_file *seq, void *cur)
2145 {
2146         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2147         struct tcp_iter_state *st = seq->private;
2148         struct net *net = seq_file_net(seq);
2149         struct inet_listen_hashbucket *ilb;
2150         struct hlist_nulls_node *node;
2151         struct sock *sk = cur;
2152
2153         if (!sk) {
2154 get_head:
2155                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2156                 spin_lock(&ilb->lock);
2157                 sk = sk_nulls_head(&ilb->nulls_head);
2158                 st->offset = 0;
2159                 goto get_sk;
2160         }
2161         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2162         ++st->num;
2163         ++st->offset;
2164
2165         sk = sk_nulls_next(sk);
2166 get_sk:
2167         sk_nulls_for_each_from(sk, node) {
2168                 if (!net_eq(sock_net(sk), net))
2169                         continue;
2170                 if (sk->sk_family == afinfo->family)
2171                         return sk;
2172         }
2173         spin_unlock(&ilb->lock);
2174         st->offset = 0;
2175         if (++st->bucket < INET_LHTABLE_SIZE)
2176                 goto get_head;
2177         return NULL;
2178 }
2179
2180 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2181 {
2182         struct tcp_iter_state *st = seq->private;
2183         void *rc;
2184
2185         st->bucket = 0;
2186         st->offset = 0;
2187         rc = listening_get_next(seq, NULL);
2188
2189         while (rc && *pos) {
2190                 rc = listening_get_next(seq, rc);
2191                 --*pos;
2192         }
2193         return rc;
2194 }
2195
2196 static inline bool empty_bucket(const struct tcp_iter_state *st)
2197 {
2198         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2199 }
2200
2201 /*
2202  * Get the first established socket, starting from the bucket given in st->bucket.
2203  * If st->bucket is zero, the very first socket in the hash is returned.
2204  */
2205 static void *established_get_first(struct seq_file *seq)
2206 {
2207         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2208         struct tcp_iter_state *st = seq->private;
2209         struct net *net = seq_file_net(seq);
2210         void *rc = NULL;
2211
2212         st->offset = 0;
2213         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2214                 struct sock *sk;
2215                 struct hlist_nulls_node *node;
2216                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2217
2218                 /* Lockless fast path for the common case of empty buckets */
2219                 if (empty_bucket(st))
2220                         continue;
2221
2222                 spin_lock_bh(lock);
2223                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2224                         if (sk->sk_family != afinfo->family ||
2225                             !net_eq(sock_net(sk), net)) {
2226                                 continue;
2227                         }
2228                         rc = sk;
2229                         goto out;
2230                 }
2231                 spin_unlock_bh(lock);
2232         }
2233 out:
2234         return rc;
2235 }
2236
2237 static void *established_get_next(struct seq_file *seq, void *cur)
2238 {
2239         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2240         struct sock *sk = cur;
2241         struct hlist_nulls_node *node;
2242         struct tcp_iter_state *st = seq->private;
2243         struct net *net = seq_file_net(seq);
2244
2245         ++st->num;
2246         ++st->offset;
2247
2248         sk = sk_nulls_next(sk);
2249
2250         sk_nulls_for_each_from(sk, node) {
2251                 if (sk->sk_family == afinfo->family &&
2252                     net_eq(sock_net(sk), net))
2253                         return sk;
2254         }
2255
2256         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2257         ++st->bucket;
2258         return established_get_first(seq);
2259 }
2260
2261 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2262 {
2263         struct tcp_iter_state *st = seq->private;
2264         void *rc;
2265
2266         st->bucket = 0;
2267         rc = established_get_first(seq);
2268
2269         while (rc && pos) {
2270                 rc = established_get_next(seq, rc);
2271                 --pos;
2272         }
2273         return rc;
2274 }
2275
2276 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2277 {
2278         void *rc;
2279         struct tcp_iter_state *st = seq->private;
2280
2281         st->state = TCP_SEQ_STATE_LISTENING;
2282         rc        = listening_get_idx(seq, &pos);
2283
2284         if (!rc) {
2285                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2286                 rc        = established_get_idx(seq, pos);
2287         }
2288
2289         return rc;
2290 }
2291
2292 static void *tcp_seek_last_pos(struct seq_file *seq)
2293 {
2294         struct tcp_iter_state *st = seq->private;
2295         int offset = st->offset;
2296         int orig_num = st->num;
2297         void *rc = NULL;
2298
2299         switch (st->state) {
2300         case TCP_SEQ_STATE_LISTENING:
2301                 if (st->bucket >= INET_LHTABLE_SIZE)
2302                         break;
2303                 st->state = TCP_SEQ_STATE_LISTENING;
2304                 rc = listening_get_next(seq, NULL);
2305                 while (offset-- && rc)
2306                         rc = listening_get_next(seq, rc);
2307                 if (rc)
2308                         break;
2309                 st->bucket = 0;
2310                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2311                 /* Fallthrough */
2312         case TCP_SEQ_STATE_ESTABLISHED:
2313                 if (st->bucket > tcp_hashinfo.ehash_mask)
2314                         break;
2315                 rc = established_get_first(seq);
2316                 while (offset-- && rc)
2317                         rc = established_get_next(seq, rc);
2318         }
2319
2320         st->num = orig_num;
2321
2322         return rc;
2323 }
2324
2325 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2326 {
2327         struct tcp_iter_state *st = seq->private;
2328         void *rc;
2329
2330         if (*pos && *pos == st->last_pos) {
2331                 rc = tcp_seek_last_pos(seq);
2332                 if (rc)
2333                         goto out;
2334         }
2335
2336         st->state = TCP_SEQ_STATE_LISTENING;
2337         st->num = 0;
2338         st->bucket = 0;
2339         st->offset = 0;
2340         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2341
2342 out:
2343         st->last_pos = *pos;
2344         return rc;
2345 }
2346 EXPORT_SYMBOL(tcp_seq_start);
2347
2348 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2349 {
2350         struct tcp_iter_state *st = seq->private;
2351         void *rc = NULL;
2352
2353         if (v == SEQ_START_TOKEN) {
2354                 rc = tcp_get_idx(seq, 0);
2355                 goto out;
2356         }
2357
2358         switch (st->state) {
2359         case TCP_SEQ_STATE_LISTENING:
2360                 rc = listening_get_next(seq, v);
2361                 if (!rc) {
2362                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2363                         st->bucket = 0;
2364                         st->offset = 0;
2365                         rc        = established_get_first(seq);
2366                 }
2367                 break;
2368         case TCP_SEQ_STATE_ESTABLISHED:
2369                 rc = established_get_next(seq, v);
2370                 break;
2371         }
2372 out:
2373         ++*pos;
2374         st->last_pos = *pos;
2375         return rc;
2376 }
2377 EXPORT_SYMBOL(tcp_seq_next);
2378
2379 void tcp_seq_stop(struct seq_file *seq, void *v)
2380 {
2381         struct tcp_iter_state *st = seq->private;
2382
2383         switch (st->state) {
2384         case TCP_SEQ_STATE_LISTENING:
2385                 if (v != SEQ_START_TOKEN)
2386                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2387                 break;
2388         case TCP_SEQ_STATE_ESTABLISHED:
2389                 if (v)
2390                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2391                 break;
2392         }
2393 }
2394 EXPORT_SYMBOL(tcp_seq_stop);
2395
2396 static void get_openreq4(const struct request_sock *req,
2397                          struct seq_file *f, int i)
2398 {
2399         const struct inet_request_sock *ireq = inet_rsk(req);
2400         long delta = req->rsk_timer.expires - jiffies;
2401
2402         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2403                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2404                 i,
2405                 ireq->ir_loc_addr,
2406                 ireq->ir_num,
2407                 ireq->ir_rmt_addr,
2408                 ntohs(ireq->ir_rmt_port),
2409                 TCP_SYN_RECV,
2410                 0, 0, /* could print option size, but that is af dependent. */
2411                 1,    /* timers active (only the expire timer) */
2412                 jiffies_delta_to_clock_t(delta),
2413                 req->num_timeout,
2414                 from_kuid_munged(seq_user_ns(f),
2415                                  sock_i_uid(req->rsk_listener)),
2416                 0,  /* non standard timer */
2417                 0, /* open_requests have no inode */
2418                 0,
2419                 req);
2420 }
2421
2422 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2423 {
2424         int timer_active;
2425         unsigned long timer_expires;
2426         const struct tcp_sock *tp = tcp_sk(sk);
2427         const struct inet_connection_sock *icsk = inet_csk(sk);
2428         const struct inet_sock *inet = inet_sk(sk);
2429         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2430         __be32 dest = inet->inet_daddr;
2431         __be32 src = inet->inet_rcv_saddr;
2432         __u16 destp = ntohs(inet->inet_dport);
2433         __u16 srcp = ntohs(inet->inet_sport);
2434         int rx_queue;
2435         int state;
2436
2437         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2438             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2439             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2440                 timer_active    = 1;
2441                 timer_expires   = icsk->icsk_timeout;
2442         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2443                 timer_active    = 4;
2444                 timer_expires   = icsk->icsk_timeout;
2445         } else if (timer_pending(&sk->sk_timer)) {
2446                 timer_active    = 2;
2447                 timer_expires   = sk->sk_timer.expires;
2448         } else {
2449                 timer_active    = 0;
2450                 timer_expires = jiffies;
2451         }
2452
2453         state = inet_sk_state_load(sk);
2454         if (state == TCP_LISTEN)
2455                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2456         else
2457                 /* Because we don't lock the socket,
2458                  * we might find a transient negative value.
2459                  */
2460                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2461                                       READ_ONCE(tp->copied_seq), 0);
2462
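        /* Note: addresses below are printed as raw __be32 hex (ports are
         * converted with ntohs() above), so 127.0.0.1 appears as 0100007F
         * on little-endian hosts.
         */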
2463         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2464                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2465                 i, src, srcp, dest, destp, state,
2466                 READ_ONCE(tp->write_seq) - tp->snd_una,
2467                 rx_queue,
2468                 timer_active,
2469                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2470                 icsk->icsk_retransmits,
2471                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2472                 icsk->icsk_probes_out,
2473                 sock_i_ino(sk),
2474                 refcount_read(&sk->sk_refcnt), sk,
2475                 jiffies_to_clock_t(icsk->icsk_rto),
2476                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2477                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2478                 tp->snd_cwnd,
2479                 state == TCP_LISTEN ?
2480                     fastopenq->max_qlen :
2481                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2482 }
2483
2484 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2485                                struct seq_file *f, int i)
2486 {
2487         long delta = tw->tw_timer.expires - jiffies;
2488         __be32 dest, src;
2489         __u16 destp, srcp;
2490
2491         dest  = tw->tw_daddr;
2492         src   = tw->tw_rcv_saddr;
2493         destp = ntohs(tw->tw_dport);
2494         srcp  = ntohs(tw->tw_sport);
2495
2496         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2497                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2498                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2499                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2500                 refcount_read(&tw->tw_refcnt), tw);
2501 }
2502
2503 #define TMPSZ 150
2504
2505 static int tcp4_seq_show(struct seq_file *seq, void *v)
2506 {
2507         struct tcp_iter_state *st;
2508         struct sock *sk = v;
2509
2510         seq_setwidth(seq, TMPSZ - 1);
2511         if (v == SEQ_START_TOKEN) {
2512                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2513                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2514                            "inode");
2515                 goto out;
2516         }
2517         st = seq->private;
2518
2519         if (sk->sk_state == TCP_TIME_WAIT)
2520                 get_timewait4_sock(v, seq, st->num);
2521         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2522                 get_openreq4(v, seq, st->num);
2523         else
2524                 get_tcp4_sock(v, seq, st->num);
2525 out:
2526         seq_pad(seq, '\n');
2527         return 0;
2528 }
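
#if 0	/* Editor's example, not part of the kernel build. */
/*
 * Minimal userspace sketch of consuming the seq_file output produced by
 * tcp4_seq_show() above: dump each line of /proc/net/tcp.
 */
#include <stdio.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/net/tcp", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);	/* header row, then one socket per line */
        fclose(f);
        return 0;
}
#endif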
2529
2530 static const struct seq_operations tcp4_seq_ops = {
2531         .show           = tcp4_seq_show,
2532         .start          = tcp_seq_start,
2533         .next           = tcp_seq_next,
2534         .stop           = tcp_seq_stop,
2535 };
2536
2537 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2538         .family         = AF_INET,
2539 };
2540
2541 static int __net_init tcp4_proc_init_net(struct net *net)
2542 {
2543         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2544                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2545                 return -ENOMEM;
2546         return 0;
2547 }
2548
2549 static void __net_exit tcp4_proc_exit_net(struct net *net)
2550 {
2551         remove_proc_entry("tcp", net->proc_net);
2552 }
2553
2554 static struct pernet_operations tcp4_net_ops = {
2555         .init = tcp4_proc_init_net,
2556         .exit = tcp4_proc_exit_net,
2557 };
2558
2559 int __init tcp4_proc_init(void)
2560 {
2561         return register_pernet_subsys(&tcp4_net_ops);
2562 }
2563
2564 void tcp4_proc_exit(void)
2565 {
2566         unregister_pernet_subsys(&tcp4_net_ops);
2567 }
2568 #endif /* CONFIG_PROC_FS */
2569
2570 struct proto tcp_prot = {
2571         .name                   = "TCP",
2572         .owner                  = THIS_MODULE,
2573         .close                  = tcp_close,
2574         .pre_connect            = tcp_v4_pre_connect,
2575         .connect                = tcp_v4_connect,
2576         .disconnect             = tcp_disconnect,
2577         .accept                 = inet_csk_accept,
2578         .ioctl                  = tcp_ioctl,
2579         .init                   = tcp_v4_init_sock,
2580         .destroy                = tcp_v4_destroy_sock,
2581         .shutdown               = tcp_shutdown,
2582         .setsockopt             = tcp_setsockopt,
2583         .getsockopt             = tcp_getsockopt,
2584         .keepalive              = tcp_set_keepalive,
2585         .recvmsg                = tcp_recvmsg,
2586         .sendmsg                = tcp_sendmsg,
2587         .sendpage               = tcp_sendpage,
2588         .backlog_rcv            = tcp_v4_do_rcv,
2589         .release_cb             = tcp_release_cb,
2590         .hash                   = inet_hash,
2591         .unhash                 = inet_unhash,
2592         .get_port               = inet_csk_get_port,
2593         .enter_memory_pressure  = tcp_enter_memory_pressure,
2594         .leave_memory_pressure  = tcp_leave_memory_pressure,
2595         .stream_memory_free     = tcp_stream_memory_free,
2596         .sockets_allocated      = &tcp_sockets_allocated,
2597         .orphan_count           = &tcp_orphan_count,
2598         .memory_allocated       = &tcp_memory_allocated,
2599         .memory_pressure        = &tcp_memory_pressure,
2600         .sysctl_mem             = sysctl_tcp_mem,
2601         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2602         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2603         .max_header             = MAX_TCP_HEADER,
2604         .obj_size               = sizeof(struct tcp_sock),
2605         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2606         .twsk_prot              = &tcp_timewait_sock_ops,
2607         .rsk_prot               = &tcp_request_sock_ops,
2608         .h.hashinfo             = &tcp_hashinfo,
2609         .no_autobind            = true,
2610 #ifdef CONFIG_COMPAT
2611         .compat_setsockopt      = compat_tcp_setsockopt,
2612         .compat_getsockopt      = compat_tcp_getsockopt,
2613 #endif
2614         .diag_destroy           = tcp_abort,
2615 };
2616 EXPORT_SYMBOL(tcp_prot);
2617
2618 static void __net_exit tcp_sk_exit(struct net *net)
2619 {
2620         int cpu;
2621
2622         if (net->ipv4.tcp_congestion_control)
2623                 module_put(net->ipv4.tcp_congestion_control->owner);
2624
2625         for_each_possible_cpu(cpu)
2626                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2627         free_percpu(net->ipv4.tcp_sk);
2628 }
2629
2630 static int __net_init tcp_sk_init(struct net *net)
2631 {
2632         int res, cpu, cnt;
2633
2634         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2635         if (!net->ipv4.tcp_sk)
2636                 return -ENOMEM;
2637
2638         for_each_possible_cpu(cpu) {
2639                 struct sock *sk;
2640
2641                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2642                                            IPPROTO_TCP, net);
2643                 if (res)
2644                         goto fail;
2645                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2646
2647                 /* Please enforce IP_DF and IPID == 0 for RSTs and
2648                  * ACKs sent in SYN-RECV and TIME-WAIT states.
2649                  */
2650                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2651
2652                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2653         }
2654
2655         net->ipv4.sysctl_tcp_ecn = 2;
2656         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2657
2658         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2659         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2660         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2661         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2662         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2663
2664         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2665         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2666         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2667
2668         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2669         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2670         net->ipv4.sysctl_tcp_syncookies = 1;
2671         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2672         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2673         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2674         net->ipv4.sysctl_tcp_orphan_retries = 0;
2675         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2676         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2677         net->ipv4.sysctl_tcp_tw_reuse = 2;
2678
2679         cnt = tcp_hashinfo.ehash_mask + 1;
2680         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2681         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2682
2683         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2684         net->ipv4.sysctl_tcp_sack = 1;
2685         net->ipv4.sysctl_tcp_window_scaling = 1;
2686         net->ipv4.sysctl_tcp_timestamps = 1;
2687         net->ipv4.sysctl_tcp_early_retrans = 3;
2688         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2689         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2690         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2691         net->ipv4.sysctl_tcp_max_reordering = 300;
2692         net->ipv4.sysctl_tcp_dsack = 1;
2693         net->ipv4.sysctl_tcp_app_win = 31;
2694         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2695         net->ipv4.sysctl_tcp_frto = 2;
2696         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2697         /* This limits the percentage of the congestion window which we
2698          * will allow a single TSO frame to consume.  Building TSO frames
2699          * which are too large can cause TCP streams to be bursty.
2700          */
2701         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2702         /* Default TSQ limit of 16 TSO segments */
2703         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2704         /* rfc5961 challenge ack rate limiting */
2705         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2706         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2707         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2708         net->ipv4.sysctl_tcp_autocorking = 1;
2709         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2710         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2711         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2712         if (net != &init_net) {
2713                 memcpy(net->ipv4.sysctl_tcp_rmem,
2714                        init_net.ipv4.sysctl_tcp_rmem,
2715                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2716                 memcpy(net->ipv4.sysctl_tcp_wmem,
2717                        init_net.ipv4.sysctl_tcp_wmem,
2718                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2719         }
2720         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2721         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2722         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2723         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2724         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2725         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2726
2727         /* Reno is always built in */
2728         if (!net_eq(net, &init_net) &&
2729             try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2730                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2731         else
2732                 net->ipv4.tcp_congestion_control = &tcp_reno;
2733
2734         return 0;
2735 fail:
2736         tcp_sk_exit(net);
2737
2738         return res;
2739 }
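
#if 0	/* Editor's example, not part of the kernel build. */
/*
 * The defaults installed above are per network namespace and exposed
 * under /proc/sys/net/ipv4/. A minimal userspace sketch of overriding
 * one of them (tcp_syn_retries, default TCP_SYN_RETRIES = 6):
 */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_syn_retries", "w");

        if (!f)
                return 1;
        fputs("3\n", f);	/* retry the SYN three times instead of six */
        fclose(f);
        return 0;
}
#endif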
2740
2741 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2742 {
2743         struct net *net;
2744
2745         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2746
2747         list_for_each_entry(net, net_exit_list, exit_list)
2748                 tcp_fastopen_ctx_destroy(net);
2749 }
2750
2751 static struct pernet_operations __net_initdata tcp_sk_ops = {
2752        .init       = tcp_sk_init,
2753        .exit       = tcp_sk_exit,
2754        .exit_batch = tcp_sk_exit_batch,
2755 };
2756
2757 void __init tcp_v4_init(void)
2758 {
2759         if (register_pernet_subsys(&tcp_sk_ops))
2760                 panic("Failed to create the TCP control socket.\n");
2761 }