net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53 #define pr_fmt(fmt) "TCP: " fmt
  54
  55 #include <linux/bottom_half.h>
  56 #include <linux/types.h>
  57 #include <linux/fcntl.h>
  58 #include <linux/module.h>
  59 #include <linux/random.h>
  60 #include <linux/cache.h>
  61 #include <linux/jhash.h>
  62 #include <linux/init.h>
  63 #include <linux/times.h>
  64 #include <linux/slab.h>
  65
  66 #include <net/net_namespace.h>
  67 #include <net/icmp.h>
  68 #include <net/inet_hashtables.h>
  69 #include <net/tcp.h>
  70 #include <net/transp_v6.h>
  71 #include <net/ipv6.h>
  72 #include <net/inet_common.h>
  73 #include <net/timewait_sock.h>
  74 #include <net/xfrm.h>
  75 #include <net/secure_seq.h>
  76 #include <net/busy_poll.h>
  77
  78 #include <linux/inet.h>
  79 #include <linux/ipv6.h>
  80 #include <linux/stddef.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/seq_file.h>
  83 #include <linux/inetdevice.h>
  84
  85 #include <crypto/hash.h>
  86 #include <linux/scatterlist.h>
  87
  88 #include <trace/events/tcp.h>
  89
  90 #ifdef CONFIG_TCP_MD5SIG
  91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  92                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
  93 #endif
  94
  95 struct inet_hashinfo tcp_hashinfo;
  96 EXPORT_SYMBOL(tcp_hashinfo);
  97
  98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  99 {
 100         return secure_tcp_seq(ip_hdr(skb)->daddr,
 101                               ip_hdr(skb)->saddr,
 102                               tcp_hdr(skb)->dest,
 103                               tcp_hdr(skb)->source);
 104 }
 105
 106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 107 {
 108         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 109 }
 110
 111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 112 {
 113         const struct inet_timewait_sock *tw = inet_twsk(sktw);
 114         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 115         struct tcp_sock *tp = tcp_sk(sk);
 116         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
 117
 118         if (reuse == 2) {
 119                 /* Still does not detect *everything* that goes through
 120                  * lo, since we require a loopback src or dst address
 121                  * or direct binding to 'lo' interface.
 122                  */
 123                 bool loopback = false;
 124                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 125                         loopback = true;
 126 #if IS_ENABLED(CONFIG_IPV6)
 127                 if (tw->tw_family == AF_INET6) {
 128                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 129                             (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
 130                              (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
 131                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 132                             (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
 133                              (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
 134                                 loopback = true;
 135                 } else
 136 #endif
 137                 {
 138                         if (ipv4_is_loopback(tw->tw_daddr) ||
 139                             ipv4_is_loopback(tw->tw_rcv_saddr))
 140                                 loopback = true;
 141                 }
 142                 if (!loopback)
 143                         reuse = 0;
 144         }
 145
 146         /* With PAWS, it is safe from the viewpoint
 147            of data integrity. Even without PAWS it is safe provided sequence
 148            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 149
 150            Actually, the idea is close to VJ's one, only timestamp cache is
 151            held not per host, but per port pair and TW bucket is used as state
 152            holder.
 153
 154            If TW bucket has been already destroyed we fall back to VJ's scheme
 155            and use initial timestamp retrieved from peer table.
 156          */
 157         if (tcptw->tw_ts_recent_stamp &&
 158             (!twp || (reuse && time_after32(ktime_get_seconds(),
 159                                             tcptw->tw_ts_recent_stamp)))) {
 160                 /* In case of repair and re-using TIME-WAIT sockets we still
 161                  * want to be sure that it is safe as above but honor the
 162                  * sequence numbers and time stamps set as part of the repair
 163                  * process.
 164                  *
 165                  * Without this check re-using a TIME-WAIT socket with TCP
 166                  * repair would accumulate a -1 on the repair assigned
 167                  * sequence number. The first time it is reused the sequence
 168                  * is -1, the second time -2, etc. This fixes that issue
 169                  * without appearing to create any others.
 170                  */
 171                 if (likely(!tp->repair)) {
 172                         tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 173                         if (tp->write_seq == 0)
 174                                 tp->write_seq = 1;
 175                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 176                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 177                 }
 178                 sock_hold(sktw);
 179                 return 1;
 180         }
 181
 182         return 0;
 183 }
 184 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 185
 186 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 187                               int addr_len)
 188 {
 189         /* This check is replicated from tcp_v4_connect() and intended to
 190          * prevent BPF program called below from accessing bytes that are out
 191          * of the bound specified by user in addr_len.
 192          */
 193         if (addr_len < sizeof(struct sockaddr_in))
 194                 return -EINVAL;
 195
 196         sock_owned_by_me(sk);
 197
 198         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 199 }
 200
 201 /* This will initiate an outgoing connection. */
 202 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 203 {
 204         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 205         struct inet_sock *inet = inet_sk(sk);
 206         struct tcp_sock *tp = tcp_sk(sk);
 207         __be16 orig_sport, orig_dport;
 208         __be32 daddr, nexthop;
 209         struct flowi4 *fl4;
 210         struct rtable *rt;
 211         int err;
 212         struct ip_options_rcu *inet_opt;
 213         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 214
 215         if (addr_len < sizeof(struct sockaddr_in))
 216                 return -EINVAL;
 217
 218         if (usin->sin_family != AF_INET)
 219                 return -EAFNOSUPPORT;
 220
 221         nexthop = daddr = usin->sin_addr.s_addr;
 222         inet_opt = rcu_dereference_protected(inet->inet_opt,
 223                                              lockdep_sock_is_held(sk));
 224         if (inet_opt && inet_opt->opt.srr) {
 225                 if (!daddr)
 226                         return -EINVAL;
 227                 nexthop = inet_opt->opt.faddr;
 228         }
 229
 230         orig_sport = inet->inet_sport;
 231         orig_dport = usin->sin_port;
 232         fl4 = &inet->cork.fl.u.ip4;
 233         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 234                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 235                               IPPROTO_TCP,
 236                               orig_sport, orig_dport, sk);
 237         if (IS_ERR(rt)) {
 238                 err = PTR_ERR(rt);
 239                 if (err == -ENETUNREACH)
 240                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 241                 return err;
 242         }
 243
 244         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 245                 ip_rt_put(rt);
 246                 return -ENETUNREACH;
 247         }
 248
 249         if (!inet_opt || !inet_opt->opt.srr)
 250                 daddr = fl4->daddr;
 251
 252         if (!inet->inet_saddr)
 253                 inet->inet_saddr = fl4->saddr;
 254         sk_rcv_saddr_set(sk, inet->inet_saddr);
 255
 256         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 257                 /* Reset inherited state */
 258                 tp->rx_opt.ts_recent       = 0;
 259                 tp->rx_opt.ts_recent_stamp = 0;
 260                 if (likely(!tp->repair))
 261                         tp->write_seq      = 0;
 262         }
 263
 264         inet->inet_dport = usin->sin_port;
 265         sk_daddr_set(sk, daddr);
 266
 267         inet_csk(sk)->icsk_ext_hdr_len = 0;
 268         if (inet_opt)
 269                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 270
 271         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 272
 273         /* Socket identity is still unknown (sport may be zero).
 274          * However we set state to SYN-SENT and not releasing socket
 275          * lock select source port, enter ourselves into the hash tables and
 276          * complete initialization after this.
 277          */
 278         tcp_set_state(sk, TCP_SYN_SENT);
 279         err = inet_hash_connect(tcp_death_row, sk);
 280         if (err)
 281                 goto failure;
 282
 283         sk_set_txhash(sk);
 284
 285         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 286                                inet->inet_sport, inet->inet_dport, sk);
 287         if (IS_ERR(rt)) {
 288                 err = PTR_ERR(rt);
 289                 rt = NULL;
 290                 goto failure;
 291         }
 292         /* OK, now commit destination to socket.  */
 293         sk->sk_gso_type = SKB_GSO_TCPV4;
 294         sk_setup_caps(sk, &rt->dst);
 295         rt = NULL;
 296
 297         if (likely(!tp->repair)) {
 298                 if (!tp->write_seq)
 299                         tp->write_seq = secure_tcp_seq(inet->inet_saddr,
 300                                                        inet->inet_daddr,
 301                                                        inet->inet_sport,
 302                                                        usin->sin_port);
 303                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
 304                                                  inet->inet_saddr,
 305                                                  inet->inet_daddr);
 306         }
 307
 308         inet->inet_id = tp->write_seq ^ jiffies;
 309
 310         if (tcp_fastopen_defer_connect(sk, &err))
 311                 return err;
 312         if (err)
 313                 goto failure;
 314
 315         err = tcp_connect(sk);
 316
 317         if (err)
 318                 goto failure;
 319
 320         return 0;
 321
 322 failure:
 323         /*
 324          * This unhashes the socket and releases the local port,
 325          * if necessary.
 326          */
 327         tcp_set_state(sk, TCP_CLOSE);
 328         ip_rt_put(rt);
 329         sk->sk_route_caps = 0;
 330         inet->inet_dport = 0;
 331         return err;
 332 }
 333 EXPORT_SYMBOL(tcp_v4_connect);
 334
 335 /*
 336  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 337  * It can be called through tcp_release_cb() if socket was owned by user
 338  * at the time tcp_v4_err() was called to handle ICMP message.
 339  */
 340 void tcp_v4_mtu_reduced(struct sock *sk)
 341 {
 342         struct inet_sock *inet = inet_sk(sk);
 343         struct dst_entry *dst;
 344         u32 mtu;
 345
 346         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 347                 return;
 348         mtu = tcp_sk(sk)->mtu_info;
 349         dst = inet_csk_update_pmtu(sk, mtu);
 350         if (!dst)
 351                 return;
 352
 353         /* Something is about to be wrong... Remember soft error
 354          * for the case, if this connection will not able to recover.
 355          */
 356         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 357                 sk->sk_err_soft = EMSGSIZE;
 358
 359         mtu = dst_mtu(dst);
 360
 361         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 362             ip_sk_accept_pmtu(sk) &&
 363             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 364                 tcp_sync_mss(sk, mtu);
 365
 366                 /* Resend the TCP packet because it's
 367                  * clear that the old packet has been
 368                  * dropped. This is the new "fast" path mtu
 369                  * discovery.
 370                  */
 371                 tcp_simple_retransmit(sk);
 372         } /* else let the usual retransmit timer handle it */
 373 }
 374 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 375
 376 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 377 {
 378         struct dst_entry *dst = __sk_dst_check(sk, 0);
 379
 380         if (dst)
 381                 dst->ops->redirect(dst, sk, skb);
 382 }
 383
 384
 385 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 386 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 387 {
 388         struct request_sock *req = inet_reqsk(sk);
 389         struct net *net = sock_net(sk);
 390
 391         /* ICMPs are not backlogged, hence we cannot get
 392          * an established socket here.
 393          */
 394         if (seq != tcp_rsk(req)->snt_isn) {
 395                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 396         } else if (abort) {
 397                 /*
 398                  * Still in SYN_RECV, just remove it silently.
 399                  * There is no good way to pass the error to the newly
 400                  * created socket, and POSIX does not want network
 401                  * errors returned from accept().
 402                  */
 403                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 404                 tcp_listendrop(req->rsk_listener);
 405         }
 406         reqsk_put(req);
 407 }
 408 EXPORT_SYMBOL(tcp_req_err);
 409
 410 /*
 411  * This routine is called by the ICMP module when it gets some
 412  * sort of error condition.  If err < 0 then the socket should
 413  * be closed and the error returned to the user.  If err > 0
 414  * it's just the icmp type << 8 | icmp code.  After adjustment
 415  * header points to the first 8 bytes of the tcp header.  We need
 416  * to find the appropriate port.
 417  *
 418  * The locking strategy used here is very "optimistic". When
 419  * someone else accesses the socket the ICMP is just dropped
 420  * and for some paths there is no check at all.
 421  * A more general error queue to queue errors for later handling
 422  * is probably better.
 423  *
 424  */
 425
 426 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 427 {
 428         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 429         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 430         struct inet_connection_sock *icsk;
 431         struct tcp_sock *tp;
 432         struct inet_sock *inet;
 433         const int type = icmp_hdr(icmp_skb)->type;
 434         const int code = icmp_hdr(icmp_skb)->code;
 435         struct sock *sk;
 436         struct sk_buff *skb;
 437         struct request_sock *fastopen;
 438         u32 seq, snd_una;
 439         s32 remaining;
 440         u32 delta_us;
 441         int err;
 442         struct net *net = dev_net(icmp_skb->dev);
 443
 444         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 445                                        th->dest, iph->saddr, ntohs(th->source),
 446                                        inet_iif(icmp_skb), 0);
 447         if (!sk) {
 448                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 449                 return -ENOENT;
 450         }
 451         if (sk->sk_state == TCP_TIME_WAIT) {
 452                 inet_twsk_put(inet_twsk(sk));
 453                 return 0;
 454         }
 455         seq = ntohl(th->seq);
 456         if (sk->sk_state == TCP_NEW_SYN_RECV) {
 457                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
 458                                      type == ICMP_TIME_EXCEEDED ||
 459                                      (type == ICMP_DEST_UNREACH &&
 460                                       (code == ICMP_NET_UNREACH ||
 461                                        code == ICMP_HOST_UNREACH)));
 462                 return 0;
 463         }
 464
 465         bh_lock_sock(sk);
 466         /* If too many ICMPs get dropped on busy
 467          * servers this needs to be solved differently.
 468          * We do take care of PMTU discovery (RFC1191) special case :
 469          * we can receive locally generated ICMP messages while socket is held.
 470          */
 471         if (sock_owned_by_user(sk)) {
 472                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 473                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 474         }
 475         if (sk->sk_state == TCP_CLOSE)
 476                 goto out;
 477
 478         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 479                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 480                 goto out;
 481         }
 482
 483         icsk = inet_csk(sk);
 484         tp = tcp_sk(sk);
 485         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 486         fastopen = tp->fastopen_rsk;
 487         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 488         if (sk->sk_state != TCP_LISTEN &&
 489             !between(seq, snd_una, tp->snd_nxt)) {
 490                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 491                 goto out;
 492         }
 493
 494         switch (type) {
 495         case ICMP_REDIRECT:
 496                 if (!sock_owned_by_user(sk))
 497                         do_redirect(icmp_skb, sk);
 498                 goto out;
 499         case ICMP_SOURCE_QUENCH:
 500                 /* Just silently ignore these. */
 501                 goto out;
 502         case ICMP_PARAMETERPROB:
 503                 err = EPROTO;
 504                 break;
 505         case ICMP_DEST_UNREACH:
 506                 if (code > NR_ICMP_UNREACH)
 507                         goto out;
 508
 509                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 510                         /* We are not interested in TCP_LISTEN and open_requests
 511                          * (SYN-ACKs send out by Linux are always <576bytes so
 512                          * they should go through unfragmented).
 513                          */
 514                         if (sk->sk_state == TCP_LISTEN)
 515                                 goto out;
 516
 517                         tp->mtu_info = info;
 518                         if (!sock_owned_by_user(sk)) {
 519                                 tcp_v4_mtu_reduced(sk);
 520                         } else {
 521                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 522                                         sock_hold(sk);
 523                         }
 524                         goto out;
 525                 }
 526
 527                 err = icmp_err_convert[code].errno;
 528                 /* check if icmp_skb allows revert of backoff
 529                  * (see draft-zimmermann-tcp-lcd) */
 530                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 531                         break;
 532                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 533                     !icsk->icsk_backoff || fastopen)
 534                         break;
 535
 536                 if (sock_owned_by_user(sk))
 537                         break;
 538
 539                 icsk->icsk_backoff--;
 540                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 541                                                TCP_TIMEOUT_INIT;
 542                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 543
 544                 skb = tcp_rtx_queue_head(sk);
 545
 546                 tcp_mstamp_refresh(tp);
 547                 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 548                 remaining = icsk->icsk_rto -
 549                             usecs_to_jiffies(delta_us);
 550
 551                 if (remaining > 0) {
 552                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 553                                                   remaining, TCP_RTO_MAX);
 554                 } else {
 555                         /* RTO revert clocked out retransmission.
 556                          * Will retransmit now */
 557                         tcp_retransmit_timer(sk);
 558                 }
 559
 560                 break;
 561         case ICMP_TIME_EXCEEDED:
 562                 err = EHOSTUNREACH;
 563                 break;
 564         default:
 565                 goto out;
 566         }
 567
 568         switch (sk->sk_state) {
 569         case TCP_SYN_SENT:
 570         case TCP_SYN_RECV:
 571                 /* Only in fast or simultaneous open. If a fast open socket is
 572                  * is already accepted it is treated as a connected one below.
 573                  */
 574                 if (fastopen && !fastopen->sk)
 575                         break;
 576
 577                 if (!sock_owned_by_user(sk)) {
 578                         sk->sk_err = err;
 579
 580                         sk->sk_error_report(sk);
 581
 582                         tcp_done(sk);
 583                 } else {
 584                         sk->sk_err_soft = err;
 585                 }
 586                 goto out;
 587         }
 588
 589         /* If we've already connected we will keep trying
 590          * until we time out, or the user gives up.
 591          *
 592          * rfc1122 4.2.3.9 allows to consider as hard errors
 593          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 594          * but it is obsoleted by pmtu discovery).
 595          *
 596          * Note, that in modern internet, where routing is unreliable
 597          * and in each dark corner broken firewalls sit, sending random
 598          * errors ordered by their masters even this two messages finally lose
 599          * their original sense (even Linux sends invalid PORT_UNREACHs)
 600          *
 601          * Now we are in compliance with RFCs.
 602          *                                                      --ANK (980905)
 603          */
 604
 605         inet = inet_sk(sk);
 606         if (!sock_owned_by_user(sk) && inet->recverr) {
 607                 sk->sk_err = err;
 608                 sk->sk_error_report(sk);
 609         } else  { /* Only an error on timeout */
 610                 sk->sk_err_soft = err;
 611         }
 612
 613 out:
 614         bh_unlock_sock(sk);
 615         sock_put(sk);
 616         return 0;
 617 }
 618
 619 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 620 {
 621         struct tcphdr *th = tcp_hdr(skb);
 622
 623         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 624         skb->csum_start = skb_transport_header(skb) - skb->head;
 625         skb->csum_offset = offsetof(struct tcphdr, check);
 626 }
 627
 628 /* This routine computes an IPv4 TCP checksum. */
 629 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 630 {
 631         const struct inet_sock *inet = inet_sk(sk);
 632
 633         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 634 }
 635 EXPORT_SYMBOL(tcp_v4_send_check);
 636
 637 /*
 638  *      This routine will send an RST to the other tcp.
 639  *
 640  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 641  *                    for reset.
 642  *      Answer: if a packet caused RST, it is not for a socket
 643  *              existing in our system, if it is matched to a socket,
 644  *              it is just duplicate segment or bug in other side's TCP.
 645  *              So that we build reply only basing on parameters
 646  *              arrived with segment.
 647  *      Exception: precedence violation. We do not implement it in any case.
 648  */
 649
 650 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 651 {
 652         const struct tcphdr *th = tcp_hdr(skb);
 653         struct {
 654                 struct tcphdr th;
 655 #ifdef CONFIG_TCP_MD5SIG
 656                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 657 #endif
 658         } rep;
 659         struct ip_reply_arg arg;
 660 #ifdef CONFIG_TCP_MD5SIG
 661         struct tcp_md5sig_key *key = NULL;
 662         const __u8 *hash_location = NULL;
 663         unsigned char newhash[16];
 664         int genhash;
 665         struct sock *sk1 = NULL;
 666 #endif
 667         struct net *net;
 668         struct sock *ctl_sk;
 669
 670         /* Never send a reset in response to a reset. */
 671         if (th->rst)
 672                 return;
 673
 674         /* If sk not NULL, it means we did a successful lookup and incoming
 675          * route had to be correct. prequeue might have dropped our dst.
 676          */
 677         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 678                 return;
 679
 680         /* Swap the send and the receive. */
 681         memset(&rep, 0, sizeof(rep));
 682         rep.th.dest   = th->source;
 683         rep.th.source = th->dest;
 684         rep.th.doff   = sizeof(struct tcphdr) / 4;
 685         rep.th.rst    = 1;
 686
 687         if (th->ack) {
 688                 rep.th.seq = th->ack_seq;
 689         } else {
 690                 rep.th.ack = 1;
 691                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 692                                        skb->len - (th->doff << 2));
 693         }
 694
 695         memset(&arg, 0, sizeof(arg));
 696         arg.iov[0].iov_base = (unsigned char *)&rep;
 697         arg.iov[0].iov_len  = sizeof(rep.th);
 698
 699         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 700 #ifdef CONFIG_TCP_MD5SIG
 701         rcu_read_lock();
 702         hash_location = tcp_parse_md5sig_option(th);
 703         if (sk && sk_fullsock(sk)) {
 704                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 705                                         &ip_hdr(skb)->saddr, AF_INET);
 706         } else if (hash_location) {
 707                 /*
 708                  * active side is lost. Try to find listening socket through
 709                  * source port, and then find md5 key through listening socket.
 710                  * we are not loose security here:
 711                  * Incoming packet is checked with md5 hash with finding key,
 712                  * no RST generated if md5 hash doesn't match.
 713                  */
 714                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 715                                              ip_hdr(skb)->saddr,
 716                                              th->source, ip_hdr(skb)->daddr,
 717                                              ntohs(th->source), inet_iif(skb),
 718                                              tcp_v4_sdif(skb));
 719                 /* don't send rst if it can't find key */
 720                 if (!sk1)
 721                         goto out;
 722
 723                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 724                                         &ip_hdr(skb)->saddr, AF_INET);
 725                 if (!key)
 726                         goto out;
 727
 728
 729                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 730                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
 731                         goto out;
 732
 733         }
 734
 735         if (key) {
 736                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 737                                    (TCPOPT_NOP << 16) |
 738                                    (TCPOPT_MD5SIG << 8) |
 739                                    TCPOLEN_MD5SIG);
 740                 /* Update length and the length the header thinks exists */
 741                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 742                 rep.th.doff = arg.iov[0].iov_len / 4;
 743
 744                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 745                                      key, ip_hdr(skb)->saddr,
 746                                      ip_hdr(skb)->daddr, &rep.th);
 747         }
 748 #endif
 749         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 750                                       ip_hdr(skb)->saddr, /* XXX */
 751                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 752         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 753         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 754
 755         /* When socket is gone, all binding information is lost.
 756          * routing might fail in this case. No choice here, if we choose to force
 757          * input interface, we will misroute in case of asymmetric route.
 758          */
 759         if (sk) {
 760                 arg.bound_dev_if = sk->sk_bound_dev_if;
 761                 if (sk_fullsock(sk))
 762                         trace_tcp_send_reset(sk, skb);
 763         }
 764
 765         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 766                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 767
 768         arg.tos = ip_hdr(skb)->tos;
 769         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 770         local_bh_disable();
 771         ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
 772         if (sk)
 773                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 774                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
 775         ip_send_unicast_reply(ctl_sk,
 776                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 777                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 778                               &arg, arg.iov[0].iov_len);
 779
 780         ctl_sk->sk_mark = 0;
 781         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 782         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 783         local_bh_enable();
 784
 785 #ifdef CONFIG_TCP_MD5SIG
 786 out:
 787         rcu_read_unlock();
 788 #endif
 789 }
 790
 791 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 792    outside socket context is ugly, certainly. What can I do?
 793  */
 794
 795 static void tcp_v4_send_ack(const struct sock *sk,
 796                             struct sk_buff *skb, u32 seq, u32 ack,
 797                             u32 win, u32 tsval, u32 tsecr, int oif,
 798                             struct tcp_md5sig_key *key,
 799                             int reply_flags, u8 tos)
 800 {
 801         const struct tcphdr *th = tcp_hdr(skb);
 802         struct {
 803                 struct tcphdr th;
 804                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 805 #ifdef CONFIG_TCP_MD5SIG
 806                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 807 #endif
 808                         ];
 809         } rep;
 810         struct net *net = sock_net(sk);
 811         struct ip_reply_arg arg;
 812         struct sock *ctl_sk;
 813
 814         memset(&rep.th, 0, sizeof(struct tcphdr));
 815         memset(&arg, 0, sizeof(arg));
 816
 817         arg.iov[0].iov_base = (unsigned char *)&rep;
 818         arg.iov[0].iov_len  = sizeof(rep.th);
 819         if (tsecr) {
 820                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 821                                    (TCPOPT_TIMESTAMP << 8) |
 822                                    TCPOLEN_TIMESTAMP);
 823                 rep.opt[1] = htonl(tsval);
 824                 rep.opt[2] = htonl(tsecr);
 825                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 826         }
 827
 828         /* Swap the send and the receive. */
 829         rep.th.dest    = th->source;
 830         rep.th.source  = th->dest;
 831         rep.th.doff    = arg.iov[0].iov_len / 4;
 832         rep.th.seq     = htonl(seq);
 833         rep.th.ack_seq = htonl(ack);
 834         rep.th.ack     = 1;
 835         rep.th.window  = htons(win);
 836
 837 #ifdef CONFIG_TCP_MD5SIG
 838         if (key) {
 839                 int offset = (tsecr) ? 3 : 0;
 840
 841                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 842                                           (TCPOPT_NOP << 16) |
 843                                           (TCPOPT_MD5SIG << 8) |
 844                                           TCPOLEN_MD5SIG);
 845                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 846                 rep.th.doff = arg.iov[0].iov_len/4;
 847
 848                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 849                                     key, ip_hdr(skb)->saddr,
 850                                     ip_hdr(skb)->daddr, &rep.th);
 851         }
 852 #endif
 853         arg.flags = reply_flags;
 854         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 855                                       ip_hdr(skb)->saddr, /* XXX */
 856                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 857         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 858         if (oif)
 859                 arg.bound_dev_if = oif;
 860         arg.tos = tos;
 861         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 862         local_bh_disable();
 863         ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
 864         if (sk)
 865                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 866                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
 867         ip_send_unicast_reply(ctl_sk,
 868                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 869                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 870                               &arg, arg.iov[0].iov_len);
 871
 872         ctl_sk->sk_mark = 0;
 873         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 874         local_bh_enable();
 875 }
 876
 877 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 878 {
 879         struct inet_timewait_sock *tw = inet_twsk(sk);
 880         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 881
 882         tcp_v4_send_ack(sk, skb,
 883                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 884                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 885                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 886                         tcptw->tw_ts_recent,
 887                         tw->tw_bound_dev_if,
 888                         tcp_twsk_md5_key(tcptw),
 889                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 890                         tw->tw_tos
 891                         );
 892
 893         inet_twsk_put(tw);
 894 }
 895
 896 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 897                                   struct request_sock *req)
 898 {
 899         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 900          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 901          */
 902         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 903                                              tcp_sk(sk)->snd_nxt;
 904
 905         /* RFC 7323 2.3
 906          * The window field (SEG.WND) of every outgoing segment, with the
 907          * exception of <SYN> segments, MUST be right-shifted by
 908          * Rcv.Wind.Shift bits:
 909          */
 910         tcp_v4_send_ack(sk, skb, seq,
 911                         tcp_rsk(req)->rcv_nxt,
 912                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 913                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 914                         req->ts_recent,
 915                         0,
 916                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
 917                                           AF_INET),
 918                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 919                         ip_hdr(skb)->tos);
 920 }
 921
 922 /*
 923  *      Send a SYN-ACK after having received a SYN.
 924  *      This still operates on a request_sock only, not on a big
 925  *      socket.
 926  */
 927 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 928                               struct flowi *fl,
 929                               struct request_sock *req,
 930                               struct tcp_fastopen_cookie *foc,
 931                               enum tcp_synack_type synack_type)
 932 {
 933         const struct inet_request_sock *ireq = inet_rsk(req);
 934         struct flowi4 fl4;
 935         int err = -1;
 936         struct sk_buff *skb;
 937
 938         /* First, grab a route. */
 939         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 940                 return -1;
 941
 942         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 943
 944         if (skb) {
 945                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 946
 947                 rcu_read_lock();
 948                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 949                                             ireq->ir_rmt_addr,
 950                                             rcu_dereference(ireq->ireq_opt));
 951                 rcu_read_unlock();
 952                 err = net_xmit_eval(err);
 953         }
 954
 955         return err;
 956 }
 957
 958 /*
 959  *      IPv4 request_sock destructor.
 960  */
 961 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 962 {
 963         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
 964 }
 965
 966 #ifdef CONFIG_TCP_MD5SIG
 967 /*
 968  * RFC2385 MD5 checksumming requires a mapping of
 969  * IP address->MD5 Key.
 970  * We need to maintain these in the sk structure.
 971  */
 972
 973 struct static_key tcp_md5_needed __read_mostly;
 974 EXPORT_SYMBOL(tcp_md5_needed);
 975
 976 /* Find the Key structure for an address.  */
 977 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
 978                                            const union tcp_md5_addr *addr,
 979                                            int family)
 980 {
 981         const struct tcp_sock *tp = tcp_sk(sk);
 982         struct tcp_md5sig_key *key;
 983         const struct tcp_md5sig_info *md5sig;
 984         __be32 mask;
 985         struct tcp_md5sig_key *best_match = NULL;
 986         bool match;
 987
 988         /* caller either holds rcu_read_lock() or socket lock */
 989         md5sig = rcu_dereference_check(tp->md5sig_info,
 990                                        lockdep_sock_is_held(sk));
 991         if (!md5sig)
 992                 return NULL;
 993
 994         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 995                 if (key->family != family)
 996                         continue;
 997
 998                 if (family == AF_INET) {
 999                         mask = inet_make_mask(key->prefixlen);
1000                         match = (key->addr.a4.s_addr & mask) ==
1001                                 (addr->a4.s_addr & mask);
1002 #if IS_ENABLED(CONFIG_IPV6)
1003                 } else if (family == AF_INET6) {
1004                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1005                                                   key->prefixlen);
1006 #endif
1007                 } else {
1008                         match = false;
1009                 }
1010
1011                 if (match && (!best_match ||
1012                               key->prefixlen > best_match->prefixlen))
1013                         best_match = key;
1014         }
1015         return best_match;
1016 }
1017 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1018
1019 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1020                                                       const union tcp_md5_addr *addr,
1021                                                       int family, u8 prefixlen)
1022 {
1023         const struct tcp_sock *tp = tcp_sk(sk);
1024         struct tcp_md5sig_key *key;
1025         unsigned int size = sizeof(struct in_addr);
1026         const struct tcp_md5sig_info *md5sig;
1027
1028         /* caller either holds rcu_read_lock() or socket lock */
1029         md5sig = rcu_dereference_check(tp->md5sig_info,
1030                                        lockdep_sock_is_held(sk));
1031         if (!md5sig)
1032                 return NULL;
1033 #if IS_ENABLED(CONFIG_IPV6)
1034         if (family == AF_INET6)
1035                 size = sizeof(struct in6_addr);
1036 #endif
1037         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1038                 if (key->family != family)
1039                         continue;
1040                 if (!memcmp(&key->addr, addr, size) &&
1041                     key->prefixlen == prefixlen)
1042                         return key;
1043         }
1044         return NULL;
1045 }
1046
1047 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1048                                          const struct sock *addr_sk)
1049 {
1050         const union tcp_md5_addr *addr;
1051
1052         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1053         return tcp_md5_do_lookup(sk, addr, AF_INET);
1054 }
1055 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1056
1057 /* This can be called on a newly created socket, from other files */
1058 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1059                    int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1060                    gfp_t gfp)
1061 {
1062         /* Add Key to the list */
1063         struct tcp_md5sig_key *key;
1064         struct tcp_sock *tp = tcp_sk(sk);
1065         struct tcp_md5sig_info *md5sig;
1066
1067         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1068         if (key) {
1069                 /* Pre-existing entry - just update that one. */
1070                 memcpy(key->key, newkey, newkeylen);
1071                 key->keylen = newkeylen;
1072                 return 0;
1073         }
1074
1075         md5sig = rcu_dereference_protected(tp->md5sig_info,
1076                                            lockdep_sock_is_held(sk));
1077         if (!md5sig) {
1078                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1079                 if (!md5sig)
1080                         return -ENOMEM;
1081
1082                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1083                 INIT_HLIST_HEAD(&md5sig->head);
1084                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1085         }
1086
1087         key = sock_kmalloc(sk, sizeof(*key), gfp);
1088         if (!key)
1089                 return -ENOMEM;
1090         if (!tcp_alloc_md5sig_pool()) {
1091                 sock_kfree_s(sk, key, sizeof(*key));
1092                 return -ENOMEM;
1093         }
1094
1095         memcpy(key->key, newkey, newkeylen);
1096         key->keylen = newkeylen;
1097         key->family = family;
1098         key->prefixlen = prefixlen;
1099         memcpy(&key->addr, addr,
1100                (family == AF_INET6) ? sizeof(struct in6_addr) :
1101                                       sizeof(struct in_addr));
1102         hlist_add_head_rcu(&key->node, &md5sig->head);
1103         return 0;
1104 }
1105 EXPORT_SYMBOL(tcp_md5_do_add);
1106
1107 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1108                    u8 prefixlen)
1109 {
1110         struct tcp_md5sig_key *key;
1111
1112         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1113         if (!key)
1114                 return -ENOENT;
1115         hlist_del_rcu(&key->node);
1116         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1117         kfree_rcu(key, rcu);
1118         return 0;
1119 }
1120 EXPORT_SYMBOL(tcp_md5_do_del);
1121
1122 static void tcp_clear_md5_list(struct sock *sk)
1123 {
1124         struct tcp_sock *tp = tcp_sk(sk);
1125         struct tcp_md5sig_key *key;
1126         struct hlist_node *n;
1127         struct tcp_md5sig_info *md5sig;
1128
1129         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1130
1131         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1132                 hlist_del_rcu(&key->node);
1133                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1134                 kfree_rcu(key, rcu);
1135         }
1136 }
1137
1138 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1139                                  char __user *optval, int optlen)
1140 {
1141         struct tcp_md5sig cmd;
1142         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1143         u8 prefixlen = 32;
1144
1145         if (optlen < sizeof(cmd))
1146                 return -EINVAL;
1147
1148         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1149                 return -EFAULT;
1150
1151         if (sin->sin_family != AF_INET)
1152                 return -EINVAL;
1153
1154         if (optname == TCP_MD5SIG_EXT &&
1155             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1156                 prefixlen = cmd.tcpm_prefixlen;
1157                 if (prefixlen > 32)
1158                         return -EINVAL;
1159         }
1160
1161         if (!cmd.tcpm_keylen)
1162                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1163                                       AF_INET, prefixlen);
1164
1165         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1166                 return -EINVAL;
1167
1168         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1169                               AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1170                               GFP_KERNEL);
1171 }
1172
1173 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1174                                    __be32 daddr, __be32 saddr,
1175                                    const struct tcphdr *th, int nbytes)
1176 {
1177         struct tcp4_pseudohdr *bp;
1178         struct scatterlist sg;
1179         struct tcphdr *_th;
1180
1181         bp = hp->scratch;
1182         bp->saddr = saddr;
1183         bp->daddr = daddr;
1184         bp->pad = 0;
1185         bp->protocol = IPPROTO_TCP;
1186         bp->len = cpu_to_be16(nbytes);
1187
1188         _th = (struct tcphdr *)(bp + 1);
1189         memcpy(_th, th, sizeof(*th));
1190         _th->check = 0;
1191
1192         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1193         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1194                                 sizeof(*bp) + sizeof(*th));
1195         return crypto_ahash_update(hp->md5_req);
1196 }
1197
1198 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1199                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1200 {
1201         struct tcp_md5sig_pool *hp;
1202         struct ahash_request *req;
1203
1204         hp = tcp_get_md5sig_pool();
1205         if (!hp)
1206                 goto clear_hash_noput;
1207         req = hp->md5_req;
1208
1209         if (crypto_ahash_init(req))
1210                 goto clear_hash;
1211         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1212                 goto clear_hash;
1213         if (tcp_md5_hash_key(hp, key))
1214                 goto clear_hash;
1215         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1216         if (crypto_ahash_final(req))
1217                 goto clear_hash;
1218
1219         tcp_put_md5sig_pool();
1220         return 0;
1221
1222 clear_hash:
1223         tcp_put_md5sig_pool();
1224 clear_hash_noput:
1225         memset(md5_hash, 0, 16);
1226         return 1;
1227 }
1228
1229 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1230                         const struct sock *sk,
1231                         const struct sk_buff *skb)
1232 {
1233         struct tcp_md5sig_pool *hp;
1234         struct ahash_request *req;
1235         const struct tcphdr *th = tcp_hdr(skb);
1236         __be32 saddr, daddr;
1237
1238         if (sk) { /* valid for establish/request sockets */
1239                 saddr = sk->sk_rcv_saddr;
1240                 daddr = sk->sk_daddr;
1241         } else {
1242                 const struct iphdr *iph = ip_hdr(skb);
1243                 saddr = iph->saddr;
1244                 daddr = iph->daddr;
1245         }
1246
1247         hp = tcp_get_md5sig_pool();
1248         if (!hp)
1249                 goto clear_hash_noput;
1250         req = hp->md5_req;
1251
1252         if (crypto_ahash_init(req))
1253                 goto clear_hash;
1254
1255         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1256                 goto clear_hash;
1257         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1258                 goto clear_hash;
1259         if (tcp_md5_hash_key(hp, key))
1260                 goto clear_hash;
1261         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1262         if (crypto_ahash_final(req))
1263                 goto clear_hash;
1264
1265         tcp_put_md5sig_pool();
1266         return 0;
1267
1268 clear_hash:
1269         tcp_put_md5sig_pool();
1270 clear_hash_noput:
1271         memset(md5_hash, 0, 16);
1272         return 1;
1273 }
1274 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1275
1276 #endif
1277
1278 /* Called with rcu_read_lock() */
1279 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1280                                     const struct sk_buff *skb)
1281 {
1282 #ifdef CONFIG_TCP_MD5SIG
1283         /*
1284          * This gets called for each TCP segment that arrives
1285          * so we want to be efficient.
1286          * We have 3 drop cases:
1287          * o No MD5 hash and one expected.
1288          * o MD5 hash and we're not expecting one.
1289          * o MD5 hash and its wrong.
1290          */
1291         const __u8 *hash_location = NULL;
1292         struct tcp_md5sig_key *hash_expected;
1293         const struct iphdr *iph = ip_hdr(skb);
1294         const struct tcphdr *th = tcp_hdr(skb);
1295         int genhash;
1296         unsigned char newhash[16];
1297
1298         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1299                                           AF_INET);
1300         hash_location = tcp_parse_md5sig_option(th);
1301
1302         /* We've parsed the options - do we have a hash? */
1303         if (!hash_expected && !hash_location)
1304                 return false;
1305
1306         if (hash_expected && !hash_location) {
1307                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1308                 return true;
1309         }
1310
1311         if (!hash_expected && hash_location) {
1312                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1313                 return true;
1314         }
1315
1316         /* Okay, so this is hash_expected and hash_location -
1317          * so we need to calculate the checksum.
1318          */
1319         genhash = tcp_v4_md5_hash_skb(newhash,
1320                                       hash_expected,
1321                                       NULL, skb);
1322
1323         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1324                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1325                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1326                                      &iph->saddr, ntohs(th->source),
1327                                      &iph->daddr, ntohs(th->dest),
1328                                      genhash ? " tcp_v4_calc_md5_hash failed"
1329                                      : "");
1330                 return true;
1331         }
1332         return false;
1333 #endif
1334         return false;
1335 }
1336
1337 static void tcp_v4_init_req(struct request_sock *req,
1338                             const struct sock *sk_listener,
1339                             struct sk_buff *skb)
1340 {
1341         struct inet_request_sock *ireq = inet_rsk(req);
1342         struct net *net = sock_net(sk_listener);
1343
1344         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1345         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1346         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1347 }
1348
1349 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1350                                           struct flowi *fl,
1351                                           const struct request_sock *req)
1352 {
1353         return inet_csk_route_req(sk, &fl->u.ip4, req);
1354 }
1355
1356 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1357         .family         =       PF_INET,
1358         .obj_size       =       sizeof(struct tcp_request_sock),
1359         .rtx_syn_ack    =       tcp_rtx_synack,
1360         .send_ack       =       tcp_v4_reqsk_send_ack,
1361         .destructor     =       tcp_v4_reqsk_destructor,
1362         .send_reset     =       tcp_v4_send_reset,
1363         .syn_ack_timeout =      tcp_syn_ack_timeout,
1364 };
1365
1366 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1367         .mss_clamp      =       TCP_MSS_DEFAULT,
1368 #ifdef CONFIG_TCP_MD5SIG
1369         .req_md5_lookup =       tcp_v4_md5_lookup,
1370         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1371 #endif
1372         .init_req       =       tcp_v4_init_req,
1373 #ifdef CONFIG_SYN_COOKIES
1374         .cookie_init_seq =      cookie_v4_init_sequence,
1375 #endif
1376         .route_req      =       tcp_v4_route_req,
1377         .init_seq       =       tcp_v4_init_seq,
1378         .init_ts_off    =       tcp_v4_init_ts_off,
1379         .send_synack    =       tcp_v4_send_synack,
1380 };
1381
1382 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1383 {
1384         /* Never answer to SYNs send to broadcast or multicast */
1385         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1386                 goto drop;
1387
1388         return tcp_conn_request(&tcp_request_sock_ops,
1389                                 &tcp_request_sock_ipv4_ops, sk, skb);
1390
1391 drop:
1392         tcp_listendrop(sk);
1393         return 0;
1394 }
1395 EXPORT_SYMBOL(tcp_v4_conn_request);
1396
1397
1398 /*
1399  * The three way handshake has completed - we got a valid synack -
1400  * now create the new socket.
1401  */
1402 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1403                                   struct request_sock *req,
1404                                   struct dst_entry *dst,
1405                                   struct request_sock *req_unhash,
1406                                   bool *own_req)
1407 {
1408         struct inet_request_sock *ireq;
1409         struct inet_sock *newinet;
1410         struct tcp_sock *newtp;
1411         struct sock *newsk;
1412 #ifdef CONFIG_TCP_MD5SIG
1413         struct tcp_md5sig_key *key;
1414 #endif
1415         struct ip_options_rcu *inet_opt;
1416
1417         if (sk_acceptq_is_full(sk))
1418                 goto exit_overflow;
1419
1420         newsk = tcp_create_openreq_child(sk, req, skb);
1421         if (!newsk)
1422                 goto exit_nonewsk;
1423
1424         newsk->sk_gso_type = SKB_GSO_TCPV4;
1425         inet_sk_rx_dst_set(newsk, skb);
1426
1427         newtp                 = tcp_sk(newsk);
1428         newinet               = inet_sk(newsk);
1429         ireq                  = inet_rsk(req);
1430         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1431         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1432         newsk->sk_bound_dev_if = ireq->ir_iif;
1433         newinet->inet_saddr   = ireq->ir_loc_addr;
1434         inet_opt              = rcu_dereference(ireq->ireq_opt);
1435         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1436         newinet->mc_index     = inet_iif(skb);
1437         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1438         newinet->rcv_tos      = ip_hdr(skb)->tos;
1439         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1440         if (inet_opt)
1441                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1442         newinet->inet_id = newtp->write_seq ^ jiffies;
1443
1444         if (!dst) {
1445                 dst = inet_csk_route_child_sock(sk, newsk, req);
1446                 if (!dst)
1447                         goto put_and_exit;
1448         } else {
1449                 /* syncookie case : see end of cookie_v4_check() */
1450         }
1451         sk_setup_caps(newsk, dst);
1452
1453         tcp_ca_openreq_child(newsk, dst);
1454
1455         tcp_sync_mss(newsk, dst_mtu(dst));
1456         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1457
1458         tcp_initialize_rcv_mss(newsk);
1459
1460 #ifdef CONFIG_TCP_MD5SIG
1461         /* Copy over the MD5 key from the original socket */
1462         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1463                                 AF_INET);
1464         if (key) {
1465                 /*
1466                  * We're using one, so create a matching key
1467                  * on the newsk structure. If we fail to get
1468                  * memory, then we end up not copying the key
1469                  * across. Shucks.
1470                  */
1471                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1472                                AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1473                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1474         }
1475 #endif
1476
1477         if (__inet_inherit_port(sk, newsk) < 0)
1478                 goto put_and_exit;
1479         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1480         if (likely(*own_req)) {
1481                 tcp_move_syn(newtp, req);
1482                 ireq->ireq_opt = NULL;
1483         } else {
1484                 newinet->inet_opt = NULL;
1485         }
1486         return newsk;
1487
1488 exit_overflow:
1489         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1490 exit_nonewsk:
1491         dst_release(dst);
1492 exit:
1493         tcp_listendrop(sk);
1494         return NULL;
1495 put_and_exit:
1496         newinet->inet_opt = NULL;
1497         inet_csk_prepare_forced_close(newsk);
1498         tcp_done(newsk);
1499         goto exit;
1500 }
1501 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1502
1503 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1504 {
1505 #ifdef CONFIG_SYN_COOKIES
1506         const struct tcphdr *th = tcp_hdr(skb);
1507
1508         if (!th->syn)
1509                 sk = cookie_v4_check(sk, skb);
1510 #endif
1511         return sk;
1512 }
1513
1514 /* The socket must have it's spinlock held when we get
1515  * here, unless it is a TCP_LISTEN socket.
1516  *
1517  * We have a potential double-lock case here, so even when
1518  * doing backlog processing we use the BH locking scheme.
1519  * This is because we cannot sleep with the original spinlock
1520  * held.
1521  */
1522 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1523 {
1524         struct sock *rsk;
1525
1526         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1527                 struct dst_entry *dst = sk->sk_rx_dst;
1528
1529                 sock_rps_save_rxhash(sk, skb);
1530                 sk_mark_napi_id(sk, skb);
1531                 if (dst) {
1532                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1533                             !dst->ops->check(dst, 0)) {
1534                                 dst_release(dst);
1535                                 sk->sk_rx_dst = NULL;
1536                         }
1537                 }
1538                 tcp_rcv_established(sk, skb);
1539                 return 0;
1540         }
1541
1542         if (tcp_checksum_complete(skb))
1543                 goto csum_err;
1544
1545         if (sk->sk_state == TCP_LISTEN) {
1546                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1547
1548                 if (!nsk)
1549                         goto discard;
1550                 if (nsk != sk) {
1551                         if (tcp_child_process(sk, nsk, skb)) {
1552                                 rsk = nsk;
1553                                 goto reset;
1554                         }
1555                         return 0;
1556                 }
1557         } else
1558                 sock_rps_save_rxhash(sk, skb);
1559
1560         if (tcp_rcv_state_process(sk, skb)) {
1561                 rsk = sk;
1562                 goto reset;
1563         }
1564         return 0;
1565
1566 reset:
1567         tcp_v4_send_reset(rsk, skb);
1568 discard:
1569         kfree_skb(skb);
1570         /* Be careful here. If this function gets more complicated and
1571          * gcc suffers from register pressure on the x86, sk (in %ebx)
1572          * might be destroyed here. This current version compiles correctly,
1573          * but you have been warned.
1574          */
1575         return 0;
1576
1577 csum_err:
1578         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1579         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1580         goto discard;
1581 }
1582 EXPORT_SYMBOL(tcp_v4_do_rcv);
1583
1584 int tcp_v4_early_demux(struct sk_buff *skb)
1585 {
1586         const struct iphdr *iph;
1587         const struct tcphdr *th;
1588         struct sock *sk;
1589
1590         if (skb->pkt_type != PACKET_HOST)
1591                 return 0;
1592
1593         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1594                 return 0;
1595
1596         iph = ip_hdr(skb);
1597         th = tcp_hdr(skb);
1598
1599         if (th->doff < sizeof(struct tcphdr) / 4)
1600                 return 0;
1601
1602         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1603                                        iph->saddr, th->source,
1604                                        iph->daddr, ntohs(th->dest),
1605                                        skb->skb_iif, inet_sdif(skb));
1606         if (sk) {
1607                 skb->sk = sk;
1608                 skb->destructor = sock_edemux;
1609                 if (sk_fullsock(sk)) {
1610                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1611
1612                         if (dst)
1613                                 dst = dst_check(dst, 0);
1614                         if (dst &&
1615                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1616                                 skb_dst_set_noref(skb, dst);
1617                 }
1618         }
1619         return 0;
1620 }
1621
1622 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1623 {
1624         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1625         struct skb_shared_info *shinfo;
1626         const struct tcphdr *th;
1627         struct tcphdr *thtail;
1628         struct sk_buff *tail;
1629         unsigned int hdrlen;
1630         bool fragstolen;
1631         u32 gso_segs;
1632         int delta;
1633
1634         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1635          * we can fix skb->truesize to its real value to avoid future drops.
1636          * This is valid because skb is not yet charged to the socket.
1637          * It has been noticed pure SACK packets were sometimes dropped
1638          * (if cooked by drivers without copybreak feature).
1639          */
1640         skb_condense(skb);
1641
1642         skb_dst_drop(skb);
1643
1644         if (unlikely(tcp_checksum_complete(skb))) {
1645                 bh_unlock_sock(sk);
1646                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1647                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1648                 return true;
1649         }
1650
1651         /* Attempt coalescing to last skb in backlog, even if we are
1652          * above the limits.
1653          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1654          */
1655         th = (const struct tcphdr *)skb->data;
1656         hdrlen = th->doff * 4;
1657         shinfo = skb_shinfo(skb);
1658
1659         if (!shinfo->gso_size)
1660                 shinfo->gso_size = skb->len - hdrlen;
1661
1662         if (!shinfo->gso_segs)
1663                 shinfo->gso_segs = 1;
1664
1665         tail = sk->sk_backlog.tail;
1666         if (!tail)
1667                 goto no_coalesce;
1668         thtail = (struct tcphdr *)tail->data;
1669
1670         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1671             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1672             ((TCP_SKB_CB(tail)->tcp_flags |
1673               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
1674             ((TCP_SKB_CB(tail)->tcp_flags ^
1675               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1676 #ifdef CONFIG_TLS_DEVICE
1677             tail->decrypted != skb->decrypted ||
1678 #endif
1679             thtail->doff != th->doff ||
1680             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1681                 goto no_coalesce;
1682
1683         __skb_pull(skb, hdrlen);
1684         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1685                 thtail->window = th->window;
1686
1687                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1688
1689                 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1690                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1691
1692                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1693
1694                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1695                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1696                         tail->tstamp = skb->tstamp;
1697                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1698                 }
1699
1700                 /* Not as strict as GRO. We only need to carry mss max value */
1701                 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1702                                                  skb_shinfo(tail)->gso_size);
1703
1704                 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1705                 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1706
1707                 sk->sk_backlog.len += delta;
1708                 __NET_INC_STATS(sock_net(sk),
1709                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1710                 kfree_skb_partial(skb, fragstolen);
1711                 return false;
1712         }
1713         __skb_push(skb, hdrlen);
1714
1715 no_coalesce:
1716         /* Only socket owner can try to collapse/prune rx queues
1717          * to reduce memory overhead, so add a little headroom here.
1718          * Few sockets backlog are possibly concurrently non empty.
1719          */
1720         limit += 64*1024;
1721
1722         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1723                 bh_unlock_sock(sk);
1724                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1725                 return true;
1726         }
1727         return false;
1728 }
1729 EXPORT_SYMBOL(tcp_add_backlog);
1730
1731 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1732 {
1733         struct tcphdr *th = (struct tcphdr *)skb->data;
1734         unsigned int eaten = skb->len;
1735         int err;
1736
1737         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1738         if (!err) {
1739                 eaten -= skb->len;
1740                 TCP_SKB_CB(skb)->end_seq -= eaten;
1741         }
1742         return err;
1743 }
1744 EXPORT_SYMBOL(tcp_filter);
1745
1746 static void tcp_v4_restore_cb(struct sk_buff *skb)
1747 {
1748         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1749                 sizeof(struct inet_skb_parm));
1750 }
1751
1752 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1753                            const struct tcphdr *th)
1754 {
1755         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1756          * barrier() makes sure compiler wont play fool^Waliasing games.
1757          */
1758         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1759                 sizeof(struct inet_skb_parm));
1760         barrier();
1761
1762         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1763         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1764                                     skb->len - th->doff * 4);
1765         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1766         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1767         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1768         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1769         TCP_SKB_CB(skb)->sacked  = 0;
1770         TCP_SKB_CB(skb)->has_rxtstamp =
1771                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1772 }
1773
1774 /*
1775  *      From tcp_input.c
1776  */
1777
1778 int tcp_v4_rcv(struct sk_buff *skb)
1779 {
1780         struct net *net = dev_net(skb->dev);
1781         int sdif = inet_sdif(skb);
1782         const struct iphdr *iph;
1783         const struct tcphdr *th;
1784         bool refcounted;
1785         struct sock *sk;
1786         int ret;
1787
1788         if (skb->pkt_type != PACKET_HOST)
1789                 goto discard_it;
1790
1791         /* Count it even if it's bad */
1792         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1793
1794         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1795                 goto discard_it;
1796
1797         th = (const struct tcphdr *)skb->data;
1798
1799         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1800                 goto bad_packet;
1801         if (!pskb_may_pull(skb, th->doff * 4))
1802                 goto discard_it;
1803
1804         /* An explanation is required here, I think.
1805          * Packet length and doff are validated by header prediction,
1806          * provided case of th->doff==0 is eliminated.
1807          * So, we defer the checks. */
1808
1809         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1810                 goto csum_error;
1811
1812         th = (const struct tcphdr *)skb->data;
1813         iph = ip_hdr(skb);
1814 lookup:
1815         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1816                                th->dest, sdif, &refcounted);
1817         if (!sk)
1818                 goto no_tcp_socket;
1819
1820 process:
1821         if (sk->sk_state == TCP_TIME_WAIT)
1822                 goto do_time_wait;
1823
1824         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1825                 struct request_sock *req = inet_reqsk(sk);
1826                 bool req_stolen = false;
1827                 struct sock *nsk;
1828
1829                 sk = req->rsk_listener;
1830                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1831                         sk_drops_add(sk, skb);
1832                         reqsk_put(req);
1833                         goto discard_it;
1834                 }
1835                 if (tcp_checksum_complete(skb)) {
1836                         reqsk_put(req);
1837                         goto csum_error;
1838                 }
1839                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1840                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1841                         goto lookup;
1842                 }
1843                 /* We own a reference on the listener, increase it again
1844                  * as we might lose it too soon.
1845                  */
1846                 sock_hold(sk);
1847                 refcounted = true;
1848                 nsk = NULL;
1849                 if (!tcp_filter(sk, skb)) {
1850                         th = (const struct tcphdr *)skb->data;
1851                         iph = ip_hdr(skb);
1852                         tcp_v4_fill_cb(skb, iph, th);
1853                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1854                 }
1855                 if (!nsk) {
1856                         reqsk_put(req);
1857                         if (req_stolen) {
1858                                 /* Another cpu got exclusive access to req
1859                                  * and created a full blown socket.
1860                                  * Try to feed this packet to this socket
1861                                  * instead of discarding it.
1862                                  */
1863                                 tcp_v4_restore_cb(skb);
1864                                 sock_put(sk);
1865                                 goto lookup;
1866                         }
1867                         goto discard_and_relse;
1868                 }
1869                 if (nsk == sk) {
1870                         reqsk_put(req);
1871                         tcp_v4_restore_cb(skb);
1872                 } else if (tcp_child_process(sk, nsk, skb)) {
1873                         tcp_v4_send_reset(nsk, skb);
1874                         goto discard_and_relse;
1875                 } else {
1876                         sock_put(sk);
1877                         return 0;
1878                 }
1879         }
1880         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1881                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1882                 goto discard_and_relse;
1883         }
1884
1885         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1886                 goto discard_and_relse;
1887
1888         if (tcp_v4_inbound_md5_hash(sk, skb))
1889                 goto discard_and_relse;
1890
1891         nf_reset(skb);
1892
1893         if (tcp_filter(sk, skb))
1894                 goto discard_and_relse;
1895         th = (const struct tcphdr *)skb->data;
1896         iph = ip_hdr(skb);
1897         tcp_v4_fill_cb(skb, iph, th);
1898
1899         skb->dev = NULL;
1900
1901         if (sk->sk_state == TCP_LISTEN) {
1902                 ret = tcp_v4_do_rcv(sk, skb);
1903                 goto put_and_return;
1904         }
1905
1906         sk_incoming_cpu_update(sk);
1907
1908         bh_lock_sock_nested(sk);
1909         tcp_segs_in(tcp_sk(sk), skb);
1910         ret = 0;
1911         if (!sock_owned_by_user(sk)) {
1912                 ret = tcp_v4_do_rcv(sk, skb);
1913         } else if (tcp_add_backlog(sk, skb)) {
1914                 goto discard_and_relse;
1915         }
1916         bh_unlock_sock(sk);
1917
1918 put_and_return:
1919         if (refcounted)
1920                 sock_put(sk);
1921
1922         return ret;
1923
1924 no_tcp_socket:
1925         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1926                 goto discard_it;
1927
1928         tcp_v4_fill_cb(skb, iph, th);
1929
1930         if (tcp_checksum_complete(skb)) {
1931 csum_error:
1932                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1933 bad_packet:
1934                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1935         } else {
1936                 tcp_v4_send_reset(NULL, skb);
1937         }
1938
1939 discard_it:
1940         /* Discard frame. */
1941         kfree_skb(skb);
1942         return 0;
1943
1944 discard_and_relse:
1945         sk_drops_add(sk, skb);
1946         if (refcounted)
1947                 sock_put(sk);
1948         goto discard_it;
1949
1950 do_time_wait:
1951         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1952                 inet_twsk_put(inet_twsk(sk));
1953                 goto discard_it;
1954         }
1955
1956         tcp_v4_fill_cb(skb, iph, th);
1957
1958         if (tcp_checksum_complete(skb)) {
1959                 inet_twsk_put(inet_twsk(sk));
1960                 goto csum_error;
1961         }
1962         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1963         case TCP_TW_SYN: {
1964                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1965                                                         &tcp_hashinfo, skb,
1966                                                         __tcp_hdrlen(th),
1967                                                         iph->saddr, th->source,
1968                                                         iph->daddr, th->dest,
1969                                                         inet_iif(skb),
1970                                                         sdif);
1971                 if (sk2) {
1972                         inet_twsk_deschedule_put(inet_twsk(sk));
1973                         sk = sk2;
1974                         tcp_v4_restore_cb(skb);
1975                         refcounted = false;
1976                         goto process;
1977                 }
1978         }
1979                 /* to ACK */
1980                 /* fall through */
1981         case TCP_TW_ACK:
1982                 tcp_v4_timewait_ack(sk, skb);
1983                 break;
1984         case TCP_TW_RST:
1985                 tcp_v4_send_reset(sk, skb);
1986                 inet_twsk_deschedule_put(inet_twsk(sk));
1987                 goto discard_it;
1988         case TCP_TW_SUCCESS:;
1989         }
1990         goto discard_it;
1991 }
1992
1993 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1994         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1995         .twsk_unique    = tcp_twsk_unique,
1996         .twsk_destructor= tcp_twsk_destructor,
1997 };
1998
1999 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2000 {
2001         struct dst_entry *dst = skb_dst(skb);
2002
2003         if (dst && dst_hold_safe(dst)) {
2004                 sk->sk_rx_dst = dst;
2005                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2006         }
2007 }
2008 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2009
2010 const struct inet_connection_sock_af_ops ipv4_specific = {
2011         .queue_xmit        = ip_queue_xmit,
2012         .send_check        = tcp_v4_send_check,
2013         .rebuild_header    = inet_sk_rebuild_header,
2014         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2015         .conn_request      = tcp_v4_conn_request,
2016         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2017         .net_header_len    = sizeof(struct iphdr),
2018         .setsockopt        = ip_setsockopt,
2019         .getsockopt        = ip_getsockopt,
2020         .addr2sockaddr     = inet_csk_addr2sockaddr,
2021         .sockaddr_len      = sizeof(struct sockaddr_in),
2022 #ifdef CONFIG_COMPAT
2023         .compat_setsockopt = compat_ip_setsockopt,
2024         .compat_getsockopt = compat_ip_getsockopt,
2025 #endif
2026         .mtu_reduced       = tcp_v4_mtu_reduced,
2027 };
2028 EXPORT_SYMBOL(ipv4_specific);
2029
2030 #ifdef CONFIG_TCP_MD5SIG
2031 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2032         .md5_lookup             = tcp_v4_md5_lookup,
2033         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2034         .md5_parse              = tcp_v4_parse_md5_keys,
2035 };
2036 #endif
2037
2038 /* NOTE: A lot of things set to zero explicitly by call to
2039  *       sk_alloc() so need not be done here.
2040  */
2041 static int tcp_v4_init_sock(struct sock *sk)
2042 {
2043         struct inet_connection_sock *icsk = inet_csk(sk);
2044
2045         tcp_init_sock(sk);
2046
2047         icsk->icsk_af_ops = &ipv4_specific;
2048
2049 #ifdef CONFIG_TCP_MD5SIG
2050         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2051 #endif
2052
2053         return 0;
2054 }
2055
2056 void tcp_v4_destroy_sock(struct sock *sk)
2057 {
2058         struct tcp_sock *tp = tcp_sk(sk);
2059
2060         trace_tcp_destroy_sock(sk);
2061
2062         tcp_clear_xmit_timers(sk);
2063
2064         tcp_cleanup_congestion_control(sk);
2065
2066         tcp_cleanup_ulp(sk);
2067
2068         /* Cleanup up the write buffer. */
2069         tcp_write_queue_purge(sk);
2070
2071         /* Check if we want to disable active TFO */
2072         tcp_fastopen_active_disable_ofo_check(sk);
2073
2074         /* Cleans up our, hopefully empty, out_of_order_queue. */
2075         skb_rbtree_purge(&tp->out_of_order_queue);
2076
2077 #ifdef CONFIG_TCP_MD5SIG
2078         /* Clean up the MD5 key list, if any */
2079         if (tp->md5sig_info) {
2080                 tcp_clear_md5_list(sk);
2081                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2082                 tp->md5sig_info = NULL;
2083         }
2084 #endif
2085
2086         /* Clean up a referenced TCP bind bucket. */
2087         if (inet_csk(sk)->icsk_bind_hash)
2088                 inet_put_port(sk);
2089
2090         BUG_ON(tp->fastopen_rsk);
2091
2092         /* If socket is aborted during connect operation */
2093         tcp_free_fastopen_req(tp);
2094         tcp_fastopen_destroy_cipher(sk);
2095         tcp_saved_syn_free(tp);
2096
2097         sk_sockets_allocated_dec(sk);
2098 }
2099 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2100
2101 #ifdef CONFIG_PROC_FS
2102 /* Proc filesystem TCP sock list dumping. */
2103
2104 /*
2105  * Get next listener socket follow cur.  If cur is NULL, get first socket
2106  * starting from bucket given in st->bucket; when st->bucket is zero the
2107  * very first socket in the hash table is returned.
2108  */
2109 static void *listening_get_next(struct seq_file *seq, void *cur)
2110 {
2111         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2112         struct tcp_iter_state *st = seq->private;
2113         struct net *net = seq_file_net(seq);
2114         struct inet_listen_hashbucket *ilb;
2115         struct sock *sk = cur;
2116
2117         if (!sk) {
2118 get_head:
2119                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2120                 spin_lock(&ilb->lock);
2121                 sk = sk_head(&ilb->head);
2122                 st->offset = 0;
2123                 goto get_sk;
2124         }
2125         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2126         ++st->num;
2127         ++st->offset;
2128
2129         sk = sk_next(sk);
2130 get_sk:
2131         sk_for_each_from(sk) {
2132                 if (!net_eq(sock_net(sk), net))
2133                         continue;
2134                 if (sk->sk_family == afinfo->family)
2135                         return sk;
2136         }
2137         spin_unlock(&ilb->lock);
2138         st->offset = 0;
2139         if (++st->bucket < INET_LHTABLE_SIZE)
2140                 goto get_head;
2141         return NULL;
2142 }
2143
2144 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2145 {
2146         struct tcp_iter_state *st = seq->private;
2147         void *rc;
2148
2149         st->bucket = 0;
2150         st->offset = 0;
2151         rc = listening_get_next(seq, NULL);
2152
2153         while (rc && *pos) {
2154                 rc = listening_get_next(seq, rc);
2155                 --*pos;
2156         }
2157         return rc;
2158 }
2159
2160 static inline bool empty_bucket(const struct tcp_iter_state *st)
2161 {
2162         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2163 }
2164
2165 /*
2166  * Get first established socket starting from bucket given in st->bucket.
2167  * If st->bucket is zero, the very first socket in the hash is returned.
2168  */
2169 static void *established_get_first(struct seq_file *seq)
2170 {
2171         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2172         struct tcp_iter_state *st = seq->private;
2173         struct net *net = seq_file_net(seq);
2174         void *rc = NULL;
2175
2176         st->offset = 0;
2177         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2178                 struct sock *sk;
2179                 struct hlist_nulls_node *node;
2180                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2181
2182                 /* Lockless fast path for the common case of empty buckets */
2183                 if (empty_bucket(st))
2184                         continue;
2185
2186                 spin_lock_bh(lock);
2187                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2188                         if (sk->sk_family != afinfo->family ||
2189                             !net_eq(sock_net(sk), net)) {
2190                                 continue;
2191                         }
2192                         rc = sk;
2193                         goto out;
2194                 }
2195                 spin_unlock_bh(lock);
2196         }
2197 out:
2198         return rc;
2199 }
2200
2201 static void *established_get_next(struct seq_file *seq, void *cur)
2202 {
2203         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2204         struct sock *sk = cur;
2205         struct hlist_nulls_node *node;
2206         struct tcp_iter_state *st = seq->private;
2207         struct net *net = seq_file_net(seq);
2208
2209         ++st->num;
2210         ++st->offset;
2211
2212         sk = sk_nulls_next(sk);
2213
2214         sk_nulls_for_each_from(sk, node) {
2215                 if (sk->sk_family == afinfo->family &&
2216                     net_eq(sock_net(sk), net))
2217                         return sk;
2218         }
2219
2220         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2221         ++st->bucket;
2222         return established_get_first(seq);
2223 }
2224
2225 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2226 {
2227         struct tcp_iter_state *st = seq->private;
2228         void *rc;
2229
2230         st->bucket = 0;
2231         rc = established_get_first(seq);
2232
2233         while (rc && pos) {
2234                 rc = established_get_next(seq, rc);
2235                 --pos;
2236         }
2237         return rc;
2238 }
2239
2240 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2241 {
2242         void *rc;
2243         struct tcp_iter_state *st = seq->private;
2244
2245         st->state = TCP_SEQ_STATE_LISTENING;
2246         rc        = listening_get_idx(seq, &pos);
2247
2248         if (!rc) {
2249                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2250                 rc        = established_get_idx(seq, pos);
2251         }
2252
2253         return rc;
2254 }
2255
2256 static void *tcp_seek_last_pos(struct seq_file *seq)
2257 {
2258         struct tcp_iter_state *st = seq->private;
2259         int offset = st->offset;
2260         int orig_num = st->num;
2261         void *rc = NULL;
2262
2263         switch (st->state) {
2264         case TCP_SEQ_STATE_LISTENING:
2265                 if (st->bucket >= INET_LHTABLE_SIZE)
2266                         break;
2267                 st->state = TCP_SEQ_STATE_LISTENING;
2268                 rc = listening_get_next(seq, NULL);
2269                 while (offset-- && rc)
2270                         rc = listening_get_next(seq, rc);
2271                 if (rc)
2272                         break;
2273                 st->bucket = 0;
2274                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2275                 /* Fallthrough */
2276         case TCP_SEQ_STATE_ESTABLISHED:
2277                 if (st->bucket > tcp_hashinfo.ehash_mask)
2278                         break;
2279                 rc = established_get_first(seq);
2280                 while (offset-- && rc)
2281                         rc = established_get_next(seq, rc);
2282         }
2283
2284         st->num = orig_num;
2285
2286         return rc;
2287 }
2288
2289 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2290 {
2291         struct tcp_iter_state *st = seq->private;
2292         void *rc;
2293
2294         if (*pos && *pos == st->last_pos) {
2295                 rc = tcp_seek_last_pos(seq);
2296                 if (rc)
2297                         goto out;
2298         }
2299
2300         st->state = TCP_SEQ_STATE_LISTENING;
2301         st->num = 0;
2302         st->bucket = 0;
2303         st->offset = 0;
2304         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2305
2306 out:
2307         st->last_pos = *pos;
2308         return rc;
2309 }
2310 EXPORT_SYMBOL(tcp_seq_start);
2311
2312 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2313 {
2314         struct tcp_iter_state *st = seq->private;
2315         void *rc = NULL;
2316
2317         if (v == SEQ_START_TOKEN) {
2318                 rc = tcp_get_idx(seq, 0);
2319                 goto out;
2320         }
2321
2322         switch (st->state) {
2323         case TCP_SEQ_STATE_LISTENING:
2324                 rc = listening_get_next(seq, v);
2325                 if (!rc) {
2326                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2327                         st->bucket = 0;
2328                         st->offset = 0;
2329                         rc        = established_get_first(seq);
2330                 }
2331                 break;
2332         case TCP_SEQ_STATE_ESTABLISHED:
2333                 rc = established_get_next(seq, v);
2334                 break;
2335         }
2336 out:
2337         ++*pos;
2338         st->last_pos = *pos;
2339         return rc;
2340 }
2341 EXPORT_SYMBOL(tcp_seq_next);
2342
2343 void tcp_seq_stop(struct seq_file *seq, void *v)
2344 {
2345         struct tcp_iter_state *st = seq->private;
2346
2347         switch (st->state) {
2348         case TCP_SEQ_STATE_LISTENING:
2349                 if (v != SEQ_START_TOKEN)
2350                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2351                 break;
2352         case TCP_SEQ_STATE_ESTABLISHED:
2353                 if (v)
2354                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2355                 break;
2356         }
2357 }
2358 EXPORT_SYMBOL(tcp_seq_stop);
2359
2360 static void get_openreq4(const struct request_sock *req,
2361                          struct seq_file *f, int i)
2362 {
2363         const struct inet_request_sock *ireq = inet_rsk(req);
2364         long delta = req->rsk_timer.expires - jiffies;
2365
2366         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2367                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2368                 i,
2369                 ireq->ir_loc_addr,
2370                 ireq->ir_num,
2371                 ireq->ir_rmt_addr,
2372                 ntohs(ireq->ir_rmt_port),
2373                 TCP_SYN_RECV,
2374                 0, 0, /* could print option size, but that is af dependent. */
2375                 1,    /* timers active (only the expire timer) */
2376                 jiffies_delta_to_clock_t(delta),
2377                 req->num_timeout,
2378                 from_kuid_munged(seq_user_ns(f),
2379                                  sock_i_uid(req->rsk_listener)),
2380                 0,  /* non standard timer */
2381                 0, /* open_requests have no inode */
2382                 0,
2383                 req);
2384 }
2385
2386 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2387 {
2388         int timer_active;
2389         unsigned long timer_expires;
2390         const struct tcp_sock *tp = tcp_sk(sk);
2391         const struct inet_connection_sock *icsk = inet_csk(sk);
2392         const struct inet_sock *inet = inet_sk(sk);
2393         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2394         __be32 dest = inet->inet_daddr;
2395         __be32 src = inet->inet_rcv_saddr;
2396         __u16 destp = ntohs(inet->inet_dport);
2397         __u16 srcp = ntohs(inet->inet_sport);
2398         int rx_queue;
2399         int state;
2400
2401         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2402             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2403             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2404                 timer_active    = 1;
2405                 timer_expires   = icsk->icsk_timeout;
2406         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2407                 timer_active    = 4;
2408                 timer_expires   = icsk->icsk_timeout;
2409         } else if (timer_pending(&sk->sk_timer)) {
2410                 timer_active    = 2;
2411                 timer_expires   = sk->sk_timer.expires;
2412         } else {
2413                 timer_active    = 0;
2414                 timer_expires = jiffies;
2415         }
2416
2417         state = inet_sk_state_load(sk);
2418         if (state == TCP_LISTEN)
2419                 rx_queue = sk->sk_ack_backlog;
2420         else
2421                 /* Because we don't lock the socket,
2422                  * we might find a transient negative value.
2423                  */
2424                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2425
2426         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2427                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2428                 i, src, srcp, dest, destp, state,
2429                 tp->write_seq - tp->snd_una,
2430                 rx_queue,
2431                 timer_active,
2432                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2433                 icsk->icsk_retransmits,
2434                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2435                 icsk->icsk_probes_out,
2436                 sock_i_ino(sk),
2437                 refcount_read(&sk->sk_refcnt), sk,
2438                 jiffies_to_clock_t(icsk->icsk_rto),
2439                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2440                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2441                 tp->snd_cwnd,
2442                 state == TCP_LISTEN ?
2443                     fastopenq->max_qlen :
2444                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2445 }
2446
2447 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2448                                struct seq_file *f, int i)
2449 {
2450         long delta = tw->tw_timer.expires - jiffies;
2451         __be32 dest, src;
2452         __u16 destp, srcp;
2453
2454         dest  = tw->tw_daddr;
2455         src   = tw->tw_rcv_saddr;
2456         destp = ntohs(tw->tw_dport);
2457         srcp  = ntohs(tw->tw_sport);
2458
2459         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2460                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2461                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2462                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2463                 refcount_read(&tw->tw_refcnt), tw);
2464 }
2465
2466 #define TMPSZ 150
2467
2468 static int tcp4_seq_show(struct seq_file *seq, void *v)
2469 {
2470         struct tcp_iter_state *st;
2471         struct sock *sk = v;
2472
2473         seq_setwidth(seq, TMPSZ - 1);
2474         if (v == SEQ_START_TOKEN) {
2475                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2476                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2477                            "inode");
2478                 goto out;
2479         }
2480         st = seq->private;
2481
2482         if (sk->sk_state == TCP_TIME_WAIT)
2483                 get_timewait4_sock(v, seq, st->num);
2484         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2485                 get_openreq4(v, seq, st->num);
2486         else
2487                 get_tcp4_sock(v, seq, st->num);
2488 out:
2489         seq_pad(seq, '\n');
2490         return 0;
2491 }
2492
2493 static const struct seq_operations tcp4_seq_ops = {
2494         .show           = tcp4_seq_show,
2495         .start          = tcp_seq_start,
2496         .next           = tcp_seq_next,
2497         .stop           = tcp_seq_stop,
2498 };
2499
2500 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2501         .family         = AF_INET,
2502 };
2503
2504 static int __net_init tcp4_proc_init_net(struct net *net)
2505 {
2506         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2507                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2508                 return -ENOMEM;
2509         return 0;
2510 }
2511
2512 static void __net_exit tcp4_proc_exit_net(struct net *net)
2513 {
2514         remove_proc_entry("tcp", net->proc_net);
2515 }
2516
2517 static struct pernet_operations tcp4_net_ops = {
2518         .init = tcp4_proc_init_net,
2519         .exit = tcp4_proc_exit_net,
2520 };
2521
2522 int __init tcp4_proc_init(void)
2523 {
2524         return register_pernet_subsys(&tcp4_net_ops);
2525 }
2526
2527 void tcp4_proc_exit(void)
2528 {
2529         unregister_pernet_subsys(&tcp4_net_ops);
2530 }
2531 #endif /* CONFIG_PROC_FS */
2532
2533 struct proto tcp_prot = {
2534         .name                   = "TCP",
2535         .owner                  = THIS_MODULE,
2536         .close                  = tcp_close,
2537         .pre_connect            = tcp_v4_pre_connect,
2538         .connect                = tcp_v4_connect,
2539         .disconnect             = tcp_disconnect,
2540         .accept                 = inet_csk_accept,
2541         .ioctl                  = tcp_ioctl,
2542         .init                   = tcp_v4_init_sock,
2543         .destroy                = tcp_v4_destroy_sock,
2544         .shutdown               = tcp_shutdown,
2545         .setsockopt             = tcp_setsockopt,
2546         .getsockopt             = tcp_getsockopt,
2547         .keepalive              = tcp_set_keepalive,
2548         .recvmsg                = tcp_recvmsg,
2549         .sendmsg                = tcp_sendmsg,
2550         .sendpage               = tcp_sendpage,
2551         .backlog_rcv            = tcp_v4_do_rcv,
2552         .release_cb             = tcp_release_cb,
2553         .hash                   = inet_hash,
2554         .unhash                 = inet_unhash,
2555         .get_port               = inet_csk_get_port,
2556         .enter_memory_pressure  = tcp_enter_memory_pressure,
2557         .leave_memory_pressure  = tcp_leave_memory_pressure,
2558         .stream_memory_free     = tcp_stream_memory_free,
2559         .sockets_allocated      = &tcp_sockets_allocated,
2560         .orphan_count           = &tcp_orphan_count,
2561         .memory_allocated       = &tcp_memory_allocated,
2562         .memory_pressure        = &tcp_memory_pressure,
2563         .sysctl_mem             = sysctl_tcp_mem,
2564         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2565         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2566         .max_header             = MAX_TCP_HEADER,
2567         .obj_size               = sizeof(struct tcp_sock),
2568         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2569         .twsk_prot              = &tcp_timewait_sock_ops,
2570         .rsk_prot               = &tcp_request_sock_ops,
2571         .h.hashinfo             = &tcp_hashinfo,
2572         .no_autobind            = true,
2573 #ifdef CONFIG_COMPAT
2574         .compat_setsockopt      = compat_tcp_setsockopt,
2575         .compat_getsockopt      = compat_tcp_getsockopt,
2576 #endif
2577         .diag_destroy           = tcp_abort,
2578 };
2579 EXPORT_SYMBOL(tcp_prot);
2580
2581 static void __net_exit tcp_sk_exit(struct net *net)
2582 {
2583         int cpu;
2584
2585         module_put(net->ipv4.tcp_congestion_control->owner);
2586
2587         for_each_possible_cpu(cpu)
2588                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2589         free_percpu(net->ipv4.tcp_sk);
2590 }
2591
2592 static int __net_init tcp_sk_init(struct net *net)
2593 {
2594         int res, cpu, cnt;
2595
2596         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2597         if (!net->ipv4.tcp_sk)
2598                 return -ENOMEM;
2599
2600         for_each_possible_cpu(cpu) {
2601                 struct sock *sk;
2602
2603                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2604                                            IPPROTO_TCP, net);
2605                 if (res)
2606                         goto fail;
2607                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2608
2609                 /* Please enforce IP_DF and IPID==0 for RST and
2610                  * ACK sent in SYN-RECV and TIME-WAIT state.
2611                  */
2612                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2613
2614                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2615         }
2616
2617         net->ipv4.sysctl_tcp_ecn = 2;
2618         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2619
2620         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2621         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2622         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2623
2624         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2625         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2626         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2627
2628         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2629         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2630         net->ipv4.sysctl_tcp_syncookies = 1;
2631         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2632         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2633         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2634         net->ipv4.sysctl_tcp_orphan_retries = 0;
2635         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2636         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2637         net->ipv4.sysctl_tcp_tw_reuse = 2;
2638
2639         cnt = tcp_hashinfo.ehash_mask + 1;
2640         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2641         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2642
2643         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2644         net->ipv4.sysctl_tcp_sack = 1;
2645         net->ipv4.sysctl_tcp_window_scaling = 1;
2646         net->ipv4.sysctl_tcp_timestamps = 1;
2647         net->ipv4.sysctl_tcp_early_retrans = 3;
2648         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2649         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2650         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2651         net->ipv4.sysctl_tcp_max_reordering = 300;
2652         net->ipv4.sysctl_tcp_dsack = 1;
2653         net->ipv4.sysctl_tcp_app_win = 31;
2654         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2655         net->ipv4.sysctl_tcp_frto = 2;
2656         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2657         /* This limits the percentage of the congestion window which we
2658          * will allow a single TSO frame to consume.  Building TSO frames
2659          * which are too large can cause TCP streams to be bursty.
2660          */
2661         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2662         /* Default TSQ limit of 16 TSO segments */
2663         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2664         /* rfc5961 challenge ack rate limiting */
2665         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2666         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2667         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2668         net->ipv4.sysctl_tcp_autocorking = 1;
2669         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2670         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2671         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2672         if (net != &init_net) {
2673                 memcpy(net->ipv4.sysctl_tcp_rmem,
2674                        init_net.ipv4.sysctl_tcp_rmem,
2675                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2676                 memcpy(net->ipv4.sysctl_tcp_wmem,
2677                        init_net.ipv4.sysctl_tcp_wmem,
2678                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2679         }
2680         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2681         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2682         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2683         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2684         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2685         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2686
2687         /* Reno is always built in */
2688         if (!net_eq(net, &init_net) &&
2689             try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2690                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2691         else
2692                 net->ipv4.tcp_congestion_control = &tcp_reno;
2693
2694         return 0;
2695 fail:
2696         tcp_sk_exit(net);
2697
2698         return res;
2699 }
2700
2701 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2702 {
2703         struct net *net;
2704
2705         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2706
2707         list_for_each_entry(net, net_exit_list, exit_list)
2708                 tcp_fastopen_ctx_destroy(net);
2709 }
2710
2711 static struct pernet_operations __net_initdata tcp_sk_ops = {
2712        .init       = tcp_sk_init,
2713        .exit       = tcp_sk_exit,
2714        .exit_batch = tcp_sk_exit_batch,
2715 };
2716
2717 void __init tcp_v4_init(void)
2718 {
2719         if (register_pernet_subsys(&tcp_sk_ops))
2720                 panic("Failed to create the TCP control socket.\n");
2721 }