net: tcp: remove BUG_ON from tcp_v4_err
[linux-2.6-microblaze.git] net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87
88 #include <trace/events/tcp.h>
89
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97
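/* Pick the initial sequence number for a new connection from the skb's
 * addresses and ports via secure_tcp_seq().
 */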
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100         return secure_tcp_seq(ip_hdr(skb)->daddr,
101                               ip_hdr(skb)->saddr,
102                               tcp_hdr(skb)->dest,
103                               tcp_hdr(skb)->source);
104 }
105
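/* Compute the per-connection timestamp offset from the address pair. */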
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113         const struct inet_timewait_sock *tw = inet_twsk(sktw);
114         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115         struct tcp_sock *tp = tcp_sk(sk);
116         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
117
118         if (reuse == 2) {
119                 /* Still does not detect *everything* that goes through
120                  * lo, since we require a loopback src or dst address
121                  * or direct binding to 'lo' interface.
122                  */
123                 bool loopback = false;
124                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
125                         loopback = true;
126 #if IS_ENABLED(CONFIG_IPV6)
127                 if (tw->tw_family == AF_INET6) {
128                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129                             (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
130                              (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
131                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132                             (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
133                              (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
134                                 loopback = true;
135                 } else
136 #endif
137                 {
138                         if (ipv4_is_loopback(tw->tw_daddr) ||
139                             ipv4_is_loopback(tw->tw_rcv_saddr))
140                                 loopback = true;
141                 }
142                 if (!loopback)
143                         reuse = 0;
144         }
145
146         /* With PAWS, it is safe from the viewpoint
147            of data integrity. Even without PAWS it is safe provided sequence
148            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
149
150            Actually, the idea is close to VJ's: only the timestamp cache is
151            held not per host but per port pair, and the TW bucket is used as
152            the state holder.
153
154            If the TW bucket has already been destroyed we fall back to VJ's
155            scheme and use the initial timestamp retrieved from the peer table.
156          */
157         if (tcptw->tw_ts_recent_stamp &&
158             (!twp || (reuse && time_after32(ktime_get_seconds(),
159                                             tcptw->tw_ts_recent_stamp)))) {
160                 /* In case of repair and re-using TIME-WAIT sockets we still
161                  * want to be sure that it is safe as above but honor the
162                  * sequence numbers and time stamps set as part of the repair
163                  * process.
164                  *
165                  * Without this check re-using a TIME-WAIT socket with TCP
166                  * repair would accumulate a -1 on the repair assigned
167                  * sequence number. The first time it is reused the sequence
168                  * is -1, the second time -2, etc. This fixes that issue
169                  * without appearing to create any others.
170                  */
171                 if (likely(!tp->repair)) {
172                         tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
173                         if (tp->write_seq == 0)
174                                 tp->write_seq = 1;
175                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
176                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
177                 }
178                 sock_hold(sktw);
179                 return 1;
180         }
181
182         return 0;
183 }
184 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
185
186 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
187                               int addr_len)
188 {
189         /* This check is replicated from tcp_v4_connect() and intended to
190          * prevent the BPF program called below from accessing bytes that are
191          * outside the bound specified by the user in addr_len.
192          */
193         if (addr_len < sizeof(struct sockaddr_in))
194                 return -EINVAL;
195
196         sock_owned_by_me(sk);
197
198         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
199 }
200
201 /* This will initiate an outgoing connection. */
202 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
203 {
204         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
205         struct inet_sock *inet = inet_sk(sk);
206         struct tcp_sock *tp = tcp_sk(sk);
207         __be16 orig_sport, orig_dport;
208         __be32 daddr, nexthop;
209         struct flowi4 *fl4;
210         struct rtable *rt;
211         int err;
212         struct ip_options_rcu *inet_opt;
213         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
214
215         if (addr_len < sizeof(struct sockaddr_in))
216                 return -EINVAL;
217
218         if (usin->sin_family != AF_INET)
219                 return -EAFNOSUPPORT;
220
221         nexthop = daddr = usin->sin_addr.s_addr;
222         inet_opt = rcu_dereference_protected(inet->inet_opt,
223                                              lockdep_sock_is_held(sk));
224         if (inet_opt && inet_opt->opt.srr) {
225                 if (!daddr)
226                         return -EINVAL;
227                 nexthop = inet_opt->opt.faddr;
228         }
229
230         orig_sport = inet->inet_sport;
231         orig_dport = usin->sin_port;
232         fl4 = &inet->cork.fl.u.ip4;
233         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
235                               IPPROTO_TCP,
236                               orig_sport, orig_dport, sk);
237         if (IS_ERR(rt)) {
238                 err = PTR_ERR(rt);
239                 if (err == -ENETUNREACH)
240                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
241                 return err;
242         }
243
244         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
245                 ip_rt_put(rt);
246                 return -ENETUNREACH;
247         }
248
249         if (!inet_opt || !inet_opt->opt.srr)
250                 daddr = fl4->daddr;
251
252         if (!inet->inet_saddr)
253                 inet->inet_saddr = fl4->saddr;
254         sk_rcv_saddr_set(sk, inet->inet_saddr);
255
256         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
257                 /* Reset inherited state */
258                 tp->rx_opt.ts_recent       = 0;
259                 tp->rx_opt.ts_recent_stamp = 0;
260                 if (likely(!tp->repair))
261                         tp->write_seq      = 0;
262         }
263
264         inet->inet_dport = usin->sin_port;
265         sk_daddr_set(sk, daddr);
266
267         inet_csk(sk)->icsk_ext_hdr_len = 0;
268         if (inet_opt)
269                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
270
271         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
272
273         /* Socket identity is still unknown (sport may be zero).
274          * However we set state to SYN-SENT and, without releasing the socket
275          * lock, select a source port, enter ourselves into the hash tables and
276          * complete initialization after this.
277          */
278         tcp_set_state(sk, TCP_SYN_SENT);
279         err = inet_hash_connect(tcp_death_row, sk);
280         if (err)
281                 goto failure;
282
283         sk_set_txhash(sk);
284
285         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
286                                inet->inet_sport, inet->inet_dport, sk);
287         if (IS_ERR(rt)) {
288                 err = PTR_ERR(rt);
289                 rt = NULL;
290                 goto failure;
291         }
292         /* OK, now commit destination to socket.  */
293         sk->sk_gso_type = SKB_GSO_TCPV4;
294         sk_setup_caps(sk, &rt->dst);
295         rt = NULL;
296
297         if (likely(!tp->repair)) {
298                 if (!tp->write_seq)
299                         tp->write_seq = secure_tcp_seq(inet->inet_saddr,
300                                                        inet->inet_daddr,
301                                                        inet->inet_sport,
302                                                        usin->sin_port);
303                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
304                                                  inet->inet_saddr,
305                                                  inet->inet_daddr);
306         }
307
308         inet->inet_id = tp->write_seq ^ jiffies;
309
310         if (tcp_fastopen_defer_connect(sk, &err))
311                 return err;
312         if (err)
313                 goto failure;
314
315         err = tcp_connect(sk);
316
317         if (err)
318                 goto failure;
319
320         return 0;
321
322 failure:
323         /*
324          * This unhashes the socket and releases the local port,
325          * if necessary.
326          */
327         tcp_set_state(sk, TCP_CLOSE);
328         ip_rt_put(rt);
329         sk->sk_route_caps = 0;
330         inet->inet_dport = 0;
331         return err;
332 }
333 EXPORT_SYMBOL(tcp_v4_connect);
334
335 /*
336  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
337  * It can be called through tcp_release_cb() if the socket was owned by the
338  * user at the time tcp_v4_err() was called to handle the ICMP message.
339  */
340 void tcp_v4_mtu_reduced(struct sock *sk)
341 {
342         struct inet_sock *inet = inet_sk(sk);
343         struct dst_entry *dst;
344         u32 mtu;
345
346         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
347                 return;
348         mtu = tcp_sk(sk)->mtu_info;
349         dst = inet_csk_update_pmtu(sk, mtu);
350         if (!dst)
351                 return;
352
353         /* Something is about to go wrong... Remember the soft error
354          * in case this connection is not able to recover.
355          */
356         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
357                 sk->sk_err_soft = EMSGSIZE;
358
359         mtu = dst_mtu(dst);
360
361         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
362             ip_sk_accept_pmtu(sk) &&
363             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
364                 tcp_sync_mss(sk, mtu);
365
366                 /* Resend the TCP packet because it's
367                  * clear that the old packet has been
368                  * dropped. This is the new "fast" path mtu
369                  * discovery.
370                  */
371                 tcp_simple_retransmit(sk);
372         } /* else let the usual retransmit timer handle it */
373 }
374 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
375
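/* If the socket still holds a valid cached route, let its dst_ops
 * process the ICMP redirect.
 */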
376 static void do_redirect(struct sk_buff *skb, struct sock *sk)
377 {
378         struct dst_entry *dst = __sk_dst_check(sk, 0);
379
380         if (dst)
381                 dst->ops->redirect(dst, sk, skb);
382 }
383
384
385 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
386 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
387 {
388         struct request_sock *req = inet_reqsk(sk);
389         struct net *net = sock_net(sk);
390
391         /* ICMPs are not backlogged, hence we cannot get
392          * an established socket here.
393          */
394         if (seq != tcp_rsk(req)->snt_isn) {
395                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
396         } else if (abort) {
397                 /*
398                  * Still in SYN_RECV, just remove it silently.
399                  * There is no good way to pass the error to the newly
400                  * created socket, and POSIX does not want network
401                  * errors returned from accept().
402                  */
403                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
404                 tcp_listendrop(req->rsk_listener);
405         }
406         reqsk_put(req);
407 }
408 EXPORT_SYMBOL(tcp_req_err);
409
410 /*
411  * This routine is called by the ICMP module when it gets some
412  * sort of error condition.  If err < 0 then the socket should
413  * be closed and the error returned to the user.  If err > 0
414  * it's just the icmp type << 8 | icmp code.  After adjustment
415  * header points to the first 8 bytes of the tcp header.  We need
416  * to find the appropriate port.
417  *
418  * The locking strategy used here is very "optimistic". When
419  * someone else accesses the socket the ICMP is just dropped
420  * and for some paths there is no check at all.
421  * A more general error queue to queue errors for later handling
422  * is probably better.
423  *
424  */
425
426 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
427 {
428         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
429         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
430         struct inet_connection_sock *icsk;
431         struct tcp_sock *tp;
432         struct inet_sock *inet;
433         const int type = icmp_hdr(icmp_skb)->type;
434         const int code = icmp_hdr(icmp_skb)->code;
435         struct sock *sk;
436         struct sk_buff *skb;
437         struct request_sock *fastopen;
438         u32 seq, snd_una;
439         s32 remaining;
440         u32 delta_us;
441         int err;
442         struct net *net = dev_net(icmp_skb->dev);
443
444         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
445                                        th->dest, iph->saddr, ntohs(th->source),
446                                        inet_iif(icmp_skb), 0);
447         if (!sk) {
448                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
449                 return -ENOENT;
450         }
451         if (sk->sk_state == TCP_TIME_WAIT) {
452                 inet_twsk_put(inet_twsk(sk));
453                 return 0;
454         }
455         seq = ntohl(th->seq);
456         if (sk->sk_state == TCP_NEW_SYN_RECV) {
457                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
458                                      type == ICMP_TIME_EXCEEDED ||
459                                      (type == ICMP_DEST_UNREACH &&
460                                       (code == ICMP_NET_UNREACH ||
461                                        code == ICMP_HOST_UNREACH)));
462                 return 0;
463         }
464
465         bh_lock_sock(sk);
466         /* If too many ICMPs get dropped on busy
467          * servers this needs to be solved differently.
468          * We do take care of the PMTU discovery (RFC1191) special case:
469          * we can receive locally generated ICMP messages while the socket is held.
470          */
471         if (sock_owned_by_user(sk)) {
472                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
473                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
474         }
475         if (sk->sk_state == TCP_CLOSE)
476                 goto out;
477
478         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
479                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
480                 goto out;
481         }
482
483         icsk = inet_csk(sk);
484         tp = tcp_sk(sk);
485         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
486         fastopen = tp->fastopen_rsk;
487         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
488         if (sk->sk_state != TCP_LISTEN &&
489             !between(seq, snd_una, tp->snd_nxt)) {
490                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
491                 goto out;
492         }
493
494         switch (type) {
495         case ICMP_REDIRECT:
496                 if (!sock_owned_by_user(sk))
497                         do_redirect(icmp_skb, sk);
498                 goto out;
499         case ICMP_SOURCE_QUENCH:
500                 /* Just silently ignore these. */
501                 goto out;
502         case ICMP_PARAMETERPROB:
503                 err = EPROTO;
504                 break;
505         case ICMP_DEST_UNREACH:
506                 if (code > NR_ICMP_UNREACH)
507                         goto out;
508
509                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
510                         /* We are not interested in TCP_LISTEN and open_requests
511                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
512                          * they should go through unfragmented).
513                          */
514                         if (sk->sk_state == TCP_LISTEN)
515                                 goto out;
516
517                         tp->mtu_info = info;
518                         if (!sock_owned_by_user(sk)) {
519                                 tcp_v4_mtu_reduced(sk);
520                         } else {
521                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
522                                         sock_hold(sk);
523                         }
524                         goto out;
525                 }
526
527                 err = icmp_err_convert[code].errno;
528                 /* check if icmp_skb allows revert of backoff
529                  * (see draft-zimmermann-tcp-lcd) */
530                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
531                         break;
532                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
533                     !icsk->icsk_backoff || fastopen)
534                         break;
535
536                 if (sock_owned_by_user(sk))
537                         break;
538
539                 icsk->icsk_backoff--;
540                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
541                                                TCP_TIMEOUT_INIT;
542                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
543
544                 skb = tcp_rtx_queue_head(sk);
545
546                 tcp_mstamp_refresh(tp);
547                 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
548                 remaining = icsk->icsk_rto -
549                             usecs_to_jiffies(delta_us);
550
551                 if (remaining > 0) {
552                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
553                                                   remaining, TCP_RTO_MAX);
554                 } else {
555                         /* The RTO revert clocked out the retransmission.
556                          * Retransmit now. */
557                         tcp_retransmit_timer(sk);
558                 }
559
560                 break;
561         case ICMP_TIME_EXCEEDED:
562                 err = EHOSTUNREACH;
563                 break;
564         default:
565                 goto out;
566         }
567
568         switch (sk->sk_state) {
569         case TCP_SYN_SENT:
570         case TCP_SYN_RECV:
571                 /* Only in fast or simultaneous open. If a fast open socket is
572                  * already accepted it is treated as a connected one below.
573                  */
574                 if (fastopen && !fastopen->sk)
575                         break;
576
577                 if (!sock_owned_by_user(sk)) {
578                         sk->sk_err = err;
579
580                         sk->sk_error_report(sk);
581
582                         tcp_done(sk);
583                 } else {
584                         sk->sk_err_soft = err;
585                 }
586                 goto out;
587         }
588
589         /* If we've already connected we will keep trying
590          * until we time out, or the user gives up.
591          *
592          * rfc1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
593          * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
594          * but it is obsoleted by pmtu discovery).
595          *
596          * Note that in the modern internet, where routing is unreliable
597          * and broken firewalls sit in every dark corner sending random
598          * errors ordered by their masters, even these two messages finally lose
599          * their original sense (even Linux sends invalid PORT_UNREACHs).
600          *
601          * Now we are in compliance with RFCs.
602          *                                                      --ANK (980905)
603          */
604
605         inet = inet_sk(sk);
606         if (!sock_owned_by_user(sk) && inet->recverr) {
607                 sk->sk_err = err;
608                 sk->sk_error_report(sk);
609         } else  { /* Only an error on timeout */
610                 sk->sk_err_soft = err;
611         }
612
613 out:
614         bh_unlock_sock(sk);
615         sock_put(sk);
616         return 0;
617 }
618
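/* Seed th->check with the (inverted) pseudo-header checksum and record the
 * transport header offsets needed to complete the checksum later
 * (CHECKSUM_PARTIAL convention).
 */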
619 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
620 {
621         struct tcphdr *th = tcp_hdr(skb);
622
623         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
624         skb->csum_start = skb_transport_header(skb) - skb->head;
625         skb->csum_offset = offsetof(struct tcphdr, check);
626 }
627
628 /* This routine computes an IPv4 TCP checksum. */
629 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
630 {
631         const struct inet_sock *inet = inet_sk(sk);
632
633         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
634 }
635 EXPORT_SYMBOL(tcp_v4_send_check);
636
637 /*
638  *      This routine will send an RST to the other tcp.
639  *
640  *      Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
641  *                    for the reset?
642  *      Answer: if a packet caused an RST, it is not for a socket
643  *              existing in our system; if it is matched to a socket,
644  *              it is just a duplicate segment or a bug in the other side's TCP.
645  *              So we build the reply based only on the parameters
646  *              that arrived with the segment.
647  *      Exception: precedence violation. We do not implement it in any case.
648  */
649
650 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
651 {
652         const struct tcphdr *th = tcp_hdr(skb);
653         struct {
654                 struct tcphdr th;
655 #ifdef CONFIG_TCP_MD5SIG
656                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
657 #endif
658         } rep;
659         struct ip_reply_arg arg;
660 #ifdef CONFIG_TCP_MD5SIG
661         struct tcp_md5sig_key *key = NULL;
662         const __u8 *hash_location = NULL;
663         unsigned char newhash[16];
664         int genhash;
665         struct sock *sk1 = NULL;
666 #endif
667         struct net *net;
668         struct sock *ctl_sk;
669
670         /* Never send a reset in response to a reset. */
671         if (th->rst)
672                 return;
673
674         /* If sk is not NULL, it means we did a successful lookup and the
675          * incoming route had to be correct. prequeue might have dropped our dst.
676          */
677         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
678                 return;
679
680         /* Swap the send and the receive. */
681         memset(&rep, 0, sizeof(rep));
682         rep.th.dest   = th->source;
683         rep.th.source = th->dest;
684         rep.th.doff   = sizeof(struct tcphdr) / 4;
685         rep.th.rst    = 1;
686
687         if (th->ack) {
688                 rep.th.seq = th->ack_seq;
689         } else {
690                 rep.th.ack = 1;
691                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
692                                        skb->len - (th->doff << 2));
693         }
694
695         memset(&arg, 0, sizeof(arg));
696         arg.iov[0].iov_base = (unsigned char *)&rep;
697         arg.iov[0].iov_len  = sizeof(rep.th);
698
699         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
700 #ifdef CONFIG_TCP_MD5SIG
701         rcu_read_lock();
702         hash_location = tcp_parse_md5sig_option(th);
703         if (sk && sk_fullsock(sk)) {
704                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
705                                         &ip_hdr(skb)->saddr, AF_INET);
706         } else if (hash_location) {
707                 /*
708                  * The active side is lost. Try to find a listening socket through
709                  * the source port, and then find the md5 key through that socket.
710                  * We do not lose security here:
711                  * the incoming packet is checked against the md5 hash of the key we find,
712                  * and no RST is generated if the md5 hash doesn't match.
713                  */
714                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
715                                              ip_hdr(skb)->saddr,
716                                              th->source, ip_hdr(skb)->daddr,
717                                              ntohs(th->source), inet_iif(skb),
718                                              tcp_v4_sdif(skb));
719                 /* don't send rst if it can't find key */
720                 if (!sk1)
721                         goto out;
722
723                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
724                                         &ip_hdr(skb)->saddr, AF_INET);
725                 if (!key)
726                         goto out;
727
728
729                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
730                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
731                         goto out;
732
733         }
734
735         if (key) {
736                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
737                                    (TCPOPT_NOP << 16) |
738                                    (TCPOPT_MD5SIG << 8) |
739                                    TCPOLEN_MD5SIG);
740                 /* Update length and the length the header thinks exists */
741                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
742                 rep.th.doff = arg.iov[0].iov_len / 4;
743
744                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
745                                      key, ip_hdr(skb)->saddr,
746                                      ip_hdr(skb)->daddr, &rep.th);
747         }
748 #endif
749         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
750                                       ip_hdr(skb)->saddr, /* XXX */
751                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
752         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
753         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
754
755         /* When the socket is gone, all binding information is lost and
756          * routing might fail. No choice here: if we force the
757          * input interface, we will misroute in case of an asymmetric route.
758          */
759         if (sk) {
760                 arg.bound_dev_if = sk->sk_bound_dev_if;
761                 if (sk_fullsock(sk))
762                         trace_tcp_send_reset(sk, skb);
763         }
764
765         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
766                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
767
768         arg.tos = ip_hdr(skb)->tos;
769         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
770         local_bh_disable();
771         ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
772         if (sk)
773                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
774                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
775         ip_send_unicast_reply(ctl_sk,
776                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
777                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
778                               &arg, arg.iov[0].iov_len);
779
780         ctl_sk->sk_mark = 0;
781         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
782         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
783         local_bh_enable();
784
785 #ifdef CONFIG_TCP_MD5SIG
786 out:
787         rcu_read_unlock();
788 #endif
789 }
790
791 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
792    outside socket context, is certainly ugly. What can I do?
793  */
794
795 static void tcp_v4_send_ack(const struct sock *sk,
796                             struct sk_buff *skb, u32 seq, u32 ack,
797                             u32 win, u32 tsval, u32 tsecr, int oif,
798                             struct tcp_md5sig_key *key,
799                             int reply_flags, u8 tos)
800 {
801         const struct tcphdr *th = tcp_hdr(skb);
802         struct {
803                 struct tcphdr th;
804                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
805 #ifdef CONFIG_TCP_MD5SIG
806                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
807 #endif
808                         ];
809         } rep;
810         struct net *net = sock_net(sk);
811         struct ip_reply_arg arg;
812         struct sock *ctl_sk;
813
814         memset(&rep.th, 0, sizeof(struct tcphdr));
815         memset(&arg, 0, sizeof(arg));
816
817         arg.iov[0].iov_base = (unsigned char *)&rep;
818         arg.iov[0].iov_len  = sizeof(rep.th);
819         if (tsecr) {
820                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
821                                    (TCPOPT_TIMESTAMP << 8) |
822                                    TCPOLEN_TIMESTAMP);
823                 rep.opt[1] = htonl(tsval);
824                 rep.opt[2] = htonl(tsecr);
825                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
826         }
827
828         /* Swap the send and the receive. */
829         rep.th.dest    = th->source;
830         rep.th.source  = th->dest;
831         rep.th.doff    = arg.iov[0].iov_len / 4;
832         rep.th.seq     = htonl(seq);
833         rep.th.ack_seq = htonl(ack);
834         rep.th.ack     = 1;
835         rep.th.window  = htons(win);
836
837 #ifdef CONFIG_TCP_MD5SIG
838         if (key) {
839                 int offset = (tsecr) ? 3 : 0;
840
841                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
842                                           (TCPOPT_NOP << 16) |
843                                           (TCPOPT_MD5SIG << 8) |
844                                           TCPOLEN_MD5SIG);
845                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
846                 rep.th.doff = arg.iov[0].iov_len/4;
847
848                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
849                                     key, ip_hdr(skb)->saddr,
850                                     ip_hdr(skb)->daddr, &rep.th);
851         }
852 #endif
853         arg.flags = reply_flags;
854         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
855                                       ip_hdr(skb)->saddr, /* XXX */
856                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
857         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
858         if (oif)
859                 arg.bound_dev_if = oif;
860         arg.tos = tos;
861         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
862         local_bh_disable();
863         ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
864         if (sk)
865                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
866                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
867         ip_send_unicast_reply(ctl_sk,
868                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
869                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
870                               &arg, arg.iov[0].iov_len);
871
872         ctl_sk->sk_mark = 0;
873         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
874         local_bh_enable();
875 }
876
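/* Answer a segment that matched a TIME-WAIT socket with a pure ACK built
 * from the timewait state (snd/rcv sequence numbers, scaled window and
 * timestamps).
 */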
877 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
878 {
879         struct inet_timewait_sock *tw = inet_twsk(sk);
880         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
881
882         tcp_v4_send_ack(sk, skb,
883                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
884                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
885                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
886                         tcptw->tw_ts_recent,
887                         tw->tw_bound_dev_if,
888                         tcp_twsk_md5_key(tcptw),
889                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
890                         tw->tw_tos
891                         );
892
893         inet_twsk_put(tw);
894 }
895
896 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
897                                   struct request_sock *req)
898 {
899         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
900          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
901          */
902         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
903                                              tcp_sk(sk)->snd_nxt;
904
905         /* RFC 7323 2.3
906          * The window field (SEG.WND) of every outgoing segment, with the
907          * exception of <SYN> segments, MUST be right-shifted by
908          * Rcv.Wind.Shift bits:
909          */
910         tcp_v4_send_ack(sk, skb, seq,
911                         tcp_rsk(req)->rcv_nxt,
912                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
913                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
914                         req->ts_recent,
915                         0,
916                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
917                                           AF_INET),
918                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
919                         ip_hdr(skb)->tos);
920 }
921
922 /*
923  *      Send a SYN-ACK after having received a SYN.
924  *      This still operates on a request_sock only, not on a big
925  *      socket.
926  */
927 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
928                               struct flowi *fl,
929                               struct request_sock *req,
930                               struct tcp_fastopen_cookie *foc,
931                               enum tcp_synack_type synack_type)
932 {
933         const struct inet_request_sock *ireq = inet_rsk(req);
934         struct flowi4 fl4;
935         int err = -1;
936         struct sk_buff *skb;
937
938         /* First, grab a route. */
939         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
940                 return -1;
941
942         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
943
944         if (skb) {
945                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
946
947                 rcu_read_lock();
948                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
949                                             ireq->ir_rmt_addr,
950                                             rcu_dereference(ireq->ireq_opt));
951                 rcu_read_unlock();
952                 err = net_xmit_eval(err);
953         }
954
955         return err;
956 }
957
958 /*
959  *      IPv4 request_sock destructor.
960  */
961 static void tcp_v4_reqsk_destructor(struct request_sock *req)
962 {
963         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
964 }
965
966 #ifdef CONFIG_TCP_MD5SIG
967 /*
968  * RFC2385 MD5 checksumming requires a mapping of
969  * IP address->MD5 Key.
970  * We need to maintain these in the sk structure.
971  */
972
973 /* Find the Key structure for an address.  */
974 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
975                                          const union tcp_md5_addr *addr,
976                                          int family)
977 {
978         const struct tcp_sock *tp = tcp_sk(sk);
979         struct tcp_md5sig_key *key;
980         const struct tcp_md5sig_info *md5sig;
981         __be32 mask;
982         struct tcp_md5sig_key *best_match = NULL;
983         bool match;
984
985         /* caller either holds rcu_read_lock() or socket lock */
986         md5sig = rcu_dereference_check(tp->md5sig_info,
987                                        lockdep_sock_is_held(sk));
988         if (!md5sig)
989                 return NULL;
990
991         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
992                 if (key->family != family)
993                         continue;
994
995                 if (family == AF_INET) {
996                         mask = inet_make_mask(key->prefixlen);
997                         match = (key->addr.a4.s_addr & mask) ==
998                                 (addr->a4.s_addr & mask);
999 #if IS_ENABLED(CONFIG_IPV6)
1000                 } else if (family == AF_INET6) {
1001                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1002                                                   key->prefixlen);
1003 #endif
1004                 } else {
1005                         match = false;
1006                 }
1007
1008                 if (match && (!best_match ||
1009                               key->prefixlen > best_match->prefixlen))
1010                         best_match = key;
1011         }
1012         return best_match;
1013 }
1014 EXPORT_SYMBOL(tcp_md5_do_lookup);
1015
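/* Exact-match lookup: both the address and the prefix length must match. */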
1016 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1017                                                       const union tcp_md5_addr *addr,
1018                                                       int family, u8 prefixlen)
1019 {
1020         const struct tcp_sock *tp = tcp_sk(sk);
1021         struct tcp_md5sig_key *key;
1022         unsigned int size = sizeof(struct in_addr);
1023         const struct tcp_md5sig_info *md5sig;
1024
1025         /* caller either holds rcu_read_lock() or socket lock */
1026         md5sig = rcu_dereference_check(tp->md5sig_info,
1027                                        lockdep_sock_is_held(sk));
1028         if (!md5sig)
1029                 return NULL;
1030 #if IS_ENABLED(CONFIG_IPV6)
1031         if (family == AF_INET6)
1032                 size = sizeof(struct in6_addr);
1033 #endif
1034         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1035                 if (key->family != family)
1036                         continue;
1037                 if (!memcmp(&key->addr, addr, size) &&
1038                     key->prefixlen == prefixlen)
1039                         return key;
1040         }
1041         return NULL;
1042 }
1043
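/* Look up the MD5 key to use for segments sent to addr_sk's peer address. */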
1044 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1045                                          const struct sock *addr_sk)
1046 {
1047         const union tcp_md5_addr *addr;
1048
1049         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1050         return tcp_md5_do_lookup(sk, addr, AF_INET);
1051 }
1052 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1053
1054 /* This can be called on a newly created socket, from other files */
1055 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1056                    int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1057                    gfp_t gfp)
1058 {
1059         /* Add Key to the list */
1060         struct tcp_md5sig_key *key;
1061         struct tcp_sock *tp = tcp_sk(sk);
1062         struct tcp_md5sig_info *md5sig;
1063
1064         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1065         if (key) {
1066                 /* Pre-existing entry - just update that one. */
1067                 memcpy(key->key, newkey, newkeylen);
1068                 key->keylen = newkeylen;
1069                 return 0;
1070         }
1071
1072         md5sig = rcu_dereference_protected(tp->md5sig_info,
1073                                            lockdep_sock_is_held(sk));
1074         if (!md5sig) {
1075                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1076                 if (!md5sig)
1077                         return -ENOMEM;
1078
1079                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1080                 INIT_HLIST_HEAD(&md5sig->head);
1081                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1082         }
1083
1084         key = sock_kmalloc(sk, sizeof(*key), gfp);
1085         if (!key)
1086                 return -ENOMEM;
1087         if (!tcp_alloc_md5sig_pool()) {
1088                 sock_kfree_s(sk, key, sizeof(*key));
1089                 return -ENOMEM;
1090         }
1091
1092         memcpy(key->key, newkey, newkeylen);
1093         key->keylen = newkeylen;
1094         key->family = family;
1095         key->prefixlen = prefixlen;
1096         memcpy(&key->addr, addr,
1097                (family == AF_INET6) ? sizeof(struct in6_addr) :
1098                                       sizeof(struct in_addr));
1099         hlist_add_head_rcu(&key->node, &md5sig->head);
1100         return 0;
1101 }
1102 EXPORT_SYMBOL(tcp_md5_do_add);
1103
1104 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1105                    u8 prefixlen)
1106 {
1107         struct tcp_md5sig_key *key;
1108
1109         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1110         if (!key)
1111                 return -ENOENT;
1112         hlist_del_rcu(&key->node);
1113         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1114         kfree_rcu(key, rcu);
1115         return 0;
1116 }
1117 EXPORT_SYMBOL(tcp_md5_do_del);
1118
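/* Unlink and free every MD5 key attached to this socket. */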
1119 static void tcp_clear_md5_list(struct sock *sk)
1120 {
1121         struct tcp_sock *tp = tcp_sk(sk);
1122         struct tcp_md5sig_key *key;
1123         struct hlist_node *n;
1124         struct tcp_md5sig_info *md5sig;
1125
1126         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1127
1128         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1129                 hlist_del_rcu(&key->node);
1130                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1131                 kfree_rcu(key, rcu);
1132         }
1133 }
1134
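/* TCP_MD5SIG/TCP_MD5SIG_EXT setsockopt() handler: validate the request
 * and add, replace or delete the key for the given peer address/prefix.
 */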
1135 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1136                                  char __user *optval, int optlen)
1137 {
1138         struct tcp_md5sig cmd;
1139         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1140         u8 prefixlen = 32;
1141
1142         if (optlen < sizeof(cmd))
1143                 return -EINVAL;
1144
1145         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1146                 return -EFAULT;
1147
1148         if (sin->sin_family != AF_INET)
1149                 return -EINVAL;
1150
1151         if (optname == TCP_MD5SIG_EXT &&
1152             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1153                 prefixlen = cmd.tcpm_prefixlen;
1154                 if (prefixlen > 32)
1155                         return -EINVAL;
1156         }
1157
1158         if (!cmd.tcpm_keylen)
1159                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1160                                       AF_INET, prefixlen);
1161
1162         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1163                 return -EINVAL;
1164
1165         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1166                               AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1167                               GFP_KERNEL);
1168 }
1169
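/* Hash the IPv4 pseudo-header followed by a copy of the TCP header with
 * its checksum field zeroed, as required by RFC 2385.
 */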
1170 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1171                                    __be32 daddr, __be32 saddr,
1172                                    const struct tcphdr *th, int nbytes)
1173 {
1174         struct tcp4_pseudohdr *bp;
1175         struct scatterlist sg;
1176         struct tcphdr *_th;
1177
1178         bp = hp->scratch;
1179         bp->saddr = saddr;
1180         bp->daddr = daddr;
1181         bp->pad = 0;
1182         bp->protocol = IPPROTO_TCP;
1183         bp->len = cpu_to_be16(nbytes);
1184
1185         _th = (struct tcphdr *)(bp + 1);
1186         memcpy(_th, th, sizeof(*th));
1187         _th->check = 0;
1188
1189         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1190         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1191                                 sizeof(*bp) + sizeof(*th));
1192         return crypto_ahash_update(hp->md5_req);
1193 }
1194
1195 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1196                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1197 {
1198         struct tcp_md5sig_pool *hp;
1199         struct ahash_request *req;
1200
1201         hp = tcp_get_md5sig_pool();
1202         if (!hp)
1203                 goto clear_hash_noput;
1204         req = hp->md5_req;
1205
1206         if (crypto_ahash_init(req))
1207                 goto clear_hash;
1208         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1209                 goto clear_hash;
1210         if (tcp_md5_hash_key(hp, key))
1211                 goto clear_hash;
1212         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1213         if (crypto_ahash_final(req))
1214                 goto clear_hash;
1215
1216         tcp_put_md5sig_pool();
1217         return 0;
1218
1219 clear_hash:
1220         tcp_put_md5sig_pool();
1221 clear_hash_noput:
1222         memset(md5_hash, 0, 16);
1223         return 1;
1224 }
1225
1226 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1227                         const struct sock *sk,
1228                         const struct sk_buff *skb)
1229 {
1230         struct tcp_md5sig_pool *hp;
1231         struct ahash_request *req;
1232         const struct tcphdr *th = tcp_hdr(skb);
1233         __be32 saddr, daddr;
1234
1235         if (sk) { /* valid for establish/request sockets */
1236                 saddr = sk->sk_rcv_saddr;
1237                 daddr = sk->sk_daddr;
1238         } else {
1239                 const struct iphdr *iph = ip_hdr(skb);
1240                 saddr = iph->saddr;
1241                 daddr = iph->daddr;
1242         }
1243
1244         hp = tcp_get_md5sig_pool();
1245         if (!hp)
1246                 goto clear_hash_noput;
1247         req = hp->md5_req;
1248
1249         if (crypto_ahash_init(req))
1250                 goto clear_hash;
1251
1252         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1253                 goto clear_hash;
1254         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1255                 goto clear_hash;
1256         if (tcp_md5_hash_key(hp, key))
1257                 goto clear_hash;
1258         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1259         if (crypto_ahash_final(req))
1260                 goto clear_hash;
1261
1262         tcp_put_md5sig_pool();
1263         return 0;
1264
1265 clear_hash:
1266         tcp_put_md5sig_pool();
1267 clear_hash_noput:
1268         memset(md5_hash, 0, 16);
1269         return 1;
1270 }
1271 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1272
1273 #endif
1274
1275 /* Called with rcu_read_lock() */
1276 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1277                                     const struct sk_buff *skb)
1278 {
1279 #ifdef CONFIG_TCP_MD5SIG
1280         /*
1281          * This gets called for each TCP segment that arrives
1282          * so we want to be efficient.
1283          * We have 3 drop cases:
1284          * o No MD5 hash and one expected.
1285          * o MD5 hash and we're not expecting one.
1286          * o MD5 hash and it's wrong.
1287          */
1288         const __u8 *hash_location = NULL;
1289         struct tcp_md5sig_key *hash_expected;
1290         const struct iphdr *iph = ip_hdr(skb);
1291         const struct tcphdr *th = tcp_hdr(skb);
1292         int genhash;
1293         unsigned char newhash[16];
1294
1295         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1296                                           AF_INET);
1297         hash_location = tcp_parse_md5sig_option(th);
1298
1299         /* We've parsed the options - do we have a hash? */
1300         if (!hash_expected && !hash_location)
1301                 return false;
1302
1303         if (hash_expected && !hash_location) {
1304                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1305                 return true;
1306         }
1307
1308         if (!hash_expected && hash_location) {
1309                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1310                 return true;
1311         }
1312
1313         /* Okay, so we have both hash_expected and hash_location -
1314          * we need to calculate the checksum.
1315          */
1316         genhash = tcp_v4_md5_hash_skb(newhash,
1317                                       hash_expected,
1318                                       NULL, skb);
1319
1320         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1321                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1322                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1323                                      &iph->saddr, ntohs(th->source),
1324                                      &iph->daddr, ntohs(th->dest),
1325                                      genhash ? " tcp_v4_calc_md5_hash failed"
1326                                      : "");
1327                 return true;
1328         }
1329         return false;
1330 #endif
1331         return false;
1332 }
1333
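/* Record the addresses (from the listener's point of view) and any IP
 * options of the incoming SYN in the new request sock.
 */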
1334 static void tcp_v4_init_req(struct request_sock *req,
1335                             const struct sock *sk_listener,
1336                             struct sk_buff *skb)
1337 {
1338         struct inet_request_sock *ireq = inet_rsk(req);
1339         struct net *net = sock_net(sk_listener);
1340
1341         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1342         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1343         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1344 }
1345
1346 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1347                                           struct flowi *fl,
1348                                           const struct request_sock *req)
1349 {
1350         return inet_csk_route_req(sk, &fl->u.ip4, req);
1351 }
1352
1353 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1354         .family         =       PF_INET,
1355         .obj_size       =       sizeof(struct tcp_request_sock),
1356         .rtx_syn_ack    =       tcp_rtx_synack,
1357         .send_ack       =       tcp_v4_reqsk_send_ack,
1358         .destructor     =       tcp_v4_reqsk_destructor,
1359         .send_reset     =       tcp_v4_send_reset,
1360         .syn_ack_timeout =      tcp_syn_ack_timeout,
1361 };
1362
1363 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1364         .mss_clamp      =       TCP_MSS_DEFAULT,
1365 #ifdef CONFIG_TCP_MD5SIG
1366         .req_md5_lookup =       tcp_v4_md5_lookup,
1367         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1368 #endif
1369         .init_req       =       tcp_v4_init_req,
1370 #ifdef CONFIG_SYN_COOKIES
1371         .cookie_init_seq =      cookie_v4_init_sequence,
1372 #endif
1373         .route_req      =       tcp_v4_route_req,
1374         .init_seq       =       tcp_v4_init_seq,
1375         .init_ts_off    =       tcp_v4_init_ts_off,
1376         .send_synack    =       tcp_v4_send_synack,
1377 };
1378
1379 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1380 {
1381         /* Never answer SYNs sent to broadcast or multicast addresses */
1382         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1383                 goto drop;
1384
1385         return tcp_conn_request(&tcp_request_sock_ops,
1386                                 &tcp_request_sock_ipv4_ops, sk, skb);
1387
1388 drop:
1389         tcp_listendrop(sk);
1390         return 0;
1391 }
1392 EXPORT_SYMBOL(tcp_v4_conn_request);
1393
1394
1395 /*
1396  * The three way handshake has completed - we got a valid synack -
1397  * now create the new socket.
1398  */
1399 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1400                                   struct request_sock *req,
1401                                   struct dst_entry *dst,
1402                                   struct request_sock *req_unhash,
1403                                   bool *own_req)
1404 {
1405         struct inet_request_sock *ireq;
1406         struct inet_sock *newinet;
1407         struct tcp_sock *newtp;
1408         struct sock *newsk;
1409 #ifdef CONFIG_TCP_MD5SIG
1410         struct tcp_md5sig_key *key;
1411 #endif
1412         struct ip_options_rcu *inet_opt;
1413
1414         if (sk_acceptq_is_full(sk))
1415                 goto exit_overflow;
1416
1417         newsk = tcp_create_openreq_child(sk, req, skb);
1418         if (!newsk)
1419                 goto exit_nonewsk;
1420
1421         newsk->sk_gso_type = SKB_GSO_TCPV4;
1422         inet_sk_rx_dst_set(newsk, skb);
1423
1424         newtp                 = tcp_sk(newsk);
1425         newinet               = inet_sk(newsk);
1426         ireq                  = inet_rsk(req);
1427         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1428         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1429         newsk->sk_bound_dev_if = ireq->ir_iif;
1430         newinet->inet_saddr   = ireq->ir_loc_addr;
1431         inet_opt              = rcu_dereference(ireq->ireq_opt);
1432         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1433         newinet->mc_index     = inet_iif(skb);
1434         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1435         newinet->rcv_tos      = ip_hdr(skb)->tos;
1436         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1437         if (inet_opt)
1438                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1439         newinet->inet_id = newtp->write_seq ^ jiffies;
1440
1441         if (!dst) {
1442                 dst = inet_csk_route_child_sock(sk, newsk, req);
1443                 if (!dst)
1444                         goto put_and_exit;
1445         } else {
1446                 /* syncookie case : see end of cookie_v4_check() */
1447         }
1448         sk_setup_caps(newsk, dst);
1449
1450         tcp_ca_openreq_child(newsk, dst);
1451
1452         tcp_sync_mss(newsk, dst_mtu(dst));
1453         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1454
1455         tcp_initialize_rcv_mss(newsk);
1456
1457 #ifdef CONFIG_TCP_MD5SIG
1458         /* Copy over the MD5 key from the original socket */
1459         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1460                                 AF_INET);
1461         if (key) {
1462                 /*
1463                  * We're using one, so create a matching key
1464                  * on the newsk structure. If we fail to get
1465                  * memory, then we end up not copying the key
1466                  * across. Shucks.
1467                  */
1468                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1469                                AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1470                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1471         }
1472 #endif
1473
1474         if (__inet_inherit_port(sk, newsk) < 0)
1475                 goto put_and_exit;
1476         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1477         if (likely(*own_req)) {
1478                 tcp_move_syn(newtp, req);
1479                 ireq->ireq_opt = NULL;
1480         } else {
1481                 newinet->inet_opt = NULL;
1482         }
1483         return newsk;
1484
1485 exit_overflow:
1486         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1487 exit_nonewsk:
1488         dst_release(dst);
1489 exit:
1490         tcp_listendrop(sk);
1491         return NULL;
1492 put_and_exit:
1493         newinet->inet_opt = NULL;
1494         inet_csk_prepare_forced_close(newsk);
1495         tcp_done(newsk);
1496         goto exit;
1497 }
1498 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1499
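/* If the listener's SYN queue overflowed and syncookies are in use, the
 * returning ACK has no request_sock to match against; cookie_v4_check()
 * validates the cookie encoded in the sequence number and rebuilds a
 * request_sock from it.  Plain SYNs fall through to tcp_conn_request().
 */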
1500 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1501 {
1502 #ifdef CONFIG_SYN_COOKIES
1503         const struct tcphdr *th = tcp_hdr(skb);
1504
1505         if (!th->syn)
1506                 sk = cookie_v4_check(sk, skb);
1507 #endif
1508         return sk;
1509 }
1510
1511 /* The socket must have its spinlock held when we get
1512  * here, unless it is a TCP_LISTEN socket.
1513  *
1514  * We have a potential double-lock case here, so even when
1515  * doing backlog processing we use the BH locking scheme.
1516  * This is because we cannot sleep with the original spinlock
1517  * held.
1518  */
1519 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1520 {
1521         struct sock *rsk;
1522
1523         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1524                 struct dst_entry *dst = sk->sk_rx_dst;
1525
1526                 sock_rps_save_rxhash(sk, skb);
1527                 sk_mark_napi_id(sk, skb);
1528                 if (dst) {
1529                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1530                             !dst->ops->check(dst, 0)) {
1531                                 dst_release(dst);
1532                                 sk->sk_rx_dst = NULL;
1533                         }
1534                 }
1535                 tcp_rcv_established(sk, skb);
1536                 return 0;
1537         }
1538
1539         if (tcp_checksum_complete(skb))
1540                 goto csum_err;
1541
1542         if (sk->sk_state == TCP_LISTEN) {
1543                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1544
1545                 if (!nsk)
1546                         goto discard;
1547                 if (nsk != sk) {
1548                         if (tcp_child_process(sk, nsk, skb)) {
1549                                 rsk = nsk;
1550                                 goto reset;
1551                         }
1552                         return 0;
1553                 }
1554         } else
1555                 sock_rps_save_rxhash(sk, skb);
1556
1557         if (tcp_rcv_state_process(sk, skb)) {
1558                 rsk = sk;
1559                 goto reset;
1560         }
1561         return 0;
1562
1563 reset:
1564         tcp_v4_send_reset(rsk, skb);
1565 discard:
1566         kfree_skb(skb);
1567         /* Be careful here. If this function gets more complicated and
1568          * gcc suffers from register pressure on the x86, sk (in %ebx)
1569          * might be destroyed here. This current version compiles correctly,
1570          * but you have been warned.
1571          */
1572         return 0;
1573
1574 csum_err:
1575         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1576         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1577         goto discard;
1578 }
1579 EXPORT_SYMBOL(tcp_v4_do_rcv);
1580
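/* Called early in the IPv4 receive path (before routing) when early demux
 * is enabled: look up the packet's 4-tuple in the established hash and, on
 * a hit, cache the socket and its input route on the skb so the normal
 * receive path can skip both lookups.
 */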
1581 int tcp_v4_early_demux(struct sk_buff *skb)
1582 {
1583         const struct iphdr *iph;
1584         const struct tcphdr *th;
1585         struct sock *sk;
1586
1587         if (skb->pkt_type != PACKET_HOST)
1588                 return 0;
1589
1590         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1591                 return 0;
1592
1593         iph = ip_hdr(skb);
1594         th = tcp_hdr(skb);
1595
1596         if (th->doff < sizeof(struct tcphdr) / 4)
1597                 return 0;
1598
1599         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1600                                        iph->saddr, th->source,
1601                                        iph->daddr, ntohs(th->dest),
1602                                        skb->skb_iif, inet_sdif(skb));
1603         if (sk) {
1604                 skb->sk = sk;
1605                 skb->destructor = sock_edemux;
1606                 if (sk_fullsock(sk)) {
1607                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1608
1609                         if (dst)
1610                                 dst = dst_check(dst, 0);
1611                         if (dst &&
1612                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1613                                 skb_dst_set_noref(skb, dst);
1614                 }
1615         }
1616         return 0;
1617 }
1618
1619 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1620 {
1621         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1622
1623         /* Only socket owner can try to collapse/prune rx queues
1624          * to reduce memory overhead, so add a little headroom here.
1625          * Only a few socket backlogs are likely to be non-empty at once.
1626          */
1627         limit += 64*1024;
1628
1629         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1630          * we can fix skb->truesize to its real value to avoid future drops.
1631          * This is valid because skb is not yet charged to the socket.
1632          * It has been noticed that pure SACK packets were sometimes dropped
1633          * (when built by drivers without the copybreak feature).
1634          */
1635         skb_condense(skb);
1636
1637         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1638                 bh_unlock_sock(sk);
1639                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1640                 return true;
1641         }
1642         return false;
1643 }
1644 EXPORT_SYMBOL(tcp_add_backlog);
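/* Rough numbers, assuming the stock defaults of tcp_rmem[1] = 87380 and
 * tcp_wmem[1] = 16384: a freshly created socket may queue about
 * 87380 + 16384 + 64*1024 = 169300 bytes (~165 KiB) of backlog before
 * sk_add_backlog() starts refusing skbs and TCPBACKLOGDROP is incremented.
 */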
1645
1646 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1647 {
1648         struct tcphdr *th = (struct tcphdr *)skb->data;
1649         unsigned int eaten = skb->len;
1650         int err;
1651
1652         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1653         if (!err) {
1654                 eaten -= skb->len;
1655                 TCP_SKB_CB(skb)->end_seq -= eaten;
1656         }
1657         return err;
1658 }
1659 EXPORT_SYMBOL(tcp_filter);
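/* sk_filter_trim_cap() runs whatever BPF program is attached to the socket
 * and may trim the skb, which is why end_seq is adjusted above.  A hedged
 * user-space sketch attaching a trivial accept-all classic BPF filter (the
 * program itself is just a placeholder):
 *
 *	struct sock_filter insns[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0xffff },	// accept up to 64K of each packet
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = insns };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 *
 * A filter returning fewer bytes than the segment length causes the trim
 * handled by tcp_filter() above.
 */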
1660
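/* Undo tcp_v4_fill_cb(): move the original IP control block, saved in
 * TCP_SKB_CB(skb)->header.h4, back to the front of skb->cb (IPCB) before
 * the skb is looked up again or handed over to another socket.
 */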
1661 static void tcp_v4_restore_cb(struct sk_buff *skb)
1662 {
1663         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1664                 sizeof(struct inet_skb_parm));
1665 }
1666
1667 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1668                            const struct tcphdr *th)
1669 {
1670         /* This is tricky : we move IPCB to its correct location inside TCP_SKB_CB().
1671          * barrier() makes sure the compiler won't play fool^Waliasing games.
1672          */
1673         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1674                 sizeof(struct inet_skb_parm));
1675         barrier();
1676
1677         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1678         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1679                                     skb->len - th->doff * 4);
1680         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1681         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1682         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1683         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1684         TCP_SKB_CB(skb)->sacked  = 0;
1685         TCP_SKB_CB(skb)->has_rxtstamp =
1686                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1687 }
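/* Worked example for the end_seq computation above: a bare SYN carries no
 * payload but occupies one sequence number, so end_seq = seq + 1 + 0 + 0;
 * a 100-byte data segment with FIN set gives end_seq = seq + 0 + 1 + 100.
 */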
1688
1689 /*
1690  *      From tcp_input.c
1691  */
1692
1693 int tcp_v4_rcv(struct sk_buff *skb)
1694 {
1695         struct net *net = dev_net(skb->dev);
1696         int sdif = inet_sdif(skb);
1697         const struct iphdr *iph;
1698         const struct tcphdr *th;
1699         bool refcounted;
1700         struct sock *sk;
1701         int ret;
1702
1703         if (skb->pkt_type != PACKET_HOST)
1704                 goto discard_it;
1705
1706         /* Count it even if it's bad */
1707         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1708
1709         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1710                 goto discard_it;
1711
1712         th = (const struct tcphdr *)skb->data;
1713
1714         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1715                 goto bad_packet;
1716         if (!pskb_may_pull(skb, th->doff * 4))
1717                 goto discard_it;
1718
1719         /* An explanation is required here, I think.
1720          * Packet length and doff are validated by header prediction,
1721          * provided the case of th->doff == 0 is eliminated.
1722          * So, we defer the checks. */
1723
1724         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1725                 goto csum_error;
1726
1727         th = (const struct tcphdr *)skb->data;
1728         iph = ip_hdr(skb);
1729 lookup:
1730         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1731                                th->dest, sdif, &refcounted);
1732         if (!sk)
1733                 goto no_tcp_socket;
1734
1735 process:
1736         if (sk->sk_state == TCP_TIME_WAIT)
1737                 goto do_time_wait;
1738
1739         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1740                 struct request_sock *req = inet_reqsk(sk);
1741                 bool req_stolen = false;
1742                 struct sock *nsk;
1743
1744                 sk = req->rsk_listener;
1745                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1746                         sk_drops_add(sk, skb);
1747                         reqsk_put(req);
1748                         goto discard_it;
1749                 }
1750                 if (tcp_checksum_complete(skb)) {
1751                         reqsk_put(req);
1752                         goto csum_error;
1753                 }
1754                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1755                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1756                         goto lookup;
1757                 }
1758                 /* We own a reference on the listener, increase it again
1759                  * as we might lose it too soon.
1760                  */
1761                 sock_hold(sk);
1762                 refcounted = true;
1763                 nsk = NULL;
1764                 if (!tcp_filter(sk, skb)) {
1765                         th = (const struct tcphdr *)skb->data;
1766                         iph = ip_hdr(skb);
1767                         tcp_v4_fill_cb(skb, iph, th);
1768                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1769                 }
1770                 if (!nsk) {
1771                         reqsk_put(req);
1772                         if (req_stolen) {
1773                                 /* Another cpu got exclusive access to req
1774                                  * and created a full blown socket.
1775                                  * Try to feed this packet to this socket
1776                                  * instead of discarding it.
1777                                  */
1778                                 tcp_v4_restore_cb(skb);
1779                                 sock_put(sk);
1780                                 goto lookup;
1781                         }
1782                         goto discard_and_relse;
1783                 }
1784                 if (nsk == sk) {
1785                         reqsk_put(req);
1786                         tcp_v4_restore_cb(skb);
1787                 } else if (tcp_child_process(sk, nsk, skb)) {
1788                         tcp_v4_send_reset(nsk, skb);
1789                         goto discard_and_relse;
1790                 } else {
1791                         sock_put(sk);
1792                         return 0;
1793                 }
1794         }
1795         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1796                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1797                 goto discard_and_relse;
1798         }
1799
1800         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1801                 goto discard_and_relse;
1802
1803         if (tcp_v4_inbound_md5_hash(sk, skb))
1804                 goto discard_and_relse;
1805
1806         nf_reset(skb);
1807
1808         if (tcp_filter(sk, skb))
1809                 goto discard_and_relse;
1810         th = (const struct tcphdr *)skb->data;
1811         iph = ip_hdr(skb);
1812         tcp_v4_fill_cb(skb, iph, th);
1813
1814         skb->dev = NULL;
1815
1816         if (sk->sk_state == TCP_LISTEN) {
1817                 ret = tcp_v4_do_rcv(sk, skb);
1818                 goto put_and_return;
1819         }
1820
1821         sk_incoming_cpu_update(sk);
1822
1823         bh_lock_sock_nested(sk);
1824         tcp_segs_in(tcp_sk(sk), skb);
1825         ret = 0;
1826         if (!sock_owned_by_user(sk)) {
1827                 ret = tcp_v4_do_rcv(sk, skb);
1828         } else if (tcp_add_backlog(sk, skb)) {
1829                 goto discard_and_relse;
1830         }
1831         bh_unlock_sock(sk);
1832
1833 put_and_return:
1834         if (refcounted)
1835                 sock_put(sk);
1836
1837         return ret;
1838
1839 no_tcp_socket:
1840         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1841                 goto discard_it;
1842
1843         tcp_v4_fill_cb(skb, iph, th);
1844
1845         if (tcp_checksum_complete(skb)) {
1846 csum_error:
1847                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1848 bad_packet:
1849                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1850         } else {
1851                 tcp_v4_send_reset(NULL, skb);
1852         }
1853
1854 discard_it:
1855         /* Discard frame. */
1856         kfree_skb(skb);
1857         return 0;
1858
1859 discard_and_relse:
1860         sk_drops_add(sk, skb);
1861         if (refcounted)
1862                 sock_put(sk);
1863         goto discard_it;
1864
1865 do_time_wait:
1866         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1867                 inet_twsk_put(inet_twsk(sk));
1868                 goto discard_it;
1869         }
1870
1871         tcp_v4_fill_cb(skb, iph, th);
1872
1873         if (tcp_checksum_complete(skb)) {
1874                 inet_twsk_put(inet_twsk(sk));
1875                 goto csum_error;
1876         }
1877         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1878         case TCP_TW_SYN: {
1879                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1880                                                         &tcp_hashinfo, skb,
1881                                                         __tcp_hdrlen(th),
1882                                                         iph->saddr, th->source,
1883                                                         iph->daddr, th->dest,
1884                                                         inet_iif(skb),
1885                                                         sdif);
1886                 if (sk2) {
1887                         inet_twsk_deschedule_put(inet_twsk(sk));
1888                         sk = sk2;
1889                         tcp_v4_restore_cb(skb);
1890                         refcounted = false;
1891                         goto process;
1892                 }
1893         }
1894                 /* to ACK */
1895                 /* fall through */
1896         case TCP_TW_ACK:
1897                 tcp_v4_timewait_ack(sk, skb);
1898                 break;
1899         case TCP_TW_RST:
1900                 tcp_v4_send_reset(sk, skb);
1901                 inet_twsk_deschedule_put(inet_twsk(sk));
1902                 goto discard_it;
1903         case TCP_TW_SUCCESS:;
1904         }
1905         goto discard_it;
1906 }
1907
1908 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1909         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1910         .twsk_unique    = tcp_twsk_unique,
1911         .twsk_destructor= tcp_twsk_destructor,
1912 };
1913
1914 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1915 {
1916         struct dst_entry *dst = skb_dst(skb);
1917
1918         if (dst && dst_hold_safe(dst)) {
1919                 sk->sk_rx_dst = dst;
1920                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1921         }
1922 }
1923 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1924
1925 const struct inet_connection_sock_af_ops ipv4_specific = {
1926         .queue_xmit        = ip_queue_xmit,
1927         .send_check        = tcp_v4_send_check,
1928         .rebuild_header    = inet_sk_rebuild_header,
1929         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1930         .conn_request      = tcp_v4_conn_request,
1931         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1932         .net_header_len    = sizeof(struct iphdr),
1933         .setsockopt        = ip_setsockopt,
1934         .getsockopt        = ip_getsockopt,
1935         .addr2sockaddr     = inet_csk_addr2sockaddr,
1936         .sockaddr_len      = sizeof(struct sockaddr_in),
1937 #ifdef CONFIG_COMPAT
1938         .compat_setsockopt = compat_ip_setsockopt,
1939         .compat_getsockopt = compat_ip_getsockopt,
1940 #endif
1941         .mtu_reduced       = tcp_v4_mtu_reduced,
1942 };
1943 EXPORT_SYMBOL(ipv4_specific);
1944
1945 #ifdef CONFIG_TCP_MD5SIG
1946 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1947         .md5_lookup             = tcp_v4_md5_lookup,
1948         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1949         .md5_parse              = tcp_v4_parse_md5_keys,
1950 };
1951 #endif
1952
1953 /* NOTE: A lot of fields are zeroed explicitly by the call to
1954  *       sk_alloc(), so they need not be initialized here.
1955  */
1956 static int tcp_v4_init_sock(struct sock *sk)
1957 {
1958         struct inet_connection_sock *icsk = inet_csk(sk);
1959
1960         tcp_init_sock(sk);
1961
1962         icsk->icsk_af_ops = &ipv4_specific;
1963
1964 #ifdef CONFIG_TCP_MD5SIG
1965         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1966 #endif
1967
1968         return 0;
1969 }
1970
1971 void tcp_v4_destroy_sock(struct sock *sk)
1972 {
1973         struct tcp_sock *tp = tcp_sk(sk);
1974
1975         trace_tcp_destroy_sock(sk);
1976
1977         tcp_clear_xmit_timers(sk);
1978
1979         tcp_cleanup_congestion_control(sk);
1980
1981         tcp_cleanup_ulp(sk);
1982
1983         /* Clean up the write buffer. */
1984         tcp_write_queue_purge(sk);
1985
1986         /* Check if we want to disable active TFO */
1987         tcp_fastopen_active_disable_ofo_check(sk);
1988
1989         /* Cleans up our, hopefully empty, out_of_order_queue. */
1990         skb_rbtree_purge(&tp->out_of_order_queue);
1991
1992 #ifdef CONFIG_TCP_MD5SIG
1993         /* Clean up the MD5 key list, if any */
1994         if (tp->md5sig_info) {
1995                 tcp_clear_md5_list(sk);
1996                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1997                 tp->md5sig_info = NULL;
1998         }
1999 #endif
2000
2001         /* Clean up a referenced TCP bind bucket. */
2002         if (inet_csk(sk)->icsk_bind_hash)
2003                 inet_put_port(sk);
2004
2005         BUG_ON(tp->fastopen_rsk);
2006
2007         /* If socket is aborted during connect operation */
2008         tcp_free_fastopen_req(tp);
2009         tcp_fastopen_destroy_cipher(sk);
2010         tcp_saved_syn_free(tp);
2011
2012         sk_sockets_allocated_dec(sk);
2013 }
2014 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2015
2016 #ifdef CONFIG_PROC_FS
2017 /* Proc filesystem TCP sock list dumping. */
2018
2019 /*
2020  * Get the next listener socket following cur.  If cur is NULL, get the first
2021  * socket starting from the bucket given in st->bucket; when st->bucket is zero the
2022  * very first socket in the hash table is returned.
2023  */
2024 static void *listening_get_next(struct seq_file *seq, void *cur)
2025 {
2026         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2027         struct tcp_iter_state *st = seq->private;
2028         struct net *net = seq_file_net(seq);
2029         struct inet_listen_hashbucket *ilb;
2030         struct sock *sk = cur;
2031
2032         if (!sk) {
2033 get_head:
2034                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2035                 spin_lock(&ilb->lock);
2036                 sk = sk_head(&ilb->head);
2037                 st->offset = 0;
2038                 goto get_sk;
2039         }
2040         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2041         ++st->num;
2042         ++st->offset;
2043
2044         sk = sk_next(sk);
2045 get_sk:
2046         sk_for_each_from(sk) {
2047                 if (!net_eq(sock_net(sk), net))
2048                         continue;
2049                 if (sk->sk_family == afinfo->family)
2050                         return sk;
2051         }
2052         spin_unlock(&ilb->lock);
2053         st->offset = 0;
2054         if (++st->bucket < INET_LHTABLE_SIZE)
2055                 goto get_head;
2056         return NULL;
2057 }
2058
2059 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2060 {
2061         struct tcp_iter_state *st = seq->private;
2062         void *rc;
2063
2064         st->bucket = 0;
2065         st->offset = 0;
2066         rc = listening_get_next(seq, NULL);
2067
2068         while (rc && *pos) {
2069                 rc = listening_get_next(seq, rc);
2070                 --*pos;
2071         }
2072         return rc;
2073 }
2074
2075 static inline bool empty_bucket(const struct tcp_iter_state *st)
2076 {
2077         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2078 }
2079
2080 /*
2081  * Get first established socket starting from bucket given in st->bucket.
2082  * If st->bucket is zero, the very first socket in the hash is returned.
2083  */
2084 static void *established_get_first(struct seq_file *seq)
2085 {
2086         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2087         struct tcp_iter_state *st = seq->private;
2088         struct net *net = seq_file_net(seq);
2089         void *rc = NULL;
2090
2091         st->offset = 0;
2092         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2093                 struct sock *sk;
2094                 struct hlist_nulls_node *node;
2095                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2096
2097                 /* Lockless fast path for the common case of empty buckets */
2098                 if (empty_bucket(st))
2099                         continue;
2100
2101                 spin_lock_bh(lock);
2102                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2103                         if (sk->sk_family != afinfo->family ||
2104                             !net_eq(sock_net(sk), net)) {
2105                                 continue;
2106                         }
2107                         rc = sk;
2108                         goto out;
2109                 }
2110                 spin_unlock_bh(lock);
2111         }
2112 out:
2113         return rc;
2114 }
2115
2116 static void *established_get_next(struct seq_file *seq, void *cur)
2117 {
2118         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2119         struct sock *sk = cur;
2120         struct hlist_nulls_node *node;
2121         struct tcp_iter_state *st = seq->private;
2122         struct net *net = seq_file_net(seq);
2123
2124         ++st->num;
2125         ++st->offset;
2126
2127         sk = sk_nulls_next(sk);
2128
2129         sk_nulls_for_each_from(sk, node) {
2130                 if (sk->sk_family == afinfo->family &&
2131                     net_eq(sock_net(sk), net))
2132                         return sk;
2133         }
2134
2135         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2136         ++st->bucket;
2137         return established_get_first(seq);
2138 }
2139
2140 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2141 {
2142         struct tcp_iter_state *st = seq->private;
2143         void *rc;
2144
2145         st->bucket = 0;
2146         rc = established_get_first(seq);
2147
2148         while (rc && pos) {
2149                 rc = established_get_next(seq, rc);
2150                 --pos;
2151         }
2152         return rc;
2153 }
2154
2155 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2156 {
2157         void *rc;
2158         struct tcp_iter_state *st = seq->private;
2159
2160         st->state = TCP_SEQ_STATE_LISTENING;
2161         rc        = listening_get_idx(seq, &pos);
2162
2163         if (!rc) {
2164                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2165                 rc        = established_get_idx(seq, pos);
2166         }
2167
2168         return rc;
2169 }
2170
2171 static void *tcp_seek_last_pos(struct seq_file *seq)
2172 {
2173         struct tcp_iter_state *st = seq->private;
2174         int offset = st->offset;
2175         int orig_num = st->num;
2176         void *rc = NULL;
2177
2178         switch (st->state) {
2179         case TCP_SEQ_STATE_LISTENING:
2180                 if (st->bucket >= INET_LHTABLE_SIZE)
2181                         break;
2182                 st->state = TCP_SEQ_STATE_LISTENING;
2183                 rc = listening_get_next(seq, NULL);
2184                 while (offset-- && rc)
2185                         rc = listening_get_next(seq, rc);
2186                 if (rc)
2187                         break;
2188                 st->bucket = 0;
2189                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2190                 /* Fallthrough */
2191         case TCP_SEQ_STATE_ESTABLISHED:
2192                 if (st->bucket > tcp_hashinfo.ehash_mask)
2193                         break;
2194                 rc = established_get_first(seq);
2195                 while (offset-- && rc)
2196                         rc = established_get_next(seq, rc);
2197         }
2198
2199         st->num = orig_num;
2200
2201         return rc;
2202 }
2203
2204 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2205 {
2206         struct tcp_iter_state *st = seq->private;
2207         void *rc;
2208
2209         if (*pos && *pos == st->last_pos) {
2210                 rc = tcp_seek_last_pos(seq);
2211                 if (rc)
2212                         goto out;
2213         }
2214
2215         st->state = TCP_SEQ_STATE_LISTENING;
2216         st->num = 0;
2217         st->bucket = 0;
2218         st->offset = 0;
2219         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2220
2221 out:
2222         st->last_pos = *pos;
2223         return rc;
2224 }
2225 EXPORT_SYMBOL(tcp_seq_start);
2226
2227 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2228 {
2229         struct tcp_iter_state *st = seq->private;
2230         void *rc = NULL;
2231
2232         if (v == SEQ_START_TOKEN) {
2233                 rc = tcp_get_idx(seq, 0);
2234                 goto out;
2235         }
2236
2237         switch (st->state) {
2238         case TCP_SEQ_STATE_LISTENING:
2239                 rc = listening_get_next(seq, v);
2240                 if (!rc) {
2241                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2242                         st->bucket = 0;
2243                         st->offset = 0;
2244                         rc        = established_get_first(seq);
2245                 }
2246                 break;
2247         case TCP_SEQ_STATE_ESTABLISHED:
2248                 rc = established_get_next(seq, v);
2249                 break;
2250         }
2251 out:
2252         ++*pos;
2253         st->last_pos = *pos;
2254         return rc;
2255 }
2256 EXPORT_SYMBOL(tcp_seq_next);
2257
2258 void tcp_seq_stop(struct seq_file *seq, void *v)
2259 {
2260         struct tcp_iter_state *st = seq->private;
2261
2262         switch (st->state) {
2263         case TCP_SEQ_STATE_LISTENING:
2264                 if (v != SEQ_START_TOKEN)
2265                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2266                 break;
2267         case TCP_SEQ_STATE_ESTABLISHED:
2268                 if (v)
2269                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2270                 break;
2271         }
2272 }
2273 EXPORT_SYMBOL(tcp_seq_stop);
2274
2275 static void get_openreq4(const struct request_sock *req,
2276                          struct seq_file *f, int i)
2277 {
2278         const struct inet_request_sock *ireq = inet_rsk(req);
2279         long delta = req->rsk_timer.expires - jiffies;
2280
2281         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2282                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2283                 i,
2284                 ireq->ir_loc_addr,
2285                 ireq->ir_num,
2286                 ireq->ir_rmt_addr,
2287                 ntohs(ireq->ir_rmt_port),
2288                 TCP_SYN_RECV,
2289                 0, 0, /* could print option size, but that is af dependent. */
2290                 1,    /* timers active (only the expire timer) */
2291                 jiffies_delta_to_clock_t(delta),
2292                 req->num_timeout,
2293                 from_kuid_munged(seq_user_ns(f),
2294                                  sock_i_uid(req->rsk_listener)),
2295                 0,  /* non standard timer */
2296                 0, /* open_requests have no inode */
2297                 0,
2298                 req);
2299 }
2300
2301 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2302 {
2303         int timer_active;
2304         unsigned long timer_expires;
2305         const struct tcp_sock *tp = tcp_sk(sk);
2306         const struct inet_connection_sock *icsk = inet_csk(sk);
2307         const struct inet_sock *inet = inet_sk(sk);
2308         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2309         __be32 dest = inet->inet_daddr;
2310         __be32 src = inet->inet_rcv_saddr;
2311         __u16 destp = ntohs(inet->inet_dport);
2312         __u16 srcp = ntohs(inet->inet_sport);
2313         int rx_queue;
2314         int state;
2315
2316         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2317             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2318             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2319                 timer_active    = 1;
2320                 timer_expires   = icsk->icsk_timeout;
2321         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2322                 timer_active    = 4;
2323                 timer_expires   = icsk->icsk_timeout;
2324         } else if (timer_pending(&sk->sk_timer)) {
2325                 timer_active    = 2;
2326                 timer_expires   = sk->sk_timer.expires;
2327         } else {
2328                 timer_active    = 0;
2329                 timer_expires = jiffies;
2330         }
2331
2332         state = inet_sk_state_load(sk);
2333         if (state == TCP_LISTEN)
2334                 rx_queue = sk->sk_ack_backlog;
2335         else
2336                 /* Because we don't lock the socket,
2337                  * we might find a transient negative value.
2338                  */
2339                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2340
2341         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2342                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2343                 i, src, srcp, dest, destp, state,
2344                 tp->write_seq - tp->snd_una,
2345                 rx_queue,
2346                 timer_active,
2347                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2348                 icsk->icsk_retransmits,
2349                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2350                 icsk->icsk_probes_out,
2351                 sock_i_ino(sk),
2352                 refcount_read(&sk->sk_refcnt), sk,
2353                 jiffies_to_clock_t(icsk->icsk_rto),
2354                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2355                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2356                 tp->snd_cwnd,
2357                 state == TCP_LISTEN ?
2358                     fastopenq->max_qlen :
2359                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2360 }
2361
2362 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2363                                struct seq_file *f, int i)
2364 {
2365         long delta = tw->tw_timer.expires - jiffies;
2366         __be32 dest, src;
2367         __u16 destp, srcp;
2368
2369         dest  = tw->tw_daddr;
2370         src   = tw->tw_rcv_saddr;
2371         destp = ntohs(tw->tw_dport);
2372         srcp  = ntohs(tw->tw_sport);
2373
2374         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2375                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2376                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2377                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2378                 refcount_read(&tw->tw_refcnt), tw);
2379 }
2380
2381 #define TMPSZ 150
2382
2383 static int tcp4_seq_show(struct seq_file *seq, void *v)
2384 {
2385         struct tcp_iter_state *st;
2386         struct sock *sk = v;
2387
2388         seq_setwidth(seq, TMPSZ - 1);
2389         if (v == SEQ_START_TOKEN) {
2390                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2391                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2392                            "inode");
2393                 goto out;
2394         }
2395         st = seq->private;
2396
2397         if (sk->sk_state == TCP_TIME_WAIT)
2398                 get_timewait4_sock(v, seq, st->num);
2399         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2400                 get_openreq4(v, seq, st->num);
2401         else
2402                 get_tcp4_sock(v, seq, st->num);
2403 out:
2404         seq_pad(seq, '\n');
2405         return 0;
2406 }
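/* The address columns printed above are the raw 32-bit address and the port
 * in hex.  A hedged user-space sketch decoding one field, assuming it is
 * parsed on the same (little-endian) machine that produced it:
 *
 *	unsigned int raw, port;
 *	struct in_addr in;
 *
 *	sscanf("0100007F:0016", "%X:%X", &raw, &port);
 *	in.s_addr = raw;			// stores the kernel's __be32 back as-is
 *	printf("%s:%u\n", inet_ntoa(in), port);	// -> 127.0.0.1:22
 */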
2407
2408 static const struct seq_operations tcp4_seq_ops = {
2409         .show           = tcp4_seq_show,
2410         .start          = tcp_seq_start,
2411         .next           = tcp_seq_next,
2412         .stop           = tcp_seq_stop,
2413 };
2414
2415 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2416         .family         = AF_INET,
2417 };
2418
2419 static int __net_init tcp4_proc_init_net(struct net *net)
2420 {
2421         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2422                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2423                 return -ENOMEM;
2424         return 0;
2425 }
2426
2427 static void __net_exit tcp4_proc_exit_net(struct net *net)
2428 {
2429         remove_proc_entry("tcp", net->proc_net);
2430 }
2431
2432 static struct pernet_operations tcp4_net_ops = {
2433         .init = tcp4_proc_init_net,
2434         .exit = tcp4_proc_exit_net,
2435 };
2436
2437 int __init tcp4_proc_init(void)
2438 {
2439         return register_pernet_subsys(&tcp4_net_ops);
2440 }
2441
2442 void tcp4_proc_exit(void)
2443 {
2444         unregister_pernet_subsys(&tcp4_net_ops);
2445 }
2446 #endif /* CONFIG_PROC_FS */
2447
2448 struct proto tcp_prot = {
2449         .name                   = "TCP",
2450         .owner                  = THIS_MODULE,
2451         .close                  = tcp_close,
2452         .pre_connect            = tcp_v4_pre_connect,
2453         .connect                = tcp_v4_connect,
2454         .disconnect             = tcp_disconnect,
2455         .accept                 = inet_csk_accept,
2456         .ioctl                  = tcp_ioctl,
2457         .init                   = tcp_v4_init_sock,
2458         .destroy                = tcp_v4_destroy_sock,
2459         .shutdown               = tcp_shutdown,
2460         .setsockopt             = tcp_setsockopt,
2461         .getsockopt             = tcp_getsockopt,
2462         .keepalive              = tcp_set_keepalive,
2463         .recvmsg                = tcp_recvmsg,
2464         .sendmsg                = tcp_sendmsg,
2465         .sendpage               = tcp_sendpage,
2466         .backlog_rcv            = tcp_v4_do_rcv,
2467         .release_cb             = tcp_release_cb,
2468         .hash                   = inet_hash,
2469         .unhash                 = inet_unhash,
2470         .get_port               = inet_csk_get_port,
2471         .enter_memory_pressure  = tcp_enter_memory_pressure,
2472         .leave_memory_pressure  = tcp_leave_memory_pressure,
2473         .stream_memory_free     = tcp_stream_memory_free,
2474         .sockets_allocated      = &tcp_sockets_allocated,
2475         .orphan_count           = &tcp_orphan_count,
2476         .memory_allocated       = &tcp_memory_allocated,
2477         .memory_pressure        = &tcp_memory_pressure,
2478         .sysctl_mem             = sysctl_tcp_mem,
2479         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2480         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2481         .max_header             = MAX_TCP_HEADER,
2482         .obj_size               = sizeof(struct tcp_sock),
2483         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2484         .twsk_prot              = &tcp_timewait_sock_ops,
2485         .rsk_prot               = &tcp_request_sock_ops,
2486         .h.hashinfo             = &tcp_hashinfo,
2487         .no_autobind            = true,
2488 #ifdef CONFIG_COMPAT
2489         .compat_setsockopt      = compat_tcp_setsockopt,
2490         .compat_getsockopt      = compat_tcp_getsockopt,
2491 #endif
2492         .diag_destroy           = tcp_abort,
2493 };
2494 EXPORT_SYMBOL(tcp_prot);
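/* The .keepalive hook above is what SO_KEEPALIVE reaches; the per-connection
 * timers can then be tuned with the TCP-level options.  A hedged user-space
 * sketch (the values are arbitrary examples):
 *
 *	int on = 1, idle = 60, intvl = 10, cnt = 5;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
 *
 * Sockets that do not set these options fall back to the per-namespace
 * sysctl_tcp_keepalive_* defaults initialized in tcp_sk_init() below.
 */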
2495
2496 static void __net_exit tcp_sk_exit(struct net *net)
2497 {
2498         int cpu;
2499
2500         module_put(net->ipv4.tcp_congestion_control->owner);
2501
2502         for_each_possible_cpu(cpu)
2503                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2504         free_percpu(net->ipv4.tcp_sk);
2505 }
2506
2507 static int __net_init tcp_sk_init(struct net *net)
2508 {
2509         int res, cpu, cnt;
2510
2511         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2512         if (!net->ipv4.tcp_sk)
2513                 return -ENOMEM;
2514
2515         for_each_possible_cpu(cpu) {
2516                 struct sock *sk;
2517
2518                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2519                                            IPPROTO_TCP, net);
2520                 if (res)
2521                         goto fail;
2522                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2523
2524                 /* Please enforce IP_DF and IPID==0 for RST and
2525                  * ACK sent in SYN-RECV and TIME-WAIT state.
2526                  */
2527                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2528
2529                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2530         }
2531
2532         net->ipv4.sysctl_tcp_ecn = 2;
2533         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2534
2535         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2536         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2537         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2538
2539         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2540         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2541         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2542
2543         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2544         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2545         net->ipv4.sysctl_tcp_syncookies = 1;
2546         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2547         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2548         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2549         net->ipv4.sysctl_tcp_orphan_retries = 0;
2550         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2551         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2552         net->ipv4.sysctl_tcp_tw_reuse = 2;
2553
2554         cnt = tcp_hashinfo.ehash_mask + 1;
2555         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2556         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2557
2558         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2559         net->ipv4.sysctl_tcp_sack = 1;
2560         net->ipv4.sysctl_tcp_window_scaling = 1;
2561         net->ipv4.sysctl_tcp_timestamps = 1;
2562         net->ipv4.sysctl_tcp_early_retrans = 3;
2563         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2564         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2565         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2566         net->ipv4.sysctl_tcp_max_reordering = 300;
2567         net->ipv4.sysctl_tcp_dsack = 1;
2568         net->ipv4.sysctl_tcp_app_win = 31;
2569         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2570         net->ipv4.sysctl_tcp_frto = 2;
2571         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2572         /* This limits the percentage of the congestion window which we
2573          * will allow a single TSO frame to consume.  Building TSO frames
2574          * which are too large can cause TCP streams to be bursty.
2575          */
2576         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2577         /* Default TSQ limit of four TSO segments */
2578         net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2579         /* rfc5961 challenge ack rate limiting */
2580         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2581         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2582         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2583         net->ipv4.sysctl_tcp_autocorking = 1;
2584         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2585         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2586         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2587         if (net != &init_net) {
2588                 memcpy(net->ipv4.sysctl_tcp_rmem,
2589                        init_net.ipv4.sysctl_tcp_rmem,
2590                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2591                 memcpy(net->ipv4.sysctl_tcp_wmem,
2592                        init_net.ipv4.sysctl_tcp_wmem,
2593                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2594         }
2595         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2596         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2597         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2598         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2599         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2600         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2601
2602         /* Reno is always built in */
2603         if (!net_eq(net, &init_net) &&
2604             try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2605                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2606         else
2607                 net->ipv4.tcp_congestion_control = &tcp_reno;
2608
2609         return 0;
2610 fail:
2611         tcp_sk_exit(net);
2612
2613         return res;
2614 }
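/* The sysctl_* defaults set above are exposed under /proc/sys/net/ipv4/.
 * A hedged user-space sketch flipping one of them at runtime (the chosen
 * knob is just an example):
 *
 *	int fd = open("/proc/sys/net/ipv4/tcp_syncookies", O_WRONLY);
 *
 *	if (fd >= 0) {
 *		write(fd, "0", 1);	// updates net->ipv4.sysctl_tcp_syncookies
 *		close(fd);
 *	}
 */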
2615
2616 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2617 {
2618         struct net *net;
2619
2620         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2621
2622         list_for_each_entry(net, net_exit_list, exit_list)
2623                 tcp_fastopen_ctx_destroy(net);
2624 }
2625
2626 static struct pernet_operations __net_initdata tcp_sk_ops = {
2627        .init       = tcp_sk_init,
2628        .exit       = tcp_sk_exit,
2629        .exit_batch = tcp_sk_exit_batch,
2630 };
2631
2632 void __init tcp_v4_init(void)
2633 {
2634         if (register_pernet_subsys(&tcp_sk_ops))
2635                 panic("Failed to create the TCP control socket.\n");
2636 }