/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 *                                      coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
 *                                      a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

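/* Both knobs above are runtime-tunable; assuming a standard sysctl setup
 * they appear under /proc/sys/net/ipv4, so for example TIME-WAIT reuse
 * for outgoing connections can be enabled with:
 *
 *      sysctl -w net.ipv4.tcp_tw_reuse=1
 *
 * and the prequeue path (see tcp_prequeue() below) bypassed with:
 *
 *      sysctl -w net.ipv4.tcp_low_latency=1
 */
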
#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
                                          ip_hdr(skb)->saddr,
                                          tcp_hdr(skb)->dest,
                                          tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);

        /* With PAWS, it is safe from the viewpoint
           of data integrity. Even without PAWS it is safe provided sequence
           spaces do not overlap i.e. at data rates <= 80Mbit/sec.

           Actually, the idea is close to VJ's one, only the timestamp cache is
           held not per host, but per port pair, and the TW bucket is used as
           the state holder.

           If the TW bucket has already been destroyed we fall back to VJ's
           scheme and use the initial timestamp retrieved from the peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (!twp || (sysctl_tcp_tw_reuse &&
                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                if (tp->write_seq == 0)
                        tp->write_seq = 1;
                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                sock_hold(sktw);
                return 1;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

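/* A rough worked example for the 80 Mbit/sec bound mentioned above:
 * at 80 Mbit/sec (10 MB/sec) the 2^32-byte sequence space wraps only
 * every 2^32 / 10^7 ~= 430 seconds, comfortably longer than a TIME-WAIT
 * lifetime, so a reused port pair cannot catch up with the old
 * incarnation's sequence numbers; the +65535+2 offset on write_seq then
 * starts the new connection strictly ahead of tw_snd_nxt.
 */
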
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        __be16 orig_sport, orig_dport;
        __be32 daddr, nexthop;
        struct flowi4 *fl4;
        struct rtable *rt;
        int err;
        struct ip_options_rcu *inet_opt;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        inet_opt = rcu_dereference_protected(inet->inet_opt,
                                             sock_owned_by_user(sk));
        if (inet_opt && inet_opt->opt.srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet_opt->opt.faddr;
        }

        orig_sport = inet->inet_sport;
        orig_dport = usin->sin_port;
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                              IPPROTO_TCP,
                              orig_sport, orig_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                if (err == -ENETUNREACH)
                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
                return err;
        }

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet_opt || !inet_opt->opt.srr)
                daddr = fl4->daddr;

        if (!inet->inet_saddr)
                inet->inet_saddr = fl4->saddr;
        sk_rcv_saddr_set(sk, inet->inet_saddr);

        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                if (likely(!tp->repair))
                        tp->write_seq      = 0;
        }

        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
                tcp_fetch_timewait_stamp(sk, &rt->dst);

        inet->inet_dport = usin->sin_port;
        sk_daddr_set(sk, daddr);

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and, without releasing the socket
         * lock, select a source port, enter ourselves into the hash tables
         * and complete the initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
                goto failure;

        sk_set_txhash(sk);

        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                               inet->inet_sport, inet->inet_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                rt = NULL;
                goto failure;
        }
        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->dst);

        if (!tp->write_seq && likely(!tp->repair))
                tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
                                                           inet->inet_daddr,
                                                           inet->inet_sport,
                                                           usin->sin_port);

        inet->inet_id = tp->write_seq ^ jiffies;

        err = tcp_connect(sk);

        rt = NULL;
        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
        return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

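/* For reference, the userspace path that ends up here is an ordinary
 * connect() on a TCP socket (a minimal sketch; error handling omitted,
 * 192.0.2.1 is just a documentation address):
 *
 *      int fd = socket(AF_INET, SOCK_STREAM, 0);
 *      struct sockaddr_in sin = {
 *              .sin_family = AF_INET,
 *              .sin_port   = htons(80),
 *      };
 *      inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
 *      connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 *
 * The addr_len and sin_family checks at the top of tcp_v4_connect() are
 * what reject a short or non-AF_INET sockaddr with -EINVAL or
 * -EAFNOSUPPORT.
 */
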
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if the socket was owned by the
 * user at the time tcp_v4_err() was called to handle the ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
        struct dst_entry *dst;
        struct inet_sock *inet = inet_sk(sk);
        u32 mtu = tcp_sk(sk)->mtu_info;

        dst = inet_csk_update_pmtu(sk, mtu);
        if (!dst)
                return;

        /* Something is about to go wrong. Remember the soft error
         * in case this connection is unable to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            ip_sk_accept_pmtu(sk) &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

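/* The inet->pmtudisc test above honours the per-socket IP_MTU_DISCOVER
 * setting, so an application that has opted out of path MTU discovery
 * does not get its MSS shrunk here. A userspace sketch:
 *
 *      int val = IP_PMTUDISC_DONT;
 *      setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 */
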
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_check(sk, 0);

        if (dst)
                dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
        struct request_sock *req = inet_reqsk(sk);
        struct net *net = sock_net(sk);

        /* ICMPs are not backlogged, hence we cannot get
         * an established socket here.
         */
        if (seq != tcp_rsk(req)->snt_isn) {
                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
        } else if (abort) {
                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
                NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
        }
        reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment,
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket, the ICMP is just dropped,
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
        struct inet_connection_sock *icsk;
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(icmp_skb)->type;
        const int code = icmp_hdr(icmp_skb)->code;
        struct sock *sk;
        struct sk_buff *skb;
        struct request_sock *fastopen;
        __u32 seq, snd_una;
        __u32 remaining;
        int err;
        struct net *net = dev_net(icmp_skb->dev);

        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
                                       th->dest, iph->saddr, ntohs(th->source),
                                       inet_iif(icmp_skb));
        if (!sk) {
                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }
        seq = ntohl(th->seq);
        if (sk->sk_state == TCP_NEW_SYN_RECV)
                return tcp_req_err(sk, seq,
                                  type == ICMP_PARAMETERPROB ||
                                  type == ICMP_TIME_EXCEEDED ||
                                  (type == ICMP_DEST_UNREACH &&
                                   (code == ICMP_NET_UNREACH ||
                                    code == ICMP_HOST_UNREACH)));

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         * We do take care of PMTU discovery (RFC1191) special case :
         * we can receive locally generated ICMP messages while socket is held.
         */
        if (sock_owned_by_user(sk)) {
                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
                        NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
        }
        if (sk->sk_state == TCP_CLOSE)
                goto out;

        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
                goto out;
        }

        icsk = inet_csk(sk);
        tp = tcp_sk(sk);
        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
        fastopen = tp->fastopen_rsk;
        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, snd_una, tp->snd_nxt)) {
                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_REDIRECT:
                do_redirect(icmp_skb, sk);
                goto out;
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        /* We are not interested in TCP_LISTEN and open_requests
                         * (SYN-ACKs sent out by Linux are always < 576 bytes,
                         * so they should go through unfragmented).
                         */
                        if (sk->sk_state == TCP_LISTEN)
                                goto out;

                        tp->mtu_info = info;
                        if (!sock_owned_by_user(sk)) {
                                tcp_v4_mtu_reduced(sk);
                        } else {
                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
                                        sock_hold(sk);
                        }
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                /* check if icmp_skb allows revert of backoff
                 * (see draft-zimmermann-tcp-lcd) */
                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
                        break;
                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
                    !icsk->icsk_backoff || fastopen)
                        break;

                if (sock_owned_by_user(sk))
                        break;

                icsk->icsk_backoff--;
                icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
                                               TCP_TIMEOUT_INIT;
                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

                skb = tcp_write_queue_head(sk);
                BUG_ON(!skb);

                remaining = icsk->icsk_rto -
                            min(icsk->icsk_rto,
                                tcp_time_stamp - tcp_skb_timestamp(skb));

                if (remaining) {
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  remaining, TCP_RTO_MAX);
                } else {
                        /* RTO revert clocked out retransmission.
                         * Will retransmit now */
                        tcp_retransmit_timer(sk);
                }

                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
        case TCP_SYN_SENT:
        case TCP_SYN_RECV:
                /* Only in fast or simultaneous open. If a fast open socket
                 * is already accepted it is treated as a connected one below.
                 */
                if (fastopen && !fastopen->sk)
                        break;

                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
         * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
         * obsoleted by pmtu discovery).
         *
         * Note that in the modern Internet, where routing is unreliable
         * and broken firewalls sit in every dark corner sending arbitrary
         * errors on their masters' orders, even these two messages have
         * finally lost their original sense (even Linux sends invalid
         * PORT_UNREACHs).
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else  { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}

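/* The inet->recverr test at the end of tcp_v4_err() corresponds to the
 * IP_RECVERR socket option: with it enabled, ICMP-derived errors are
 * reported to the application immediately rather than only on timeout.
 * A userspace sketch:
 *
 *      int on = 1;
 *      setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));
 *
 * after which a pending connect()/send() can fail with e.g. EHOSTUNREACH
 * as soon as the corresponding ICMP error arrives.
 */
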
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct tcphdr *th = tcp_hdr(skb);

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
                skb->csum_start = skb_transport_header(skb) - skb->head;
                skb->csum_offset = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(skb->len, saddr, daddr,
                                         csum_partial(th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
        const struct inet_sock *inet = inet_sk(sk);

        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *                    for the reset.
 *      Answer: if a packet caused an RST, it is not for a socket
 *              existing in our system; if it is matched to a socket,
 *              it is just a duplicate segment or a bug in the other side's
 *              TCP. So we build the reply based only on the parameters
 *              that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key = NULL;
        const __u8 *hash_location = NULL;
        unsigned char newhash[16];
        int genhash;
        struct sock *sk1 = NULL;
#endif
        struct net *net;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        /* If sk is not NULL, it means we did a successful lookup and the
         * incoming route had to be correct. prequeue might have dropped our dst.
         */
        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
        hash_location = tcp_parse_md5sig_option(th);
        if (sk && sk_fullsock(sk)) {
                key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
        } else if (hash_location) {
                /*
                 * The active side is lost. Try to find a listening socket
                 * through the source port, and then find the md5 key through
                 * the listening socket. We are not losing security here:
                 * the incoming packet is checked against the md5 hash of the
                 * key we find, and no RST is generated if the hash doesn't
                 * match.
                 */
                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
                                             ip_hdr(skb)->saddr,
                                             th->source, ip_hdr(skb)->daddr,
                                             ntohs(th->source), inet_iif(skb));
                /* don't send an RST if we can't find a key */
                if (!sk1)
                        return;
                rcu_read_lock();
                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
                if (!key)
                        goto release_sk1;

                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
                if (genhash || memcmp(hash_location, newhash, 16) != 0)
                        goto release_sk1;
        }

        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
                                     key, ip_hdr(skb)->saddr,
                                     ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

        /* When the socket is gone, all binding information is lost.
         * Routing might fail in this case. No choice here: if we choose to
         * force the input interface, we will misroute in case of an
         * asymmetric route.
         */
        if (sk)
                arg.bound_dev_if = sk->sk_bound_dev_if;

        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

        arg.tos = ip_hdr(skb)->tos;
        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
        TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
        if (sk1) {
                rcu_read_unlock();
                sock_put(sk1);
        }
#endif
}

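/* Worked example for the ack_seq computation in tcp_v4_send_reset():
 * for an incoming SYN with seq = S, no payload (skb->len equals the
 * header length th->doff << 2) and th->fin = 0, the RST carries
 * ack_seq = S + 1, i.e. it acknowledges exactly the sequence space the
 * offending segment consumed.
 */
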
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct net *net,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags, u8 tos)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct ip_reply_arg arg;

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (tsecr) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tsval);
                rep.opt[2] = htonl(tsecr);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        if (key) {
                int offset = (tsecr) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len/4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
                                    key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.flags = reply_flags;
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (oif)
                arg.bound_dev_if = oif;
        arg.tos = tos;
        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(sock_net(sk), skb,
                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcp_time_stamp + tcptw->tw_ts_offset,
                        tcptw->tw_ts_recent,
                        tw->tw_bound_dev_if,
                        tcp_twsk_md5_key(tcptw),
                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        tw->tw_tos
                        );

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req)
{
        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
         */
        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
                                             tcp_sk(sk)->snd_nxt;

        tcp_v4_send_ack(sock_net(sk), skb, seq,
                        tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
                        tcp_time_stamp,
                        req->ts_recent,
                        0,
                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
                                          AF_INET),
                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        ip_hdr(skb)->tos);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
                              struct flowi *fl,
                              struct request_sock *req,
                              struct tcp_fastopen_cookie *foc,
                              bool attach_req)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct flowi4 fl4;
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
                return -1;

        skb = tcp_make_synack(sk, dst, req, foc, attach_req);

        if (skb) {
                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
                                            ireq->ir_rmt_addr,
                                            ireq->opt);
                err = net_xmit_eval(err);
        }

        return err;
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
                                         const union tcp_md5_addr *addr,
                                         int family)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        unsigned int size = sizeof(struct in_addr);
        const struct tcp_md5sig_info *md5sig;

        /* caller either holds rcu_read_lock() or socket lock */
        md5sig = rcu_dereference_check(tp->md5sig_info,
                                       sock_owned_by_user(sk) ||
                                       lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
        if (!md5sig)
                return NULL;
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                size = sizeof(struct in6_addr);
#endif
        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
                if (key->family != family)
                        continue;
                if (!memcmp(&key->addr, addr, size))
                        return key;
        }
        return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
                                         const struct sock *addr_sk)
{
        const union tcp_md5_addr *addr;

        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
        return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
        /* Add Key to the list */
        struct tcp_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_info *md5sig;

        key = tcp_md5_do_lookup(sk, addr, family);
        if (key) {
                /* Pre-existing entry - just update that one. */
                memcpy(key->key, newkey, newkeylen);
                key->keylen = newkeylen;
                return 0;
        }

        md5sig = rcu_dereference_protected(tp->md5sig_info,
                                           sock_owned_by_user(sk) ||
                                           lockdep_is_held(&sk->sk_lock.slock));
        if (!md5sig) {
                md5sig = kmalloc(sizeof(*md5sig), gfp);
                if (!md5sig)
                        return -ENOMEM;

                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                INIT_HLIST_HEAD(&md5sig->head);
                rcu_assign_pointer(tp->md5sig_info, md5sig);
        }

        key = sock_kmalloc(sk, sizeof(*key), gfp);
        if (!key)
                return -ENOMEM;
        if (!tcp_alloc_md5sig_pool()) {
                sock_kfree_s(sk, key, sizeof(*key));
                return -ENOMEM;
        }

        memcpy(key->key, newkey, newkeylen);
        key->keylen = newkeylen;
        key->family = family;
        memcpy(&key->addr, addr,
               (family == AF_INET6) ? sizeof(struct in6_addr) :
                                      sizeof(struct in_addr));
        hlist_add_head_rcu(&key->node, &md5sig->head);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
        struct tcp_md5sig_key *key;

        key = tcp_md5_do_lookup(sk, addr, family);
        if (!key)
                return -ENOENT;
        hlist_del_rcu(&key->node);
        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
        kfree_rcu(key, rcu);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        struct hlist_node *n;
        struct tcp_md5sig_info *md5sig;

        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
                hlist_del_rcu(&key->node);
                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
                kfree_rcu(key, rcu);
        }
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
                                 int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (!cmd.tcpm_keylen)
                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                                      AF_INET);

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                              AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
                              GFP_KERNEL);
}

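/* tcp_v4_parse_md5_keys() above is the setsockopt() backend for the
 * TCP_MD5SIG option (RFC 2385, typically used by BGP). A userspace
 * sketch, assuming the struct tcp_md5sig definition from <linux/tcp.h>
 * (192.0.2.1 is just a documentation address):
 *
 *      struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *      struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *      sin->sin_family = AF_INET;
 *      inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *      memcpy(md5.tcpm_key, "secret", 6);
 *      setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 instead deletes the key for that peer,
 * matching the tcp_md5_do_del() branch above.
 */
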
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
                                        __be32 daddr, __be32 saddr, int nbytes)
{
        struct tcp4_pseudohdr *bp;
        struct scatterlist sg;

        bp = &hp->md5_blk.ip4;

        /*
         * 1. the TCP pseudo-header (in the order: source IP address,
         * destination IP address, zero-padded protocol number, and
         * segment length)
         */
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = IPPROTO_TCP;
        bp->len = cpu_to_be16(nbytes);

        sg_init_one(&sg, bp, sizeof(*bp));
        ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
        return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;
        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_header(hp, th))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
                        const struct sock *sk,
                        const struct sk_buff *skb)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;
        const struct tcphdr *th = tcp_hdr(skb);
        __be32 saddr, daddr;

        if (sk) { /* valid for establish/request sockets */
                saddr = sk->sk_rcv_saddr;
                daddr = sk->sk_daddr;
        } else {
                const struct iphdr *iph = ip_hdr(skb);
                saddr = iph->saddr;
                daddr = iph->daddr;
        }

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;

        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
                goto clear_hash;
        if (tcp_md5_hash_header(hp, th))
                goto clear_hash;
        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
                                    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
        /*
         * This gets called for each TCP segment that arrives
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        const __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        const struct iphdr *iph = ip_hdr(skb);
        const struct tcphdr *th = tcp_hdr(skb);
        int genhash;
        unsigned char newhash[16];

        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
                                          AF_INET);
        hash_location = tcp_parse_md5sig_option(th);

        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return false;

        if (hash_expected && !hash_location) {
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
                return true;
        }

        if (!hash_expected && hash_location) {
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
                return true;
        }

        /* Okay, so this is hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_md5_hash_skb(newhash,
                                      hash_expected,
                                      NULL, skb);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
                                     &iph->saddr, ntohs(th->source),
                                     &iph->daddr, ntohs(th->dest),
                                     genhash ? " tcp_v4_calc_md5_hash failed"
                                     : "");
                return true;
        }
        return false;
#endif
        return false;
}

static void tcp_v4_init_req(struct request_sock *req,
                            const struct sock *sk_listener,
                            struct sk_buff *skb)
{
        struct inet_request_sock *ireq = inet_rsk(req);

        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
        ireq->no_srccheck = inet_sk(sk_listener)->transparent;
        ireq->opt = tcp_v4_save_options(skb);
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
                                          struct flowi *fl,
                                          const struct request_sock *req,
                                          bool *strict)
{
        struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

        if (strict) {
                if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
                        *strict = true;
                else
                        *strict = false;
        }

        return dst;
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_rtx_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
        .syn_ack_timeout =      tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .mss_clamp      =       TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
        .req_md5_lookup =       tcp_v4_md5_lookup,
        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
#endif
        .init_req       =       tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
        .cookie_init_seq =      cookie_v4_init_sequence,
#endif
        .route_req      =       tcp_v4_route_req,
        .init_seq       =       tcp_v4_init_sequence,
        .send_synack    =       tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        /* Never answer SYNs sent to broadcast or multicast addresses */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        return tcp_conn_request(&tcp_request_sock_ops,
                                &tcp_request_sock_ipv4_ops, sk, skb);

drop:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
        return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid ACK -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct dst_entry *dst,
                                  struct request_sock *req_unhash,
                                  bool *own_req)
{
        struct inet_request_sock *ireq;
        struct inet_sock *newinet;
        struct tcp_sock *newtp;
        struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
#endif
        struct ip_options_rcu *inet_opt;

        if (sk_acceptq_is_full(sk))
                goto exit_overflow;

        newsk = tcp_create_openreq_child(sk, req, skb);
        if (!newsk)
                goto exit_nonewsk;

        newsk->sk_gso_type = SKB_GSO_TCPV4;
        inet_sk_rx_dst_set(newsk, skb);

        newtp                 = tcp_sk(newsk);
        newinet               = inet_sk(newsk);
        ireq                  = inet_rsk(req);
        sk_daddr_set(newsk, ireq->ir_rmt_addr);
        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
        newsk->sk_bound_dev_if = ireq->ir_iif;
        newinet->inet_saddr           = ireq->ir_loc_addr;
        inet_opt              = ireq->opt;
        rcu_assign_pointer(newinet->inet_opt, inet_opt);
        ireq->opt             = NULL;
        newinet->mc_index     = inet_iif(skb);
        newinet->mc_ttl       = ip_hdr(skb)->ttl;
        newinet->rcv_tos      = ip_hdr(skb)->tos;
        inet_csk(newsk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
        newinet->inet_id = newtp->write_seq ^ jiffies;

        if (!dst) {
                dst = inet_csk_route_child_sock(sk, newsk, req);
                if (!dst)
                        goto put_and_exit;
        } else {
                /* syncookie case : see end of cookie_v4_check() */
        }
        sk_setup_caps(newsk, dst);

        tcp_ca_openreq_child(newsk, dst);

        tcp_sync_mss(newsk, dst_mtu(dst));
        newtp->advmss = dst_metric_advmss(dst);
        if (tcp_sk(sk)->rx_opt.user_mss &&
            tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
                newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

        tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
        /* Copy over the MD5 key from the original socket */
        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
                                AF_INET);
        if (key) {
                /*
                 * We're using one, so create a matching key
                 * on the newsk structure. If we fail to get
                 * memory, then we end up not copying the key
                 * across. Shucks.
                 */
                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
                               AF_INET, key->key, key->keylen, GFP_ATOMIC);
                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
        }
#endif

        if (__inet_inherit_port(sk, newsk) < 0)
                goto put_and_exit;
        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
        if (*own_req)
                tcp_move_syn(newtp, req);

        return newsk;

exit_overflow:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
        dst_release(dst);
exit:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
        return NULL;
put_and_exit:
        inet_csk_prepare_forced_close(newsk);
        tcp_done(newsk);
        goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
        const struct tcphdr *th = tcp_hdr(skb);

        if (!th->syn)
                sk = cookie_v4_check(sk, skb);
#endif
        return sk;
}

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
1379 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1380 {
1381         struct sock *rsk;
1382
1383         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1384                 struct dst_entry *dst = sk->sk_rx_dst;
1385
1386                 sock_rps_save_rxhash(sk, skb);
1387                 sk_mark_napi_id(sk, skb);
1388                 if (dst) {
1389                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1390                             !dst->ops->check(dst, 0)) {
1391                                 dst_release(dst);
1392                                 sk->sk_rx_dst = NULL;
1393                         }
1394                 }
1395                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1396                 return 0;
1397         }
1398
1399         if (tcp_checksum_complete(skb))
1400                 goto csum_err;
1401
1402         if (sk->sk_state == TCP_LISTEN) {
1403                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1404
1405                 if (!nsk)
1406                         goto discard;
1407                 if (nsk != sk) {
1408                         sock_rps_save_rxhash(nsk, skb);
1409                         sk_mark_napi_id(nsk, skb);
1410                         if (tcp_child_process(sk, nsk, skb)) {
1411                                 rsk = nsk;
1412                                 goto reset;
1413                         }
1414                         return 0;
1415                 }
1416         } else
1417                 sock_rps_save_rxhash(sk, skb);
1418
1419         if (tcp_rcv_state_process(sk, skb)) {
1420                 rsk = sk;
1421                 goto reset;
1422         }
1423         return 0;
1424
1425 reset:
1426         tcp_v4_send_reset(rsk, skb);
1427 discard:
1428         kfree_skb(skb);
1429         /* Be careful here. If this function gets more complicated and
1430          * gcc suffers from register pressure on the x86, sk (in %ebx)
1431          * might be destroyed here. This current version compiles correctly,
1432          * but you have been warned.
1433          */
1434         return 0;
1435
1436 csum_err:
1437         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1438         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1439         goto discard;
1440 }
1441 EXPORT_SYMBOL(tcp_v4_do_rcv);
1442
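/* Early demux: called from the IP layer before the routing decision.
 * Looking up the established socket here lets us attach both the socket
 * and its cached input route (sk_rx_dst) to the skb, saving a second
 * socket lookup in tcp_v4_rcv() and a route lookup in ip_rcv_finish().
 */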
1443 void tcp_v4_early_demux(struct sk_buff *skb)
1444 {
1445         const struct iphdr *iph;
1446         const struct tcphdr *th;
1447         struct sock *sk;
1448
1449         if (skb->pkt_type != PACKET_HOST)
1450                 return;
1451
1452         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1453                 return;
1454
1455         iph = ip_hdr(skb);
1456         th = tcp_hdr(skb);
1457
1458         if (th->doff < sizeof(struct tcphdr) / 4)
1459                 return;
1460
1461         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1462                                        iph->saddr, th->source,
1463                                        iph->daddr, ntohs(th->dest),
1464                                        skb->skb_iif);
1465         if (sk) {
1466                 skb->sk = sk;
1467                 skb->destructor = sock_edemux;
1468                 if (sk_fullsock(sk)) {
1469                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1470
1471                         if (dst)
1472                                 dst = dst_check(dst, 0);
1473                         if (dst &&
1474                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1475                                 skb_dst_set_noref(skb, dst);
1476                 }
1477         }
1478 }
1479
1480 /* The packet is added to the VJ-style prequeue for processing in process
1481  * context if a reader task is waiting. Apparently, this exciting
1482  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1483  * failed somewhere. Latency? Burstiness? Well, at least now we will
1484  * see why it failed. 8)8)                               --ANK
1485  *
1486  */
1487 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1488 {
1489         struct tcp_sock *tp = tcp_sk(sk);
1490
1491         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1492                 return false;
1493
1494         if (skb->len <= tcp_hdrlen(skb) &&
1495             skb_queue_len(&tp->ucopy.prequeue) == 0)
1496                 return false;
1497
1498         /* Before escaping RCU protected region, we need to take care of skb
1499          * dst. Prequeue is only enabled for established sockets.
1500          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1501          * Instead of doing a full sk_rx_dst validity check here, let's perform
1502          * an optimistic one.
1503          */
1504         if (likely(sk->sk_rx_dst))
1505                 skb_dst_drop(skb);
1506         else
1507                 skb_dst_force_safe(skb);
1508
1509         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1510         tp->ucopy.memory += skb->truesize;
1511         if (tp->ucopy.memory > sk->sk_rcvbuf) {
1512                 struct sk_buff *skb1;
1513
1514                 BUG_ON(sock_owned_by_user(sk));
1515
1516                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1517                         sk_backlog_rcv(sk, skb1);
1518                         NET_INC_STATS_BH(sock_net(sk),
1519                                          LINUX_MIB_TCPPREQUEUEDROPPED);
1520                 }
1521
1522                 tp->ucopy.memory = 0;
1523         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1524                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1525                                            POLLIN | POLLRDNORM | POLLRDBAND);
1526                 if (!inet_csk_ack_scheduled(sk))
1527                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1528                                                   (3 * tcp_rto_min(sk)) / 4,
1529                                                   TCP_RTO_MAX);
1530         }
1531         return true;
1532 }
1533 EXPORT_SYMBOL(tcp_prequeue);
1534
1535 /*
1536  *      From tcp_input.c
1537  */
1538
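/* Main entry point for incoming segments, called from the IP layer in
 * softirq context.  We validate the header, fill in TCP_SKB_CB(), look
 * up the owning socket and dispatch: request sockets go through
 * tcp_check_req(), TIME_WAIT sockets through the do_time_wait logic
 * below, and everything else is processed directly, prequeued, or
 * backlogged depending on who currently owns the socket lock.
 */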
1539 int tcp_v4_rcv(struct sk_buff *skb)
1540 {
1541         const struct iphdr *iph;
1542         const struct tcphdr *th;
1543         struct sock *sk;
1544         int ret;
1545         struct net *net = dev_net(skb->dev);
1546
1547         if (skb->pkt_type != PACKET_HOST)
1548                 goto discard_it;
1549
1550         /* Count it even if it's bad */
1551         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1552
1553         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1554                 goto discard_it;
1555
1556         th = tcp_hdr(skb);
1557
1558         if (th->doff < sizeof(struct tcphdr) / 4)
1559                 goto bad_packet;
1560         if (!pskb_may_pull(skb, th->doff * 4))
1561                 goto discard_it;
1562
1563         /* An explanation is required here, I think.
1564          * Packet length and doff are validated by header prediction,
1565          * provided the case of th->doff == 0 is eliminated.
1566          * So, we defer the checks. */
1567
1568         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1569                 goto csum_error;
1570
1571         th = tcp_hdr(skb);
1572         iph = ip_hdr(skb);
1573         /* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB();
1574          * barrier() makes sure the compiler won't play fool^Waliasing games.
1575          */
1576         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1577                 sizeof(struct inet_skb_parm));
1578         barrier();
1579
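        /* SYN and FIN each occupy one unit of sequence space; that is why
         * end_seq adds th->syn and th->fin on top of the payload length.
         */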
1580         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1581         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1582                                     skb->len - th->doff * 4);
1583         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1584         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1585         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1586         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1587         TCP_SKB_CB(skb)->sacked  = 0;
1588
1589 lookup:
1590         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1591                                th->dest);
1592         if (!sk)
1593                 goto no_tcp_socket;
1594
1595 process:
1596         if (sk->sk_state == TCP_TIME_WAIT)
1597                 goto do_time_wait;
1598
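        /* A request socket matched.  The lookup was lockless, so re-check
         * that the listener is still in TCP_LISTEN before handing the
         * segment to tcp_check_req(), which may create the child socket.
         */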
1599         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1600                 struct request_sock *req = inet_reqsk(sk);
1601                 struct sock *nsk;
1602
1603                 sk = req->rsk_listener;
1604                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1605                         reqsk_put(req);
1606                         goto discard_it;
1607                 }
1608                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1609                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1610                         goto lookup;
1611                 }
1612                 sock_hold(sk);
1613                 nsk = tcp_check_req(sk, skb, req, false);
1614                 if (!nsk) {
1615                         reqsk_put(req);
1616                         goto discard_and_relse;
1617                 }
1618                 if (nsk == sk) {
1619                         reqsk_put(req);
1620                 } else if (tcp_child_process(sk, nsk, skb)) {
1621                         tcp_v4_send_reset(nsk, skb);
1622                         goto discard_and_relse;
1623                 } else {
1624                         sock_put(sk);
1625                         return 0;
1626                 }
1627         }
1628         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1629                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1630                 goto discard_and_relse;
1631         }
1632
1633         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1634                 goto discard_and_relse;
1635
1636         if (tcp_v4_inbound_md5_hash(sk, skb))
1637                 goto discard_and_relse;
1638
1639         nf_reset(skb);
1640
1641         if (sk_filter(sk, skb))
1642                 goto discard_and_relse;
1643
1644         skb->dev = NULL;
1645
1646         if (sk->sk_state == TCP_LISTEN) {
1647                 ret = tcp_v4_do_rcv(sk, skb);
1648                 goto put_and_return;
1649         }
1650
1651         sk_incoming_cpu_update(sk);
1652
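        /* If a user task owns the socket, queue the segment on the backlog
         * (bounded by rcvbuf + sndbuf) to be replayed at release_sock()
         * time; otherwise try the prequeue first and fall back to direct
         * receive processing.
         */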
1653         bh_lock_sock_nested(sk);
1654         tcp_segs_in(tcp_sk(sk), skb);
1655         ret = 0;
1656         if (!sock_owned_by_user(sk)) {
1657                 if (!tcp_prequeue(sk, skb))
1658                         ret = tcp_v4_do_rcv(sk, skb);
1659         } else if (unlikely(sk_add_backlog(sk, skb,
1660                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1661                 bh_unlock_sock(sk);
1662                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1663                 goto discard_and_relse;
1664         }
1665         bh_unlock_sock(sk);
1666
1667 put_and_return:
1668         sock_put(sk);
1669
1670         return ret;
1671
1672 no_tcp_socket:
1673         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1674                 goto discard_it;
1675
1676         if (tcp_checksum_complete(skb)) {
1677 csum_error:
1678                 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1679 bad_packet:
1680                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1681         } else {
1682                 tcp_v4_send_reset(NULL, skb);
1683         }
1684
1685 discard_it:
1686         /* Discard frame. */
1687         kfree_skb(skb);
1688         return 0;
1689
1690 discard_and_relse:
1691         sock_put(sk);
1692         goto discard_it;
1693
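/* TIME_WAIT handling.  A SYN that tcp_timewait_state_process() classifies
 * as TCP_TW_SYN may legitimately reuse the old 4-tuple, so we look for a
 * current listener and, if one exists, retire the timewait socket and
 * process the SYN against that listener instead.
 */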
1694 do_time_wait:
1695         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1696                 inet_twsk_put(inet_twsk(sk));
1697                 goto discard_it;
1698         }
1699
1700         if (tcp_checksum_complete(skb)) {
1701                 inet_twsk_put(inet_twsk(sk));
1702                 goto csum_error;
1703         }
1704         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1705         case TCP_TW_SYN: {
1706                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1707                                                         &tcp_hashinfo, skb,
1708                                                         __tcp_hdrlen(th),
1709                                                         iph->saddr, th->source,
1710                                                         iph->daddr, th->dest,
1711                                                         inet_iif(skb));
1712                 if (sk2) {
1713                         inet_twsk_deschedule_put(inet_twsk(sk));
1714                         sk = sk2;
1715                         goto process;
1716                 }
1717                 /* Fall through to ACK */
1718         }
1719         case TCP_TW_ACK:
1720                 tcp_v4_timewait_ack(sk, skb);
1721                 break;
1722         case TCP_TW_RST:
1723                 tcp_v4_send_reset(sk, skb);
1724                 inet_twsk_deschedule_put(inet_twsk(sk));
1725                 goto discard_it;
1726         case TCP_TW_SUCCESS:;
1727         }
1728         goto discard_it;
1729 }
1730
1731 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1732         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1733         .twsk_unique    = tcp_twsk_unique,
1734         .twsk_destructor = tcp_twsk_destructor,
1735 };
1736
1737 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1738 {
1739         struct dst_entry *dst = skb_dst(skb);
1740
1741         if (dst && dst_hold_safe(dst)) {
1742                 sk->sk_rx_dst = dst;
1743                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1744         }
1745 }
1746 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1747
1748 const struct inet_connection_sock_af_ops ipv4_specific = {
1749         .queue_xmit        = ip_queue_xmit,
1750         .send_check        = tcp_v4_send_check,
1751         .rebuild_header    = inet_sk_rebuild_header,
1752         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1753         .conn_request      = tcp_v4_conn_request,
1754         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1755         .net_header_len    = sizeof(struct iphdr),
1756         .setsockopt        = ip_setsockopt,
1757         .getsockopt        = ip_getsockopt,
1758         .addr2sockaddr     = inet_csk_addr2sockaddr,
1759         .sockaddr_len      = sizeof(struct sockaddr_in),
1760         .bind_conflict     = inet_csk_bind_conflict,
1761 #ifdef CONFIG_COMPAT
1762         .compat_setsockopt = compat_ip_setsockopt,
1763         .compat_getsockopt = compat_ip_getsockopt,
1764 #endif
1765         .mtu_reduced       = tcp_v4_mtu_reduced,
1766 };
1767 EXPORT_SYMBOL(ipv4_specific);
1768
1769 #ifdef CONFIG_TCP_MD5SIG
1770 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1771         .md5_lookup             = tcp_v4_md5_lookup,
1772         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1773         .md5_parse              = tcp_v4_parse_md5_keys,
1774 };
1775 #endif
1776
1777 /* NOTE: A lot of things are set to zero explicitly by the call to
1778  *       sk_alloc(), so they need not be done here.
1779  */
1780 static int tcp_v4_init_sock(struct sock *sk)
1781 {
1782         struct inet_connection_sock *icsk = inet_csk(sk);
1783
1784         tcp_init_sock(sk);
1785
1786         icsk->icsk_af_ops = &ipv4_specific;
1787
1788 #ifdef CONFIG_TCP_MD5SIG
1789         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1790 #endif
1791
1792         return 0;
1793 }
1794
1795 void tcp_v4_destroy_sock(struct sock *sk)
1796 {
1797         struct tcp_sock *tp = tcp_sk(sk);
1798
1799         tcp_clear_xmit_timers(sk);
1800
1801         tcp_cleanup_congestion_control(sk);
1802
1803         /* Clean up the write buffer. */
1804         tcp_write_queue_purge(sk);
1805
1806         /* Cleans up our, hopefully empty, out_of_order_queue. */
1807         __skb_queue_purge(&tp->out_of_order_queue);
1808
1809 #ifdef CONFIG_TCP_MD5SIG
1810         /* Clean up the MD5 key list, if any */
1811         if (tp->md5sig_info) {
1812                 tcp_clear_md5_list(sk);
1813                 kfree_rcu(tp->md5sig_info, rcu);
1814                 tp->md5sig_info = NULL;
1815         }
1816 #endif
1817
1818         /* Clean up the prequeue; it really should be empty. */
1819         __skb_queue_purge(&tp->ucopy.prequeue);
1820
1821         /* Clean up a referenced TCP bind bucket. */
1822         if (inet_csk(sk)->icsk_bind_hash)
1823                 inet_put_port(sk);
1824
1825         BUG_ON(tp->fastopen_rsk);
1826
1827         /* If socket is aborted during connect operation */
1828         tcp_free_fastopen_req(tp);
1829         tcp_saved_syn_free(tp);
1830
1831         sk_sockets_allocated_dec(sk);
1832
1833         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1834                 sock_release_memcg(sk);
1835 }
1836 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1837
1838 #ifdef CONFIG_PROC_FS
1839 /* Proc filesystem TCP sock list dumping. */
1840
1841 /*
1842  * Get the next listener socket after cur.  If cur is NULL, get the first
1843  * socket, starting from the bucket given in st->bucket; when st->bucket is
1844  * zero the very first socket in the hash table is returned.
1845  */
1846 static void *listening_get_next(struct seq_file *seq, void *cur)
1847 {
1849         struct hlist_nulls_node *node;
1850         struct sock *sk = cur;
1851         struct inet_listen_hashbucket *ilb;
1852         struct tcp_iter_state *st = seq->private;
1853         struct net *net = seq_file_net(seq);
1854
1855         if (!sk) {
1856                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1857                 spin_lock_bh(&ilb->lock);
1858                 sk = sk_nulls_head(&ilb->head);
1859                 st->offset = 0;
1860                 goto get_sk;
1861         }
1862         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1863         ++st->num;
1864         ++st->offset;
1865
1866         sk = sk_nulls_next(sk);
1867 get_sk:
1868         sk_nulls_for_each_from(sk, node) {
1869                 if (!net_eq(sock_net(sk), net))
1870                         continue;
1871                 if (sk->sk_family == st->family) {
1872                         cur = sk;
1873                         goto out;
1874                 }
1876         }
1877         spin_unlock_bh(&ilb->lock);
1878         st->offset = 0;
1879         if (++st->bucket < INET_LHTABLE_SIZE) {
1880                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1881                 spin_lock_bh(&ilb->lock);
1882                 sk = sk_nulls_head(&ilb->head);
1883                 goto get_sk;
1884         }
1885         cur = NULL;
1886 out:
1887         return cur;
1888 }
1889
1890 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1891 {
1892         struct tcp_iter_state *st = seq->private;
1893         void *rc;
1894
1895         st->bucket = 0;
1896         st->offset = 0;
1897         rc = listening_get_next(seq, NULL);
1898
1899         while (rc && *pos) {
1900                 rc = listening_get_next(seq, rc);
1901                 --*pos;
1902         }
1903         return rc;
1904 }
1905
1906 static inline bool empty_bucket(const struct tcp_iter_state *st)
1907 {
1908         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1909 }
1910
1911 /*
1912  * Get the first established socket, starting from the bucket given in st->bucket.
1913  * If st->bucket is zero, the very first socket in the hash is returned.
1914  */
1915 static void *established_get_first(struct seq_file *seq)
1916 {
1917         struct tcp_iter_state *st = seq->private;
1918         struct net *net = seq_file_net(seq);
1919         void *rc = NULL;
1920
1921         st->offset = 0;
1922         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1923                 struct sock *sk;
1924                 struct hlist_nulls_node *node;
1925                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1926
1927                 /* Lockless fast path for the common case of empty buckets */
1928                 if (empty_bucket(st))
1929                         continue;
1930
1931                 spin_lock_bh(lock);
1932                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1933                         if (sk->sk_family != st->family ||
1934                             !net_eq(sock_net(sk), net)) {
1935                                 continue;
1936                         }
1937                         rc = sk;
1938                         goto out;
1939                 }
1940                 spin_unlock_bh(lock);
1941         }
1942 out:
1943         return rc;
1944 }
1945
1946 static void *established_get_next(struct seq_file *seq, void *cur)
1947 {
1948         struct sock *sk = cur;
1949         struct hlist_nulls_node *node;
1950         struct tcp_iter_state *st = seq->private;
1951         struct net *net = seq_file_net(seq);
1952
1953         ++st->num;
1954         ++st->offset;
1955
1956         sk = sk_nulls_next(sk);
1957
1958         sk_nulls_for_each_from(sk, node) {
1959                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1960                         return sk;
1961         }
1962
1963         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1964         ++st->bucket;
1965         return established_get_first(seq);
1966 }
1967
1968 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1969 {
1970         struct tcp_iter_state *st = seq->private;
1971         void *rc;
1972
1973         st->bucket = 0;
1974         rc = established_get_first(seq);
1975
1976         while (rc && pos) {
1977                 rc = established_get_next(seq, rc);
1978                 --pos;
1979         }
1980         return rc;
1981 }
1982
1983 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1984 {
1985         void *rc;
1986         struct tcp_iter_state *st = seq->private;
1987
1988         st->state = TCP_SEQ_STATE_LISTENING;
1989         rc        = listening_get_idx(seq, &pos);
1990
1991         if (!rc) {
1992                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1993                 rc        = established_get_idx(seq, pos);
1994         }
1995
1996         return rc;
1997 }
1998
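/* Resume iteration from the position cached in st->bucket and st->offset,
 * so sequential reads of the seq_file do not rescan the hash tables from
 * the beginning on every chunk.
 */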
1999 static void *tcp_seek_last_pos(struct seq_file *seq)
2000 {
2001         struct tcp_iter_state *st = seq->private;
2002         int offset = st->offset;
2003         int orig_num = st->num;
2004         void *rc = NULL;
2005
2006         switch (st->state) {
2007         case TCP_SEQ_STATE_LISTENING:
2008                 if (st->bucket >= INET_LHTABLE_SIZE)
2009                         break;
2010                 st->state = TCP_SEQ_STATE_LISTENING;
2011                 rc = listening_get_next(seq, NULL);
2012                 while (offset-- && rc)
2013                         rc = listening_get_next(seq, rc);
2014                 if (rc)
2015                         break;
2016                 st->bucket = 0;
2017                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2018                 /* Fallthrough */
2019         case TCP_SEQ_STATE_ESTABLISHED:
2020                 if (st->bucket > tcp_hashinfo.ehash_mask)
2021                         break;
2022                 rc = established_get_first(seq);
2023                 while (offset-- && rc)
2024                         rc = established_get_next(seq, rc);
2025         }
2026
2027         st->num = orig_num;
2028
2029         return rc;
2030 }
2031
2032 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2033 {
2034         struct tcp_iter_state *st = seq->private;
2035         void *rc;
2036
2037         if (*pos && *pos == st->last_pos) {
2038                 rc = tcp_seek_last_pos(seq);
2039                 if (rc)
2040                         goto out;
2041         }
2042
2043         st->state = TCP_SEQ_STATE_LISTENING;
2044         st->num = 0;
2045         st->bucket = 0;
2046         st->offset = 0;
2047         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2048
2049 out:
2050         st->last_pos = *pos;
2051         return rc;
2052 }
2053
2054 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2055 {
2056         struct tcp_iter_state *st = seq->private;
2057         void *rc = NULL;
2058
2059         if (v == SEQ_START_TOKEN) {
2060                 rc = tcp_get_idx(seq, 0);
2061                 goto out;
2062         }
2063
2064         switch (st->state) {
2065         case TCP_SEQ_STATE_LISTENING:
2066                 rc = listening_get_next(seq, v);
2067                 if (!rc) {
2068                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2069                         st->bucket = 0;
2070                         st->offset = 0;
2071                         rc        = established_get_first(seq);
2072                 }
2073                 break;
2074         case TCP_SEQ_STATE_ESTABLISHED:
2075                 rc = established_get_next(seq, v);
2076                 break;
2077         }
2078 out:
2079         ++*pos;
2080         st->last_pos = *pos;
2081         return rc;
2082 }
2083
2084 static void tcp_seq_stop(struct seq_file *seq, void *v)
2085 {
2086         struct tcp_iter_state *st = seq->private;
2087
2088         switch (st->state) {
2089         case TCP_SEQ_STATE_LISTENING:
2090                 if (v != SEQ_START_TOKEN)
2091                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2092                 break;
2093         case TCP_SEQ_STATE_ESTABLISHED:
2094                 if (v)
2095                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2096                 break;
2097         }
2098 }
2099
2100 int tcp_seq_open(struct inode *inode, struct file *file)
2101 {
2102         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2103         struct tcp_iter_state *s;
2104         int err;
2105
2106         err = seq_open_net(inode, file, &afinfo->seq_ops,
2107                           sizeof(struct tcp_iter_state));
2108         if (err < 0)
2109                 return err;
2110
2111         s = ((struct seq_file *)file->private_data)->private;
2112         s->family               = afinfo->family;
2113         s->last_pos             = 0;
2114         return 0;
2115 }
2116 EXPORT_SYMBOL(tcp_seq_open);
2117
2118 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2119 {
2120         int rc = 0;
2121         struct proc_dir_entry *p;
2122
2123         afinfo->seq_ops.start           = tcp_seq_start;
2124         afinfo->seq_ops.next            = tcp_seq_next;
2125         afinfo->seq_ops.stop            = tcp_seq_stop;
2126
2127         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2128                              afinfo->seq_fops, afinfo);
2129         if (!p)
2130                 rc = -ENOMEM;
2131         return rc;
2132 }
2133 EXPORT_SYMBOL(tcp_proc_register);
2134
2135 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2136 {
2137         remove_proc_entry(afinfo->name, net->proc_net);
2138 }
2139 EXPORT_SYMBOL(tcp_proc_unregister);
2140
2141 static void get_openreq4(const struct request_sock *req,
2142                          struct seq_file *f, int i)
2143 {
2144         const struct inet_request_sock *ireq = inet_rsk(req);
2145         long delta = req->rsk_timer.expires - jiffies;
2146
2147         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2148                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2149                 i,
2150                 ireq->ir_loc_addr,
2151                 ireq->ir_num,
2152                 ireq->ir_rmt_addr,
2153                 ntohs(ireq->ir_rmt_port),
2154                 TCP_SYN_RECV,
2155                 0, 0, /* could print option size, but that is af dependent. */
2156                 1,    /* timers active (only the expire timer) */
2157                 jiffies_delta_to_clock_t(delta),
2158                 req->num_timeout,
2159                 from_kuid_munged(seq_user_ns(f),
2160                                  sock_i_uid(req->rsk_listener)),
2161                 0,  /* non standard timer */
2162                 0, /* open_requests have no inode */
2163                 0,
2164                 req);
2165 }
2166
2167 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2168 {
2169         int timer_active;
2170         unsigned long timer_expires;
2171         const struct tcp_sock *tp = tcp_sk(sk);
2172         const struct inet_connection_sock *icsk = inet_csk(sk);
2173         const struct inet_sock *inet = inet_sk(sk);
2174         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2175         __be32 dest = inet->inet_daddr;
2176         __be32 src = inet->inet_rcv_saddr;
2177         __u16 destp = ntohs(inet->inet_dport);
2178         __u16 srcp = ntohs(inet->inet_sport);
2179         int rx_queue;
2180         int state;
2181
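        /* timer_active codes emitted in the /proc output: 1 retransmit or
         * loss-probe timer, 2 keepalive timer, 4 zero-window probe timer,
         * 0 nothing pending (3 marks TIME_WAIT, see get_timewait4_sock()).
         */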
2182         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2183             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2184             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2185                 timer_active    = 1;
2186                 timer_expires   = icsk->icsk_timeout;
2187         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2188                 timer_active    = 4;
2189                 timer_expires   = icsk->icsk_timeout;
2190         } else if (timer_pending(&sk->sk_timer)) {
2191                 timer_active    = 2;
2192                 timer_expires   = sk->sk_timer.expires;
2193         } else {
2194                 timer_active    = 0;
2195                 timer_expires = jiffies;
2196         }
2197
2198         state = sk_state_load(sk);
2199         if (state == TCP_LISTEN)
2200                 rx_queue = sk->sk_ack_backlog;
2201         else
2202                 /* Because we don't lock the socket,
2203                  * we might find a transient negative value.
2204                  */
2205                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2206
2207         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2208                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2209                 i, src, srcp, dest, destp, state,
2210                 tp->write_seq - tp->snd_una,
2211                 rx_queue,
2212                 timer_active,
2213                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2214                 icsk->icsk_retransmits,
2215                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2216                 icsk->icsk_probes_out,
2217                 sock_i_ino(sk),
2218                 atomic_read(&sk->sk_refcnt), sk,
2219                 jiffies_to_clock_t(icsk->icsk_rto),
2220                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2221                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2222                 tp->snd_cwnd,
2223                 state == TCP_LISTEN ?
2224                     fastopenq->max_qlen :
2225                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2226 }
2227
2228 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2229                                struct seq_file *f, int i)
2230 {
2231         long delta = tw->tw_timer.expires - jiffies;
2232         __be32 dest, src;
2233         __u16 destp, srcp;
2234
2235         dest  = tw->tw_daddr;
2236         src   = tw->tw_rcv_saddr;
2237         destp = ntohs(tw->tw_dport);
2238         srcp  = ntohs(tw->tw_sport);
2239
2240         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2241                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2242                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2243                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2244                 atomic_read(&tw->tw_refcnt), tw);
2245 }
2246
2247 #define TMPSZ 150
2248
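/* Illustrative /proc/net/tcp entry for a socket listening on
 * 127.0.0.1:22 on a little-endian host (values are hypothetical;
 * addresses and ports are printed as raw hex words):
 *
 *   sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345
 */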
2249 static int tcp4_seq_show(struct seq_file *seq, void *v)
2250 {
2251         struct tcp_iter_state *st;
2252         struct sock *sk = v;
2253
2254         seq_setwidth(seq, TMPSZ - 1);
2255         if (v == SEQ_START_TOKEN) {
2256                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2257                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2258                            "inode");
2259                 goto out;
2260         }
2261         st = seq->private;
2262
2263         if (sk->sk_state == TCP_TIME_WAIT)
2264                 get_timewait4_sock(v, seq, st->num);
2265         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2266                 get_openreq4(v, seq, st->num);
2267         else
2268                 get_tcp4_sock(v, seq, st->num);
2269 out:
2270         seq_pad(seq, '\n');
2271         return 0;
2272 }
2273
2274 static const struct file_operations tcp_afinfo_seq_fops = {
2275         .owner   = THIS_MODULE,
2276         .open    = tcp_seq_open,
2277         .read    = seq_read,
2278         .llseek  = seq_lseek,
2279         .release = seq_release_net
2280 };
2281
2282 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2283         .name           = "tcp",
2284         .family         = AF_INET,
2285         .seq_fops       = &tcp_afinfo_seq_fops,
2286         .seq_ops        = {
2287                 .show           = tcp4_seq_show,
2288         },
2289 };
2290
2291 static int __net_init tcp4_proc_init_net(struct net *net)
2292 {
2293         return tcp_proc_register(net, &tcp4_seq_afinfo);
2294 }
2295
2296 static void __net_exit tcp4_proc_exit_net(struct net *net)
2297 {
2298         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2299 }
2300
2301 static struct pernet_operations tcp4_net_ops = {
2302         .init = tcp4_proc_init_net,
2303         .exit = tcp4_proc_exit_net,
2304 };
2305
2306 int __init tcp4_proc_init(void)
2307 {
2308         return register_pernet_subsys(&tcp4_net_ops);
2309 }
2310
2311 void tcp4_proc_exit(void)
2312 {
2313         unregister_pernet_subsys(&tcp4_net_ops);
2314 }
2315 #endif /* CONFIG_PROC_FS */
2316
2317 struct proto tcp_prot = {
2318         .name                   = "TCP",
2319         .owner                  = THIS_MODULE,
2320         .close                  = tcp_close,
2321         .connect                = tcp_v4_connect,
2322         .disconnect             = tcp_disconnect,
2323         .accept                 = inet_csk_accept,
2324         .ioctl                  = tcp_ioctl,
2325         .init                   = tcp_v4_init_sock,
2326         .destroy                = tcp_v4_destroy_sock,
2327         .shutdown               = tcp_shutdown,
2328         .setsockopt             = tcp_setsockopt,
2329         .getsockopt             = tcp_getsockopt,
2330         .recvmsg                = tcp_recvmsg,
2331         .sendmsg                = tcp_sendmsg,
2332         .sendpage               = tcp_sendpage,
2333         .backlog_rcv            = tcp_v4_do_rcv,
2334         .release_cb             = tcp_release_cb,
2335         .hash                   = inet_hash,
2336         .unhash                 = inet_unhash,
2337         .get_port               = inet_csk_get_port,
2338         .enter_memory_pressure  = tcp_enter_memory_pressure,
2339         .stream_memory_free     = tcp_stream_memory_free,
2340         .sockets_allocated      = &tcp_sockets_allocated,
2341         .orphan_count           = &tcp_orphan_count,
2342         .memory_allocated       = &tcp_memory_allocated,
2343         .memory_pressure        = &tcp_memory_pressure,
2344         .sysctl_mem             = sysctl_tcp_mem,
2345         .sysctl_wmem            = sysctl_tcp_wmem,
2346         .sysctl_rmem            = sysctl_tcp_rmem,
2347         .max_header             = MAX_TCP_HEADER,
2348         .obj_size               = sizeof(struct tcp_sock),
2349         .slab_flags             = SLAB_DESTROY_BY_RCU,
2350         .twsk_prot              = &tcp_timewait_sock_ops,
2351         .rsk_prot               = &tcp_request_sock_ops,
2352         .h.hashinfo             = &tcp_hashinfo,
2353         .no_autobind            = true,
2354 #ifdef CONFIG_COMPAT
2355         .compat_setsockopt      = compat_tcp_setsockopt,
2356         .compat_getsockopt      = compat_tcp_getsockopt,
2357 #endif
2358         .diag_destroy           = tcp_abort,
2359 };
2360 EXPORT_SYMBOL(tcp_prot);
2361
2362 static void __net_exit tcp_sk_exit(struct net *net)
2363 {
2364         int cpu;
2365
2366         for_each_possible_cpu(cpu)
2367                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2368         free_percpu(net->ipv4.tcp_sk);
2369 }
2370
2371 static int __net_init tcp_sk_init(struct net *net)
2372 {
2373         int res, cpu;
2374
2375         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2376         if (!net->ipv4.tcp_sk)
2377                 return -ENOMEM;
2378
2379         for_each_possible_cpu(cpu) {
2380                 struct sock *sk;
2381
2382                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2383                                            IPPROTO_TCP, net);
2384                 if (res)
2385                         goto fail;
2386                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2387         }
2388
2389         net->ipv4.sysctl_tcp_ecn = 2;
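        /* Per-netns defaults.  tcp_ecn == 2 means ECN is enabled when an
         * incoming connection requests it, but is never requested on
         * outgoing connections.
         */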
2390         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2391
2392         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2393         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2394         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2395
2396         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2397         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2398         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2399
2400         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2401         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2402         net->ipv4.sysctl_tcp_syncookies = 1;
2403         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2404         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2405         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2406         net->ipv4.sysctl_tcp_orphan_retries = 0;
2407         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2408         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2409
2410         return 0;
2411 fail:
2412         tcp_sk_exit(net);
2413
2414         return res;
2415 }
2416
2417 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2418 {
2419         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2420 }
2421
2422 static struct pernet_operations __net_initdata tcp_sk_ops = {
2423        .init       = tcp_sk_init,
2424        .exit       = tcp_sk_exit,
2425        .exit_batch = tcp_sk_exit_batch,
2426 };
2427
2428 void __init tcp_v4_init(void)
2429 {
2430         inet_hashinfo_init(&tcp_hashinfo);
2431         if (register_pernet_subsys(&tcp_sk_ops))
2432                 panic("Failed to create the TCP control socket.\n");
2433 }