Merge tag 'scsi-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi
[linux-2.6-microblaze.git] / net / netfilter / nf_conntrack_proto_tcp.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* (C) 1999-2001 Paul `Rusty' Russell
3  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
4  * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
5  * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
6  */
7
8 #include <linux/types.h>
9 #include <linux/timer.h>
10 #include <linux/module.h>
11 #include <linux/in.h>
12 #include <linux/tcp.h>
13 #include <linux/spinlock.h>
14 #include <linux/skbuff.h>
15 #include <linux/ipv6.h>
16 #include <net/ip6_checksum.h>
17 #include <asm/unaligned.h>
18
19 #include <net/tcp.h>
20
21 #include <linux/netfilter.h>
22 #include <linux/netfilter_ipv4.h>
23 #include <linux/netfilter_ipv6.h>
24 #include <net/netfilter/nf_conntrack.h>
25 #include <net/netfilter/nf_conntrack_l4proto.h>
26 #include <net/netfilter/nf_conntrack_ecache.h>
27 #include <net/netfilter/nf_conntrack_seqadj.h>
28 #include <net/netfilter/nf_conntrack_synproxy.h>
29 #include <net/netfilter/nf_conntrack_timeout.h>
30 #include <net/netfilter/nf_log.h>
31 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
32 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
33
34   /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
35      closely.  They're more complex. --RR */
36
37 static const char *const tcp_conntrack_names[] = {
38         "NONE",
39         "SYN_SENT",
40         "SYN_RECV",
41         "ESTABLISHED",
42         "FIN_WAIT",
43         "CLOSE_WAIT",
44         "LAST_ACK",
45         "TIME_WAIT",
46         "CLOSE",
47         "SYN_SENT2",
48 };
49
/*
 * Postfix time-unit helpers. Each expands to a multiplication, so that
 * "2 MINS" becomes "2 * 60 * HZ" and a timeout can be written as a number
 * followed by its unit. They are only meaningful directly after a number.
 */
#define SECS * HZ
#define MINS * 60 SECS
#define HOURS * 60 MINS
#define DAYS * 24 HOURS
54
/*
 * Built-in timeout per tracked state, plus the two extra slots
 * TCP_CONNTRACK_RETRANS and TCP_CONNTRACK_UNACK. Values are expressed as
 * multiples of HZ through the SECS/MINS/DAYS helpers.
 */
static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = {
        [TCP_CONNTRACK_SYN_SENT]        = 2 MINS,
        [TCP_CONNTRACK_SYN_RECV]        = 60 SECS,
        [TCP_CONNTRACK_ESTABLISHED]     = 5 DAYS,
        [TCP_CONNTRACK_FIN_WAIT]        = 2 MINS,
        [TCP_CONNTRACK_CLOSE_WAIT]      = 60 SECS,
        [TCP_CONNTRACK_LAST_ACK]        = 30 SECS,
        [TCP_CONNTRACK_TIME_WAIT]       = 2 MINS,
        [TCP_CONNTRACK_CLOSE]           = 10 SECS,
        [TCP_CONNTRACK_SYN_SENT2]       = 2 MINS,
/* RFC1122 says the R2 limit should be at least 100 seconds.
   Linux uses 15 packets as limit, which corresponds
   to ~13-30min depending on RTO. */
        [TCP_CONNTRACK_RETRANS]         = 5 MINS,
        [TCP_CONNTRACK_UNACK]           = 5 MINS,
};
71
/*
 * Two-letter shorthands for the conntrack states, used to keep the
 * transition table readable. sIV (mapped to TCP_CONNTRACK_MAX) marks an
 * invalid transition and sIG (TCP_CONNTRACK_IGNORE) an ignored one.
 */
#define sNO TCP_CONNTRACK_NONE
#define sSS TCP_CONNTRACK_SYN_SENT
#define sSR TCP_CONNTRACK_SYN_RECV
#define sES TCP_CONNTRACK_ESTABLISHED
#define sFW TCP_CONNTRACK_FIN_WAIT
#define sCW TCP_CONNTRACK_CLOSE_WAIT
#define sLA TCP_CONNTRACK_LAST_ACK
#define sTW TCP_CONNTRACK_TIME_WAIT
#define sCL TCP_CONNTRACK_CLOSE
#define sS2 TCP_CONNTRACK_SYN_SENT2
#define sIV TCP_CONNTRACK_MAX
#define sIG TCP_CONNTRACK_IGNORE
84
/*
 * What TCP flags are set from RST/SYN/FIN/ACK.
 *
 * The enumerator order matches the row order of the tcp_conntracks
 * table (syn, synack, fin, ack, rst, none).
 */
enum tcp_bit_set {
        TCP_SYN_SET,
        TCP_SYNACK_SET,
        TCP_FIN_SET,
        TCP_ACK_SET,
        TCP_RST_SET,
        TCP_NONE_SET,
};
94
/*
 * The TCP state transition table needs a few words...
 *
 * We are the man in the middle. All the packets go through us
 * but might get lost in transit to the destination.
 * It is assumed that the destinations can't receive segments
 * we haven't seen.
 *
 * The checked segment is in window, but our windows are *not*
 * equivalent with the ones of the sender/receiver. We always
 * try to guess the state of the current sender.
 *
 * The meaning of the states are:
 *
 * NONE:        initial state
 * SYN_SENT:    SYN-only packet seen
 * SYN_SENT2:   SYN-only packet seen from reply dir, simultaneous open
 * SYN_RECV:    SYN-ACK packet seen
 * ESTABLISHED: ACK packet seen
 * FIN_WAIT:    FIN packet seen
 * CLOSE_WAIT:  ACK seen (after FIN)
 * LAST_ACK:    FIN seen (after FIN)
 * TIME_WAIT:   last ACK seen
 * CLOSE:       closed connection (RST)
 *
 * Packets marked as IGNORED (sIG):
 *      if they may be either invalid or valid
 *      and the receiver may send back a connection
 *      closing RST or a SYN/ACK.
 *
 * Packets marked as INVALID (sIV):
 *      if we regard them as truly invalid packets
 *
 * Layout: the first index is the packet direction (original, reply),
 * the second the segment type in enum tcp_bit_set order, the third the
 * current state. Each cell holds the resulting state.
 */
static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
        {
/* ORIGINAL */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*syn*/    { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 },
/*
 *      sNO -> sSS      Initialize a new connection
 *      sSS -> sSS      Retransmitted SYN
 *      sS2 -> sS2      Late retransmitted SYN
 *      sSR -> sIG
 *      sES -> sIG      Error: SYNs in window outside the SYN_SENT state
 *                      are errors. Receiver will reply with RST
 *                      and close the connection.
 *                      Or we are not in sync and hold a dead connection.
 *      sFW -> sIG
 *      sCW -> sIG
 *      sLA -> sIG
 *      sTW -> sSS      Reopened connection (RFC 1122).
 *      sCL -> sSS
 */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR },
/*
 *      sNO -> sIV      Too late and no reason to do anything
 *      sSS -> sIV      Client can't send SYN and then SYN/ACK
 *      sS2 -> sSR      SYN/ACK sent to SYN2 in simultaneous open
 *      sSR -> sSR      Late retransmitted SYN/ACK in simultaneous open
 *      sES -> sIV      Invalid SYN/ACK packets sent by the client
 *      sFW -> sIV
 *      sCW -> sIV
 *      sLA -> sIV
 *      sTW -> sIV
 *      sCL -> sIV
 */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
/*
 *      sNO -> sIV      Too late and no reason to do anything...
 *      sSS -> sIV      Client might not send FIN in this state:
 *                      we enforce waiting for a SYN/ACK reply first.
 *      sS2 -> sIV
 *      sSR -> sFW      Close started.
 *      sES -> sFW
 *      sFW -> sLA      FIN seen in both directions, waiting for
 *                      the last ACK.
 *                      Might be a retransmitted FIN as well...
 *      sCW -> sLA
 *      sLA -> sLA      Retransmitted FIN. Remain in the same state.
 *      sTW -> sTW
 *      sCL -> sCL
 */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*ack*/    { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
/*
 *      sNO -> sES      Assumed.
 *      sSS -> sIV      ACK is invalid: we haven't seen a SYN/ACK yet.
 *      sS2 -> sIV
 *      sSR -> sES      Established state is reached.
 *      sES -> sES      :-)
 *      sFW -> sCW      Normal close request answered by ACK.
 *      sCW -> sCW
 *      sLA -> sTW      Last ACK detected (RFC5961 challenged)
 *      sTW -> sTW      Retransmitted last ACK. Remain in the same state.
 *      sCL -> sCL
 */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
/*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
        },
        {
/* REPLY */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*syn*/    { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sSS, sIV, sS2 },
/*
 *      sNO -> sIV      Never reached.
 *      sSS -> sS2      Simultaneous open
 *      sS2 -> sS2      Retransmitted simultaneous SYN
 *      sSR -> sIV      Invalid SYN packets sent by the server
 *      sES -> sIV
 *      sFW -> sIV
 *      sCW -> sIV
 *      sLA -> sIV
 *      sTW -> sSS      Reopened connection, but server may have switched role
 *      sCL -> sIV
 */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
/*
 *      sSS -> sSR      Standard open.
 *      sS2 -> sSR      Simultaneous open
 *      sSR -> sIG      Retransmitted SYN/ACK, ignore it.
 *      sES -> sIG      Late retransmitted SYN/ACK?
 *      sFW -> sIG      Might be SYN/ACK answering ignored SYN
 *      sCW -> sIG
 *      sLA -> sIG
 *      sTW -> sIG
 *      sCL -> sIG
 */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
/*
 *      sSS -> sIV      Server might not send FIN in this state.
 *      sS2 -> sIV
 *      sSR -> sFW      Close started.
 *      sES -> sFW
 *      sFW -> sLA      FIN seen in both directions.
 *      sCW -> sLA
 *      sLA -> sLA      Retransmitted FIN.
 *      sTW -> sTW
 *      sCL -> sCL
 */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*ack*/    { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG },
/*
 *      sSS -> sIG      Might be a half-open connection.
 *      sS2 -> sIG
 *      sSR -> sSR      Might answer late resent SYN.
 *      sES -> sES      :-)
 *      sFW -> sCW      Normal close request answered by ACK.
 *      sCW -> sCW
 *      sLA -> sTW      Last ACK detected (RFC5961 challenged)
 *      sTW -> sTW      Retransmitted last ACK.
 *      sCL -> sCL
 */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
/*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
        }
};
257
#ifdef CONFIG_NF_CONNTRACK_PROCFS
/*
 * Write the TCP state name of @ct, followed by a space, to @s.
 * Entries with IPS_OFFLOAD_BIT set produce no output.
 */
static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
{
        if (!test_bit(IPS_OFFLOAD_BIT, &ct->status))
                seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]);
}
#endif
268
269 static unsigned int get_conntrack_index(const struct tcphdr *tcph)
270 {
271         if (tcph->rst) return TCP_RST_SET;
272         else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
273         else if (tcph->fin) return TCP_FIN_SET;
274         else if (tcph->ack) return TCP_ACK_SET;
275         else return TCP_NONE_SET;
276 }
277
278 /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
279    in IP Filter' by Guido van Rooij.
280
281    http://www.sane.nl/events/sane2000/papers.html
282    http://www.darkart.com/mirrors/www.obfuscation.org/ipf/
283
284    The boundaries and the conditions are changed according to RFC793:
285    the packet must intersect the window (i.e. segments may be
286    after the right or before the left edge) and thus receivers may ACK
287    segments after the right edge of the window.
288
289         td_maxend = max(sack + max(win,1)) seen in reply packets
290         td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
291         td_maxwin += seq + len - sender.td_maxend
292                         if seq + len > sender.td_maxend
293         td_end    = max(seq + len) seen in sent packets
294
295    I.   Upper bound for valid data:     seq <= sender.td_maxend
296    II.  Lower bound for valid data:     seq + len >= sender.td_end - receiver.td_maxwin
297    III. Upper bound for valid (s)ack:   sack <= receiver.td_end
298    IV.  Lower bound for valid (s)ack:   sack >= receiver.td_end - MAXACKWINDOW
299
300    where sack is the highest right edge of sack block found in the packet
301    or ack in the case of packet without SACK option.
302
   The upper bound limit for a valid (s)ack is not ignored -
   we don't have to deal with fragments.
305 */
306
307 static inline __u32 segment_seq_plus_len(__u32 seq,
308                                          size_t len,
309                                          unsigned int dataoff,
310                                          const struct tcphdr *tcph)
311 {
312         /* XXX Should I use payload length field in IP/IPv6 header ?
313          * - YK */
314         return (seq + len - dataoff - tcph->doff*4
315                 + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
316 }
317
/* Fixme: what about big packets? */
/*
 * MAXACKWINDOW(sender) is how far below receiver.td_end an (s)ack may
 * lie and still be accepted (bound IV in tcp_in_window): the sender's
 * td_maxwin, but never less than MAXACKWINCONST.
 * Note that the macro evaluates its argument more than once.
 */
#define MAXACKWINCONST                  66000
#define MAXACKWINDOW(sender)                                            \
        ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin     \
                                              : MAXACKWINCONST)
323
324 /*
325  * Simplified tcp_parse_options routine from tcp_input.c
326  */
327 static void tcp_options(const struct sk_buff *skb,
328                         unsigned int dataoff,
329                         const struct tcphdr *tcph,
330                         struct ip_ct_tcp_state *state)
331 {
332         unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
333         const unsigned char *ptr;
334         int length = (tcph->doff*4) - sizeof(struct tcphdr);
335
336         if (!length)
337                 return;
338
339         ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
340                                  length, buff);
341         if (!ptr)
342                 return;
343
344         state->td_scale =
345         state->flags = 0;
346
347         while (length > 0) {
348                 int opcode=*ptr++;
349                 int opsize;
350
351                 switch (opcode) {
352                 case TCPOPT_EOL:
353                         return;
354                 case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
355                         length--;
356                         continue;
357                 default:
358                         if (length < 2)
359                                 return;
360                         opsize=*ptr++;
361                         if (opsize < 2) /* "silly options" */
362                                 return;
363                         if (opsize > length)
364                                 return; /* don't parse partial options */
365
366                         if (opcode == TCPOPT_SACK_PERM
367                             && opsize == TCPOLEN_SACK_PERM)
368                                 state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
369                         else if (opcode == TCPOPT_WINDOW
370                                  && opsize == TCPOLEN_WINDOW) {
371                                 state->td_scale = *(u_int8_t *)ptr;
372
373                                 if (state->td_scale > TCP_MAX_WSCALE)
374                                         state->td_scale = TCP_MAX_WSCALE;
375
376                                 state->flags |=
377                                         IP_CT_TCP_FLAG_WINDOW_SCALE;
378                         }
379                         ptr += opsize - 2;
380                         length -= opsize;
381                 }
382         }
383 }
384
385 static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
386                      const struct tcphdr *tcph, __u32 *sack)
387 {
388         unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
389         const unsigned char *ptr;
390         int length = (tcph->doff*4) - sizeof(struct tcphdr);
391         __u32 tmp;
392
393         if (!length)
394                 return;
395
396         ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
397                                  length, buff);
398         if (!ptr)
399                 return;
400
401         /* Fast path for timestamp-only option */
402         if (length == TCPOLEN_TSTAMP_ALIGNED
403             && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24)
404                                        | (TCPOPT_NOP << 16)
405                                        | (TCPOPT_TIMESTAMP << 8)
406                                        | TCPOLEN_TIMESTAMP))
407                 return;
408
409         while (length > 0) {
410                 int opcode = *ptr++;
411                 int opsize, i;
412
413                 switch (opcode) {
414                 case TCPOPT_EOL:
415                         return;
416                 case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
417                         length--;
418                         continue;
419                 default:
420                         if (length < 2)
421                                 return;
422                         opsize = *ptr++;
423                         if (opsize < 2) /* "silly options" */
424                                 return;
425                         if (opsize > length)
426                                 return; /* don't parse partial options */
427
428                         if (opcode == TCPOPT_SACK
429                             && opsize >= (TCPOLEN_SACK_BASE
430                                           + TCPOLEN_SACK_PERBLOCK)
431                             && !((opsize - TCPOLEN_SACK_BASE)
432                                  % TCPOLEN_SACK_PERBLOCK)) {
433                                 for (i = 0;
434                                      i < (opsize - TCPOLEN_SACK_BASE);
435                                      i += TCPOLEN_SACK_PERBLOCK) {
436                                         tmp = get_unaligned_be32((__be32 *)(ptr+i)+1);
437
438                                         if (after(tmp, *sack))
439                                                 *sack = tmp;
440                                 }
441                                 return;
442                         }
443                         ptr += opsize - 2;
444                         length -= opsize;
445                 }
446         }
447 }
448
449 static void tcp_init_sender(struct ip_ct_tcp_state *sender,
450                             struct ip_ct_tcp_state *receiver,
451                             const struct sk_buff *skb,
452                             unsigned int dataoff,
453                             const struct tcphdr *tcph,
454                             u32 end, u32 win)
455 {
456         /* SYN-ACK in reply to a SYN
457          * or SYN from reply direction in simultaneous open.
458          */
459         sender->td_end =
460         sender->td_maxend = end;
461         sender->td_maxwin = (win == 0 ? 1 : win);
462
463         tcp_options(skb, dataoff, tcph, sender);
464         /* RFC 1323:
465          * Both sides must send the Window Scale option
466          * to enable window scaling in either direction.
467          */
468         if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
469               receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) {
470                 sender->td_scale = 0;
471                 receiver->td_scale = 0;
472         }
473 }
474
/*
 * Check a segment against the tracked sequence/ack windows of @ct and,
 * when it fits, update the per-direction tracking state.
 *
 * @ct:         conntrack entry; ct->proto.tcp is read and updated
 * @dir:        direction of the packet; seen[dir] is treated as the
 *              sender and seen[!dir] as the receiver
 * @index:      enum tcp_bit_set classification of the segment; only
 *              TCP_ACK_SET segments take part in retransmission counting
 * @skb:        the packet
 * @dataoff:    offset of the TCP header inside @skb
 * @tcph:       the TCP header
 * @hook_state: hook state, passed on to the invalid-packet logger
 *
 * Returns true when the segment satisfies bounds I-IV described in the
 * comment above segment_seq_plus_len(), when the sender carries
 * IP_CT_TCP_FLAG_BE_LIBERAL or the per-net tcp_be_liberal setting is on,
 * or (early, before any bound is checked) for a SYN without ACK that
 * just (re)initialised the sender. Otherwise the failed bound is logged
 * and false is returned.
 */
static bool tcp_in_window(struct nf_conn *ct,
                          enum ip_conntrack_dir dir,
                          unsigned int index,
                          const struct sk_buff *skb,
                          unsigned int dataoff,
                          const struct tcphdr *tcph,
                          const struct nf_hook_state *hook_state)
{
        struct ip_ct_tcp *state = &ct->proto.tcp;
        struct net *net = nf_ct_net(ct);
        struct nf_tcp_net *tn = nf_tcp_pernet(net);
        struct ip_ct_tcp_state *sender = &state->seen[dir];
        struct ip_ct_tcp_state *receiver = &state->seen[!dir];
        const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
        __u32 seq, ack, sack, end, win, swin;
        u16 win_raw;
        s32 receiver_offset;
        bool res, in_recv_win;

        /*
         * Get the required data from the packet.
         */
        seq = ntohl(tcph->seq);
        ack = sack = ntohl(tcph->ack_seq);
        win_raw = ntohs(tcph->window);
        win = win_raw;
        end = segment_seq_plus_len(seq, skb->len, dataoff, tcph);

        /* sack starts out equal to ack and is only raised by SACK blocks */
        if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
                tcp_sack(skb, dataoff, tcph, &sack);

        /* Take into account NAT sequence number mangling */
        receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1);
        ack -= receiver_offset;
        sack -= receiver_offset;

        pr_debug("tcp_in_window: START\n");
        pr_debug("tcp_in_window: ");
        nf_ct_dump_tuple(tuple);
        pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
                 seq, ack, receiver_offset, sack, receiver_offset, win, end);
        pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
                 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
                 sender->td_end, sender->td_maxend, sender->td_maxwin,
                 sender->td_scale,
                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
                 receiver->td_scale);

        /* td_maxwin == 0 means this direction has not been initialised yet */
        if (sender->td_maxwin == 0) {
                /*
                 * Initialize sender data.
                 */
                if (tcph->syn) {
                        tcp_init_sender(sender, receiver,
                                        skb, dataoff, tcph,
                                        end, win);
                        if (!tcph->ack)
                                /* Simultaneous open */
                                return true;
                } else {
                        /*
                         * We are in the middle of a connection,
                         * its history is lost for us.
                         * Let's try to use the data from the packet.
                         */
                        sender->td_end = end;
                        swin = win << sender->td_scale;
                        sender->td_maxwin = (swin == 0 ? 1 : swin);
                        sender->td_maxend = end + sender->td_maxwin;
                        if (receiver->td_maxwin == 0) {
                                /* We haven't seen traffic in the other
                                 * direction yet but we have to tweak window
                                 * tracking to pass III and IV until that
                                 * happens.
                                 */
                                receiver->td_end = receiver->td_maxend = sack;
                        } else if (sack == receiver->td_end + 1) {
                                /* Likely a reply to a keepalive.
                                 * Needed for III.
                                 */
                                receiver->td_end++;
                        }

                }
        } else if (((state->state == TCP_CONNTRACK_SYN_SENT
                     && dir == IP_CT_DIR_ORIGINAL)
                   || (state->state == TCP_CONNTRACK_SYN_RECV
                     && dir == IP_CT_DIR_REPLY))
                   && after(end, sender->td_end)) {
                /*
                 * RFC 793: "if a TCP is reinitialized ... then it need
                 * not wait at all; it must only be sure to use sequence
                 * numbers larger than those recently used."
                 */
                sender->td_end =
                sender->td_maxend = end;
                sender->td_maxwin = (win == 0 ? 1 : win);

                tcp_options(skb, dataoff, tcph, sender);
        } else if (tcph->syn && dir == IP_CT_DIR_REPLY &&
                   state->state == TCP_CONNTRACK_SYN_SENT) {
                /* Retransmitted syn-ack, or syn (simultaneous open).
                 *
                 * Re-init state for this direction, just like for the first
                 * syn(-ack) reply, it might differ in seq, ack or tcp options.
                 */
                tcp_init_sender(sender, receiver,
                                skb, dataoff, tcph,
                                end, win);
                if (!tcph->ack)
                        return true;
        }

        if (!(tcph->ack)) {
                /*
                 * If there is no ACK, just pretend it was set and OK.
                 */
                ack = sack = receiver->td_end;
        } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
                    (TCP_FLAG_ACK|TCP_FLAG_RST))
                   && (ack == 0)) {
                /*
                 * Broken TCP stacks, that set ACK in RST packets as well
                 * with zero ack value.
                 */
                ack = sack = receiver->td_end;
        }

        if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)
                /*
                 * RST sent answering SYN.
                 */
                seq = end = sender->td_end;

        pr_debug("tcp_in_window: ");
        nf_ct_dump_tuple(tuple);
        pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
                 seq, ack, receiver_offset, sack, receiver_offset, win, end);
        pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
                 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
                 sender->td_end, sender->td_maxend, sender->td_maxwin,
                 sender->td_scale,
                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
                 receiver->td_scale);

        /* Is the ending sequence in the receive window (if available)? */
        in_recv_win = !receiver->td_maxwin ||
                      after(end, sender->td_end - receiver->td_maxwin - 1);

        pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
                 before(seq, sender->td_maxend + 1),
                 (in_recv_win ? 1 : 0),
                 before(sack, receiver->td_end + 1),
                 after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1));

        /* Bounds I, II, III and IV, in that order */
        if (before(seq, sender->td_maxend + 1) &&
            in_recv_win &&
            before(sack, receiver->td_end + 1) &&
            after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {
                /*
                 * Take into account window scaling (RFC 1323).
                 */
                if (!tcph->syn)
                        win <<= sender->td_scale;

                /*
                 * Update sender data.
                 */
                swin = win + (sack - ack);
                if (sender->td_maxwin < swin)
                        sender->td_maxwin = swin;
                if (after(end, sender->td_end)) {
                        sender->td_end = end;
                        sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
                }
                if (tcph->ack) {
                        /* Track the highest ack value seen from this sender */
                        if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
                                sender->td_maxack = ack;
                                sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
                        } else if (after(ack, sender->td_maxack))
                                sender->td_maxack = ack;
                }

                /*
                 * Update receiver data.
                 */
                if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
                        receiver->td_maxwin += end - sender->td_maxend;
                if (after(sack + win, receiver->td_maxend - 1)) {
                        receiver->td_maxend = sack + win;
                        if (win == 0)
                                receiver->td_maxend++;
                }
                if (ack == receiver->td_end)
                        receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;

                /*
                 * Check retransmissions.
                 */
                if (index == TCP_ACK_SET) {
                        /* Same dir/seq/ack/end/window as the last ACK: count it */
                        if (state->last_dir == dir
                            && state->last_seq == seq
                            && state->last_ack == ack
                            && state->last_end == end
                            && state->last_win == win_raw)
                                state->retrans++;
                        else {
                                state->last_dir = dir;
                                state->last_seq = seq;
                                state->last_ack = ack;
                                state->last_end = end;
                                state->last_win = win_raw;
                                state->retrans = 0;
                        }
                }
                res = true;
        } else {
                /* Out of window: accepted only in liberal mode, no state update */
                res = false;
                if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
                    tn->tcp_be_liberal)
                        res = true;
                if (!res) {
                        /* The nested ternary names the first bound (I..IV) that failed */
                        nf_ct_l4proto_log_invalid(skb, ct, hook_state,
                        "%s",
                        before(seq, sender->td_maxend + 1) ?
                        in_recv_win ?
                        before(sack, receiver->td_end + 1) ?
                        after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG"
                        : "ACK is under the lower bound (possible overly delayed ACK)"
                        : "ACK is over the upper bound (ACKed data not seen yet)"
                        : "SEQ is under the lower bound (already ACKed data retransmitted)"
                        : "SEQ is over the upper bound (over the window of the receiver)");
                }
        }

        pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u "
                 "receiver end=%u maxend=%u maxwin=%u\n",
                 res, sender->td_end, sender->td_maxend, sender->td_maxwin,
                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin);

        return res;
}
717
/*
 * table of valid flag combinations - PUSH, ECE and CWR are always valid
 *
 * Indexed by the TCP flag byte with PSH, ECE and CWR masked off (see
 * tcp_error()). A nonzero entry marks an accepted combination; every
 * combination not listed is zero-initialised and therefore rejected.
 */
static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
                                 TCPHDR_URG) + 1] =
{
        [TCPHDR_SYN]                            = 1,
        [TCPHDR_SYN|TCPHDR_URG]                 = 1,
        [TCPHDR_SYN|TCPHDR_ACK]                 = 1,
        [TCPHDR_RST]                            = 1,
        [TCPHDR_RST|TCPHDR_ACK]                 = 1,
        [TCPHDR_FIN|TCPHDR_ACK]                 = 1,
        [TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG]      = 1,
        [TCPHDR_ACK]                            = 1,
        [TCPHDR_ACK|TCPHDR_URG]                 = 1,
};
732
733 static void tcp_error_log(const struct sk_buff *skb,
734                           const struct nf_hook_state *state,
735                           const char *msg)
736 {
737         nf_l4proto_log_invalid(skb, state, IPPROTO_TCP, "%s", msg);
738 }
739
740 /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c.  */
741 static bool tcp_error(const struct tcphdr *th,
742                       struct sk_buff *skb,
743                       unsigned int dataoff,
744                       const struct nf_hook_state *state)
745 {
746         unsigned int tcplen = skb->len - dataoff;
747         u8 tcpflags;
748
749         /* Not whole TCP header or malformed packet */
750         if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
751                 tcp_error_log(skb, state, "truncated packet");
752                 return true;
753         }
754
755         /* Checksum invalid? Ignore.
756          * We skip checking packets on the outgoing path
757          * because the checksum is assumed to be correct.
758          */
759         /* FIXME: Source route IP option packets --RR */
760         if (state->net->ct.sysctl_checksum &&
761             state->hook == NF_INET_PRE_ROUTING &&
762             nf_checksum(skb, state->hook, dataoff, IPPROTO_TCP, state->pf)) {
763                 tcp_error_log(skb, state, "bad checksum");
764                 return true;
765         }
766
767         /* Check TCP flags. */
768         tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
769         if (!tcp_valid_flags[tcpflags]) {
770                 tcp_error_log(skb, state, "invalid tcp flag combination");
771                 return true;
772         }
773
774         return false;
775 }
776
777 static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
778                              unsigned int dataoff,
779                              const struct tcphdr *th)
780 {
781         enum tcp_conntrack new_state;
782         struct net *net = nf_ct_net(ct);
783         const struct nf_tcp_net *tn = nf_tcp_pernet(net);
784         const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
785         const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
786
787         /* Don't need lock here: this conntrack not in circulation yet */
788         new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
789
790         /* Invalid: delete conntrack */
791         if (new_state >= TCP_CONNTRACK_MAX) {
792                 pr_debug("nf_ct_tcp: invalid new deleting.\n");
793                 return false;
794         }
795
796         if (new_state == TCP_CONNTRACK_SYN_SENT) {
797                 memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
798                 /* SYN packet */
799                 ct->proto.tcp.seen[0].td_end =
800                         segment_seq_plus_len(ntohl(th->seq), skb->len,
801                                              dataoff, th);
802                 ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
803                 if (ct->proto.tcp.seen[0].td_maxwin == 0)
804                         ct->proto.tcp.seen[0].td_maxwin = 1;
805                 ct->proto.tcp.seen[0].td_maxend =
806                         ct->proto.tcp.seen[0].td_end;
807
808                 tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
809         } else if (tn->tcp_loose == 0) {
810                 /* Don't try to pick up connections. */
811                 return false;
812         } else {
813                 memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
814                 /*
815                  * We are in the middle of a connection,
816                  * its history is lost for us.
817                  * Let's try to use the data from the packet.
818                  */
819                 ct->proto.tcp.seen[0].td_end =
820                         segment_seq_plus_len(ntohl(th->seq), skb->len,
821                                              dataoff, th);
822                 ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
823                 if (ct->proto.tcp.seen[0].td_maxwin == 0)
824                         ct->proto.tcp.seen[0].td_maxwin = 1;
825                 ct->proto.tcp.seen[0].td_maxend =
826                         ct->proto.tcp.seen[0].td_end +
827                         ct->proto.tcp.seen[0].td_maxwin;
828
829                 /* We assume SACK and liberal window checking to handle
830                  * window scaling */
831                 ct->proto.tcp.seen[0].flags =
832                 ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
833                                               IP_CT_TCP_FLAG_BE_LIBERAL;
834         }
835
836         /* tcp_packet will set them */
837         ct->proto.tcp.last_index = TCP_NONE_SET;
838
839         pr_debug("%s: sender end=%u maxend=%u maxwin=%u scale=%i "
840                  "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
841                  __func__,
842                  sender->td_end, sender->td_maxend, sender->td_maxwin,
843                  sender->td_scale,
844                  receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
845                  receiver->td_scale);
846         return true;
847 }
848
849 static bool tcp_can_early_drop(const struct nf_conn *ct)
850 {
851         switch (ct->proto.tcp.state) {
852         case TCP_CONNTRACK_FIN_WAIT:
853         case TCP_CONNTRACK_LAST_ACK:
854         case TCP_CONNTRACK_TIME_WAIT:
855         case TCP_CONNTRACK_CLOSE:
856         case TCP_CONNTRACK_CLOSE_WAIT:
857                 return true;
858         default:
859                 break;
860         }
861
862         return false;
863 }
864
865 /* Track one TCP segment against @ct and return the verdict for the packet.
 *
 * @ct:      conntrack entry the packet was matched to
 * @skb:     the packet
 * @dataoff: offset of the TCP header inside @skb
 * @ctinfo:  conntrack info for the packet; its direction is derived from it
 * @state:   hook state, used for error checks and logging
 *
 * Returns NF_ACCEPT for accepted packets (including packets that are
 * deliberately ignored without changing the tracked state), -NF_ACCEPT for
 * invalid packets, and -NF_REPEAT or NF_DROP when a SYN attempts to reopen
 * a closed/aborted connection (depending on whether nf_ct_kill() succeeded).
 *
 * ct->lock is held while ct->proto.tcp is examined and updated, and it is
 * released before every return.
 */
866 int nf_conntrack_tcp_packet(struct nf_conn *ct,
867                             struct sk_buff *skb,
868                             unsigned int dataoff,
869                             enum ip_conntrack_info ctinfo,
870                             const struct nf_hook_state *state)
871 {
872         struct net *net = nf_ct_net(ct);
873         struct nf_tcp_net *tn = nf_tcp_pernet(net);
874         struct nf_conntrack_tuple *tuple;
875         enum tcp_conntrack new_state, old_state;
876         unsigned int index, *timeouts;
877         enum ip_conntrack_dir dir;
878         const struct tcphdr *th;
879         struct tcphdr _tcph;
880         unsigned long timeout;
881
882         th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
883         if (th == NULL)
884                 return -NF_ACCEPT;
885
886         if (tcp_error(th, skb, dataoff, state))
887                 return -NF_ACCEPT;
888
            /* An unconfirmed conntrack is initialised by tcp_new() first. */
889         if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th))
890                 return -NF_ACCEPT;
891
            /* From here on every return path has to drop ct->lock. */
892         spin_lock_bh(&ct->lock);
893         old_state = ct->proto.tcp.state;
894         dir = CTINFO2DIR(ctinfo);
895         index = get_conntrack_index(th);
            /* Candidate state for this direction, packet type and old state. */
896         new_state = tcp_conntracks[dir][index][old_state];
897         tuple = &ct->tuplehash[dir].tuple;
898
899         switch (new_state) {
900         case TCP_CONNTRACK_SYN_SENT:
901                 if (old_state < TCP_CONNTRACK_TIME_WAIT)
902                         break;
903                 /* RFC 1122: "When a connection is closed actively,
904                  * it MUST linger in TIME-WAIT state for a time 2xMSL
905                  * (Maximum Segment Lifetime). However, it MAY accept
906                  * a new SYN from the remote TCP to reopen the connection
907                  * directly from TIME-WAIT state, if..."
908                  * We ignore the conditions because we are in the
909                  * TIME-WAIT state anyway.
910                  *
911                  * Handle aborted connections: we and the server
912                  * think there is an existing connection but the client
913                  * aborts it and starts a new one.
914                  */
915                 if (((ct->proto.tcp.seen[dir].flags
916                       | ct->proto.tcp.seen[!dir].flags)
917                      & IP_CT_TCP_FLAG_CLOSE_INIT)
918                     || (ct->proto.tcp.last_dir == dir
919                         && ct->proto.tcp.last_index == TCP_RST_SET)) {
920                         /* Attempt to reopen a closed/aborted connection.
921                          * Delete this connection and look up again. */
922                         spin_unlock_bh(&ct->lock);
923
924                         /* Only repeat if we can actually remove the timer.
925                          * Destruction may already be in progress in process
926                          * context and we must give it a chance to terminate.
927                          */
928                         if (nf_ct_kill(ct))
929                                 return -NF_REPEAT;
930                         return NF_DROP;
931                 }
932                 fallthrough;
933         case TCP_CONNTRACK_IGNORE:
934                 /* Ignored packets:
935                  *
936                  * Our connection entry may be out of sync, so ignore
937                  * packets which may signal the real connection between
938                  * the client and the server.
939                  *
940                  * a) SYN in ORIGINAL
941                  * b) SYN/ACK in REPLY
942                  * c) ACK in reply direction after initial SYN in original.
943                  *
944                  * If the ignored packet is invalid, the receiver will send
945                  * a RST we'll catch below.
946                  */
947                 if (index == TCP_SYNACK_SET
948                     && ct->proto.tcp.last_index == TCP_SYN_SET
949                     && ct->proto.tcp.last_dir != dir
950                     && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
951                         /* b) This SYN/ACK acknowledges a SYN that we earlier
952                          * ignored as invalid. This means that the client and
953                          * the server are both in sync, while the firewall is
954                          * not. We get in sync from the previously annotated
955                          * values.
956                          */
957                         old_state = TCP_CONNTRACK_SYN_SENT;
958                         new_state = TCP_CONNTRACK_SYN_RECV;
959                         ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end =
960                                 ct->proto.tcp.last_end;
961                         ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend =
962                                 ct->proto.tcp.last_end;
963                         ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin =
964                                 ct->proto.tcp.last_win == 0 ?
965                                         1 : ct->proto.tcp.last_win;
966                         ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
967                                 ct->proto.tcp.last_wscale;
968                         ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
969                         ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
970                                 ct->proto.tcp.last_flags;
971                         memset(&ct->proto.tcp.seen[dir], 0,
972                                sizeof(struct ip_ct_tcp_state));
973                         break;
974                 }
                    /* Remember this packet so a later SYN/ACK or RST can be
                     * matched against it.
                     */
975                 ct->proto.tcp.last_index = index;
976                 ct->proto.tcp.last_dir = dir;
977                 ct->proto.tcp.last_seq = ntohl(th->seq);
978                 ct->proto.tcp.last_end =
979                     segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
980                 ct->proto.tcp.last_win = ntohs(th->window);
981
982                 /* a) This is a SYN in ORIGINAL. The client and the server
983                  * may be in sync but we are not. In that case, we annotate
984                  * the TCP options and let the packet go through. If it is a
985                  * valid SYN packet, the server will reply with a SYN/ACK, and
986                  * then we'll get in sync. Otherwise, the server potentially
987                  * responds with a challenge ACK if implementing RFC5961.
988                  */
989                 if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
990                         struct ip_ct_tcp_state seen = {};
991
992                         ct->proto.tcp.last_flags =
993                         ct->proto.tcp.last_wscale = 0;
994                         tcp_options(skb, dataoff, th, &seen);
995                         if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
996                                 ct->proto.tcp.last_flags |=
997                                         IP_CT_TCP_FLAG_WINDOW_SCALE;
998                                 ct->proto.tcp.last_wscale = seen.td_scale;
999                         }
1000                         if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) {
1001                                 ct->proto.tcp.last_flags |=
1002                                         IP_CT_TCP_FLAG_SACK_PERM;
1003                         }
1004                         /* Mark the potential for RFC5961 challenge ACK,
1005                          * this poses a special problem for LAST_ACK state
1006                          * as ACK is interpreted as ACKing last FIN.
1007                          */
1008                         if (old_state == TCP_CONNTRACK_LAST_ACK)
1009                                 ct->proto.tcp.last_flags |=
1010                                         IP_CT_EXP_CHALLENGE_ACK;
1011                 }
1012                 spin_unlock_bh(&ct->lock);
1013                 nf_ct_l4proto_log_invalid(skb, ct, state,
1014                                           "packet (index %d) in dir %d ignored, state %s",
1015                                           index, dir,
1016                                           tcp_conntrack_names[old_state]);
1017                 return NF_ACCEPT;
1018         case TCP_CONNTRACK_MAX:
1019                 /* Special case for SYN proxy: when the SYN to the server or
1020                  * the SYN/ACK from the server is lost, the client may transmit
1021                  * a keep-alive packet while in SYN_SENT state. This needs to
1022                  * be associated with the original conntrack entry in order to
1023                  * generate a new SYN with the correct sequence number.
1024                  */
1025                 if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT &&
1026                     index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL &&
1027                     ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL &&
1028                     ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) {
1029                         pr_debug("nf_ct_tcp: SYN proxy client keep alive\n");
1030                         spin_unlock_bh(&ct->lock);
1031                         return NF_ACCEPT;
1032                 }
1033
1034                 /* Invalid packet */
1035                 pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
1036                          dir, get_conntrack_index(th), old_state);
1037                 spin_unlock_bh(&ct->lock);
1038                 nf_ct_l4proto_log_invalid(skb, ct, state, "invalid state");
1039                 return -NF_ACCEPT;
1040         case TCP_CONNTRACK_TIME_WAIT:
1041                 /* RFC5961 compliance causes the stack to send a
1042                  * "challenge-ACK", e.g. in response to spurious SYNs.
1043                  * Conntrack MUST not believe this ACK is acking last FIN.
1044                  */
1045                 if (old_state == TCP_CONNTRACK_LAST_ACK &&
1046                     index == TCP_ACK_SET &&
1047                     ct->proto.tcp.last_dir != dir &&
1048                     ct->proto.tcp.last_index == TCP_SYN_SET &&
1049                     (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) {
1050                         /* Detected RFC5961 challenge ACK */
1051                         ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
1052                         spin_unlock_bh(&ct->lock);
1053                         nf_ct_l4proto_log_invalid(skb, ct, state, "challenge-ack ignored");
1054                         return NF_ACCEPT; /* Don't change state */
1055                 }
1056                 break;
1057         case TCP_CONNTRACK_SYN_SENT2:
1058                 /* tcp_conntracks table is not smart enough to handle
1059                  * simultaneous open.
1060                  */
1061                 ct->proto.tcp.last_flags |= IP_CT_TCP_SIMULTANEOUS_OPEN;
1062                 break;
1063         case TCP_CONNTRACK_SYN_RECV:
                    /* An ACK in reply direction after a simultaneous open
                     * moves the connection straight to ESTABLISHED.
                     */
1064                 if (dir == IP_CT_DIR_REPLY && index == TCP_ACK_SET &&
1065                     ct->proto.tcp.last_flags & IP_CT_TCP_SIMULTANEOUS_OPEN)
1066                         new_state = TCP_CONNTRACK_ESTABLISHED;
1067                 break;
1068         case TCP_CONNTRACK_CLOSE:
                    /* Only RST packets get the extra validation below. */
1069                 if (index != TCP_RST_SET)
1070                         break;
1071
1072                 /* If we are closing, tuple might have been re-used already.
1073                  * last_index, last_ack, and all other ct fields used for
1074                  * sequence/window validation are outdated in that case.
1075                  *
1076                  * As the conntrack can already be expired by GC under pressure,
1077                  * just skip validation checks.
1078                  */
1079                 if (tcp_can_early_drop(ct))
1080                         goto in_window;
1081
1082                 /* td_maxack might be outdated if we let a SYN through earlier */
1083                 if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) &&
1084                     ct->proto.tcp.last_index != TCP_SYN_SET) {
1085                         u32 seq = ntohl(th->seq);
1086
1087                         /* If we are not in established state and SEQ=0 this is most
1088                          * likely an answer to a SYN we let go through above (last_index
1089                          * can be updated due to out-of-order ACKs).
1090                          */
1091                         if (seq == 0 && !nf_conntrack_tcp_established(ct))
1092                                 break;
1093
1094                         if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) &&
1095                             !tn->tcp_ignore_invalid_rst) {
1096                                 /* Invalid RST  */
1097                                 spin_unlock_bh(&ct->lock);
1098                                 nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst");
1099                                 return -NF_ACCEPT;
1100                         }
1101
1102                         if (!nf_conntrack_tcp_established(ct) ||
1103                             seq == ct->proto.tcp.seen[!dir].td_maxack)
1104                                 break;
1105
1106                         /* Check if rst is part of train, such as
1107                          *   foo:80 > bar:4379: P, 235946583:235946602(19) ack 42
1108                          *   foo:80 > bar:4379: R, 235946602:235946602(0)  ack 42
1109                          */
1110                         if (ct->proto.tcp.last_index == TCP_ACK_SET &&
1111                             ct->proto.tcp.last_dir == dir &&
1112                             seq == ct->proto.tcp.last_end)
1113                                 break;
1114
1115                         /* ... RST sequence number doesn't match exactly, keep
1116                          * established state to allow a possible challenge ACK.
1117                          */
1118                         new_state = old_state;
1119                 }
1120                 if (((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
1121                          && ct->proto.tcp.last_index == TCP_SYN_SET)
1122                         || (!test_bit(IPS_ASSURED_BIT, &ct->status)
1123                             && ct->proto.tcp.last_index == TCP_ACK_SET))
1124                     && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
1125                         /* RST sent to invalid SYN or ACK we had let through
1126                          * at a) and c) above:
1127                          *
1128                          * a) SYN was in window then
1129                          * c) we hold a half-open connection.
1130                          *
1131                          * Delete our connection entry.
1132                          * We skip window checking, because packet might ACK
1133                          * segments we ignored. */
1134                         goto in_window;
1135                 }
1136                 break;
1137         default:
1138                 /* Keep compilers happy. */
1139                 break;
1140         }
1141
1142         if (!tcp_in_window(ct, dir, index,
1143                            skb, dataoff, th, state)) {
1144                 spin_unlock_bh(&ct->lock);
1145                 return -NF_ACCEPT;
1146         }
1147      in_window:
1148         /* From now on we have got in-window packets */
1149         ct->proto.tcp.last_index = index;
1150         ct->proto.tcp.last_dir = dir;
1151
1152         pr_debug("tcp_conntracks: ");
1153         nf_ct_dump_tuple(tuple);
1154         pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
1155                  (th->syn ? 1 : 0), (th->ack ? 1 : 0),
1156                  (th->fin ? 1 : 0), (th->rst ? 1 : 0),
1157                  old_state, new_state);
1158
1159         ct->proto.tcp.state = new_state;
            /* Record which direction initiated the close. */
1160         if (old_state != new_state
1161             && new_state == TCP_CONNTRACK_FIN_WAIT)
1162                 ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
1163
            /* Use the table from nf_ct_timeout_lookup() if there is one,
             * otherwise fall back to tn->timeouts.
             */
1164         timeouts = nf_ct_timeout_lookup(ct);
1165         if (!timeouts)
1166                 timeouts = tn->timeouts;
1167
            /* Pick the timeout, first match wins: retransmission limit
             * reached, RST seen, unacknowledged data, zero last window,
             * else the timeout of the new state. The RETRANS and UNACK
             * values are only used when they shorten the state's timeout.
             */
1168         if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&
1169             timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
1170                 timeout = timeouts[TCP_CONNTRACK_RETRANS];
1171         else if (unlikely(index == TCP_RST_SET))
1172                 timeout = timeouts[TCP_CONNTRACK_CLOSE];
1173         else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &
1174                  IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED &&
1175                  timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK])
1176                 timeout = timeouts[TCP_CONNTRACK_UNACK];
1177         else if (ct->proto.tcp.last_win == 0 &&
1178                  timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
1179                 timeout = timeouts[TCP_CONNTRACK_RETRANS];
1180         else
1181                 timeout = timeouts[new_state];
1182         spin_unlock_bh(&ct->lock);
1183
1184         if (new_state != old_state)
1185                 nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
1186
1187         if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1188                 /* If only reply is a RST, we can consider ourselves not to
1189                    have an established connection: this is a fairly common
1190                    problem case, so we can delete the conntrack
1191                    immediately.  --RR */
1192                 if (th->rst) {
1193                         nf_ct_kill_acct(ct, ctinfo, skb);
1194                         return NF_ACCEPT;
1195                 }
1196
1197                 if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) {
1198                         /* do not renew timeout on SYN retransmit.
1199                          *
1200                          * Else port reuse by client or NAT middlebox can keep
1201                          * entry alive indefinitely (including nat info).
1202                          */
1203                         return NF_ACCEPT;
1204                 }
1205
1206                 /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
1207                  * pickup with loose=1. Avoid large ESTABLISHED timeout.
1208                  */
1209                 if (new_state == TCP_CONNTRACK_ESTABLISHED &&
1210                     timeout > timeouts[TCP_CONNTRACK_UNACK])
1211                         timeout = timeouts[TCP_CONNTRACK_UNACK];
1212         } else if (!test_bit(IPS_ASSURED_BIT, &ct->status)
1213                    && (old_state == TCP_CONNTRACK_SYN_RECV
1214                        || old_state == TCP_CONNTRACK_ESTABLISHED)
1215                    && new_state == TCP_CONNTRACK_ESTABLISHED) {
1216                 /* Set ASSURED if we see valid ack in ESTABLISHED
1217                    after SYN_RECV or a valid answer for a picked up
1218                    connection. */
1219                 set_bit(IPS_ASSURED_BIT, &ct->status);
1220                 nf_conntrack_event_cache(IPCT_ASSURED, ct);
1221         }
1222         nf_ct_refresh_acct(ct, ctinfo, skb, timeout);
1223
1224         return NF_ACCEPT;
1225 }
1226
1227 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1228
1229 #include <linux/netfilter/nfnetlink.h>
1230 #include <linux/netfilter/nfnetlink_conntrack.h>
1231
1232 static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
1233                          struct nf_conn *ct, bool destroy)
1234 {
1235         struct nlattr *nest_parms;
1236         struct nf_ct_tcp_flags tmp = {};
1237
1238         spin_lock_bh(&ct->lock);
1239         nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP);
1240         if (!nest_parms)
1241                 goto nla_put_failure;
1242
1243         if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state))
1244                 goto nla_put_failure;
1245
1246         if (destroy)
1247                 goto skip_state;
1248
1249         if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
1250                        ct->proto.tcp.seen[0].td_scale) ||
1251             nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
1252                        ct->proto.tcp.seen[1].td_scale))
1253                 goto nla_put_failure;
1254
1255         tmp.flags = ct->proto.tcp.seen[0].flags;
1256         if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL,
1257                     sizeof(struct nf_ct_tcp_flags), &tmp))
1258                 goto nla_put_failure;
1259
1260         tmp.flags = ct->proto.tcp.seen[1].flags;
1261         if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
1262                     sizeof(struct nf_ct_tcp_flags), &tmp))
1263                 goto nla_put_failure;
1264 skip_state:
1265         spin_unlock_bh(&ct->lock);
1266         nla_nest_end(skb, nest_parms);
1267
1268         return 0;
1269
1270 nla_put_failure:
1271         spin_unlock_bh(&ct->lock);
1272         return -1;
1273 }
1274
1275 static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = {
1276         [CTA_PROTOINFO_TCP_STATE]           = { .type = NLA_U8 },
1277         [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 },
1278         [CTA_PROTOINFO_TCP_WSCALE_REPLY]    = { .type = NLA_U8 },
1279         [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]  = { .len = sizeof(struct nf_ct_tcp_flags) },
1280         [CTA_PROTOINFO_TCP_FLAGS_REPLY]     = { .len = sizeof(struct nf_ct_tcp_flags) },
1281 };
1282
/*
 * Sum of the aligned sizes of two attributes with a one-byte payload and
 * two attributes carrying a struct nf_ct_tcp_flags.
 *
 * NOTE(review): tcp_to_nlattr() emits three one-byte attributes (state and
 * both window scales) inside a nest. Confirm whether this size is meant to
 * cover all of them.
 */
#define TCP_NLATTR_SIZE ( \
	2 * NLA_ALIGN(NLA_HDRLEN + 1) + \
	2 * NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)))
1288
1289 static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
1290 {
1291         struct nlattr *pattr = cda[CTA_PROTOINFO_TCP];
1292         struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1];
1293         int err;
1294
1295         /* updates could not contain anything about the private
1296          * protocol info, in that case skip the parsing */
1297         if (!pattr)
1298                 return 0;
1299
1300         err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_TCP_MAX, pattr,
1301                                           tcp_nla_policy, NULL);
1302         if (err < 0)
1303                 return err;
1304
1305         if (tb[CTA_PROTOINFO_TCP_STATE] &&
1306             nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
1307                 return -EINVAL;
1308
1309         spin_lock_bh(&ct->lock);
1310         if (tb[CTA_PROTOINFO_TCP_STATE])
1311                 ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
1312
1313         if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) {
1314                 struct nf_ct_tcp_flags *attr =
1315                         nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]);
1316                 ct->proto.tcp.seen[0].flags &= ~attr->mask;
1317                 ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask;
1318         }
1319
1320         if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) {
1321                 struct nf_ct_tcp_flags *attr =
1322                         nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]);
1323                 ct->proto.tcp.seen[1].flags &= ~attr->mask;
1324                 ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask;
1325         }
1326
1327         if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] &&
1328             tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] &&
1329             ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
1330             ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
1331                 ct->proto.tcp.seen[0].td_scale =
1332                         nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]);
1333                 ct->proto.tcp.seen[1].td_scale =
1334                         nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
1335         }
1336         spin_unlock_bh(&ct->lock);
1337
1338         return 0;
1339 }
1340
1341 static unsigned int tcp_nlattr_tuple_size(void)
1342 {
1343         static unsigned int size __read_mostly;
1344
1345         if (!size)
1346                 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1347
1348         return size;
1349 }
1350 #endif
1351
1352 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
1353
1354 #include <linux/netfilter/nfnetlink.h>
1355 #include <linux/netfilter/nfnetlink_cttimeout.h>
1356
1357 static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
1358                                      struct net *net, void *data)
1359 {
1360         struct nf_tcp_net *tn = nf_tcp_pernet(net);
1361         unsigned int *timeouts = data;
1362         int i;
1363
1364         if (!timeouts)
1365                 timeouts = tn->timeouts;
1366         /* set default TCP timeouts. */
1367         for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++)
1368                 timeouts[i] = tn->timeouts[i];
1369
1370         if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) {
1371                 timeouts[TCP_CONNTRACK_SYN_SENT] =
1372                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ;
1373         }
1374
1375         if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) {
1376                 timeouts[TCP_CONNTRACK_SYN_RECV] =
1377                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ;
1378         }
1379         if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) {
1380                 timeouts[TCP_CONNTRACK_ESTABLISHED] =
1381                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ;
1382         }
1383         if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) {
1384                 timeouts[TCP_CONNTRACK_FIN_WAIT] =
1385                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ;
1386         }
1387         if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) {
1388                 timeouts[TCP_CONNTRACK_CLOSE_WAIT] =
1389                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ;
1390         }
1391         if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) {
1392                 timeouts[TCP_CONNTRACK_LAST_ACK] =
1393                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ;
1394         }
1395         if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) {
1396                 timeouts[TCP_CONNTRACK_TIME_WAIT] =
1397                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ;
1398         }
1399         if (tb[CTA_TIMEOUT_TCP_CLOSE]) {
1400                 timeouts[TCP_CONNTRACK_CLOSE] =
1401                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ;
1402         }
1403         if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) {
1404                 timeouts[TCP_CONNTRACK_SYN_SENT2] =
1405                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ;
1406         }
1407         if (tb[CTA_TIMEOUT_TCP_RETRANS]) {
1408                 timeouts[TCP_CONNTRACK_RETRANS] =
1409                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ;
1410         }
1411         if (tb[CTA_TIMEOUT_TCP_UNACK]) {
1412                 timeouts[TCP_CONNTRACK_UNACK] =
1413                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ;
1414         }
1415
1416         timeouts[CTA_TIMEOUT_TCP_UNSPEC] = timeouts[CTA_TIMEOUT_TCP_SYN_SENT];
1417         return 0;
1418 }
1419
1420 static int
1421 tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
1422 {
1423         const unsigned int *timeouts = data;
1424
1425         if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT,
1426                         htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) ||
1427             nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV,
1428                          htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) ||
1429             nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED,
1430                          htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) ||
1431             nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT,
1432                          htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) ||
1433             nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT,
1434                          htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) ||
1435             nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK,
1436                          htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) ||
1437             nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT,
1438                          htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) ||
1439             nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE,
1440                          htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) ||
1441             nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2,
1442                          htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) ||
1443             nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS,
1444                          htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) ||
1445             nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK,
1446                          htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ)))
1447                 goto nla_put_failure;
1448         return 0;
1449
1450 nla_put_failure:
1451         return -ENOSPC;
1452 }
1453
/*
 * Netlink policy for the CTA_TIMEOUT_TCP_* attributes: every TCP timeout
 * attribute is declared as NLA_U32. CTA_TIMEOUT_TCP_UNSPEC has no entry.
 * Installed as .ctnl_timeout.nla_policy of nf_conntrack_l4proto_tcp.
 */
static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = {
        [CTA_TIMEOUT_TCP_SYN_SENT]      = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_SYN_RECV]      = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_ESTABLISHED]   = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_FIN_WAIT]      = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_CLOSE_WAIT]    = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_LAST_ACK]      = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_TIME_WAIT]     = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_CLOSE]         = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_SYN_SENT2]     = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_RETRANS]       = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_UNACK]         = { .type = NLA_U32 },
};
1467 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
1468
1469 void nf_conntrack_tcp_init_net(struct net *net)
1470 {
1471         struct nf_tcp_net *tn = nf_tcp_pernet(net);
1472         int i;
1473
1474         for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
1475                 tn->timeouts[i] = tcp_timeouts[i];
1476
1477         /* timeouts[0] is unused, make it same as SYN_SENT so
1478          * ->timeouts[0] contains 'new' timeout, like udp or icmp.
1479          */
1480         tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT];
1481
1482         /* If it is set to zero, we disable picking up already established
1483          * connections.
1484          */
1485         tn->tcp_loose = 1;
1486
1487         /* "Be conservative in what you do,
1488          *  be liberal in what you accept from others."
1489          * If it's non-zero, we mark only out of window RST segments as INVALID.
1490          */
1491         tn->tcp_be_liberal = 0;
1492
1493         /* If it's non-zero, we turn off RST sequence number check */
1494         tn->tcp_ignore_invalid_rst = 0;
1495
1496         /* Max number of the retransmitted packets without receiving an (acceptable)
1497          * ACK from the destination. If this number is reached, a shorter timer
1498          * will be started.
1499          */
1500         tn->tcp_max_retrans = 3;
1501
1502 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
1503         tn->offload_timeout = 30 * HZ;
1504 #endif
1505 }
1506
/*
 * Layer-4 protocol descriptor for TCP (IPPROTO_TCP). It wires the TCP
 * handlers defined in this file into struct nf_conntrack_l4proto. Members
 * guarded by a config option are only present when that option is enabled.
 */
const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp =
{
        .l4proto                = IPPROTO_TCP,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
        .print_conntrack        = tcp_print_conntrack,
#endif
        .can_early_drop         = tcp_can_early_drop,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
        /* ctnetlink hooks; tuple handling uses the shared port-based helpers */
        .to_nlattr              = tcp_to_nlattr,
        .from_nlattr            = nlattr_to_tcp,
        .tuple_to_nlattr        = nf_ct_port_tuple_to_nlattr,
        .nlattr_to_tuple        = nf_ct_port_nlattr_to_tuple,
        .nlattr_tuple_size      = tcp_nlattr_tuple_size,
        .nlattr_size            = TCP_NLATTR_SIZE,
        .nla_policy             = nf_ct_port_nla_policy,
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
        /* timeout policy objects: one unsigned int per timeout table slot */
        .ctnl_timeout           = {
                .nlattr_to_obj  = tcp_timeout_nlattr_to_obj,
                .obj_to_nlattr  = tcp_timeout_obj_to_nlattr,
                .nlattr_max     = CTA_TIMEOUT_TCP_MAX,
                .obj_size       = sizeof(unsigned int) *
                                        TCP_CONNTRACK_TIMEOUT_MAX,
                .nla_policy     = tcp_timeout_nla_policy,
        },
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
};