Merge tag 'iommu-updates-v5.13' of git://git.kernel.org/pub/scm/linux/kernel/git...
[linux-2.6-microblaze.git] / net / netfilter / nf_conntrack_proto_tcp.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* (C) 1999-2001 Paul `Rusty' Russell
3  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
4  * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
5  * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
6  */
7
8 #include <linux/types.h>
9 #include <linux/timer.h>
10 #include <linux/module.h>
11 #include <linux/in.h>
12 #include <linux/tcp.h>
13 #include <linux/spinlock.h>
14 #include <linux/skbuff.h>
15 #include <linux/ipv6.h>
16 #include <net/ip6_checksum.h>
17 #include <asm/unaligned.h>
18
19 #include <net/tcp.h>
20
21 #include <linux/netfilter.h>
22 #include <linux/netfilter_ipv4.h>
23 #include <linux/netfilter_ipv6.h>
24 #include <net/netfilter/nf_conntrack.h>
25 #include <net/netfilter/nf_conntrack_l4proto.h>
26 #include <net/netfilter/nf_conntrack_ecache.h>
27 #include <net/netfilter/nf_conntrack_seqadj.h>
28 #include <net/netfilter/nf_conntrack_synproxy.h>
29 #include <net/netfilter/nf_conntrack_timeout.h>
30 #include <net/netfilter/nf_log.h>
31 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
32 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
33
34   /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
35      closely.  They're more complex. --RR */
36
/*
 * Human-readable names of the TCP conntrack states.
 * Indexed by ct->proto.tcp.state (see tcp_print_conntrack()); the entries
 * are listed in the same order as the state columns of the tcp_conntracks
 * transition table (sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2).
 */
static const char *const tcp_conntrack_names[] = {
        "NONE",
        "SYN_SENT",
        "SYN_RECV",
        "ESTABLISHED",
        "FIN_WAIT",
        "CLOSE_WAIT",
        "LAST_ACK",
        "TIME_WAIT",
        "CLOSE",
        "SYN_SENT2",
};
49
/*
 * Postfix unit macros for writing timeouts: each one expands to a
 * multiplication, so "2 MINS" becomes "2 * 60 * HZ" and "5 DAYS" becomes
 * "5 * 24 * 60 * 60 * HZ".  They are only meaningful directly after a
 * numeric literal.
 */
#define SECS * HZ
#define MINS * 60 SECS
#define HOURS * 60 MINS
#define DAYS * 24 HOURS
54
/*
 * Built-in timeout values, one per TCP_CONNTRACK_* timeout slot, expressed
 * with the SECS/MINS/DAYS unit macros (i.e. as multiples of HZ).
 * Slots without an explicit initializer are zero.
 */
static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = {
        [TCP_CONNTRACK_SYN_SENT]        = 2 MINS,
        [TCP_CONNTRACK_SYN_RECV]        = 60 SECS,
        [TCP_CONNTRACK_ESTABLISHED]     = 5 DAYS,
        [TCP_CONNTRACK_FIN_WAIT]        = 2 MINS,
        [TCP_CONNTRACK_CLOSE_WAIT]      = 60 SECS,
        [TCP_CONNTRACK_LAST_ACK]        = 30 SECS,
        [TCP_CONNTRACK_TIME_WAIT]       = 2 MINS,
        [TCP_CONNTRACK_CLOSE]           = 10 SECS,
        [TCP_CONNTRACK_SYN_SENT2]       = 2 MINS,
/* RFC1122 says the R2 limit should be at least 100 seconds.
   Linux uses 15 packets as limit, which corresponds
   to ~13-30min depending on RTO. */
        [TCP_CONNTRACK_RETRANS]         = 5 MINS,
        [TCP_CONNTRACK_UNACK]           = 5 MINS,
};
71
/*
 * Short aliases for the conntrack states, used to keep the rows of the
 * tcp_conntracks transition table readable.
 * sIV (TCP_CONNTRACK_MAX) marks an invalid transition and
 * sIG (TCP_CONNTRACK_IGNORE) marks a packet that should be ignored.
 */
#define sNO TCP_CONNTRACK_NONE
#define sSS TCP_CONNTRACK_SYN_SENT
#define sSR TCP_CONNTRACK_SYN_RECV
#define sES TCP_CONNTRACK_ESTABLISHED
#define sFW TCP_CONNTRACK_FIN_WAIT
#define sCW TCP_CONNTRACK_CLOSE_WAIT
#define sLA TCP_CONNTRACK_LAST_ACK
#define sTW TCP_CONNTRACK_TIME_WAIT
#define sCL TCP_CONNTRACK_CLOSE
#define sS2 TCP_CONNTRACK_SYN_SENT2
#define sIV TCP_CONNTRACK_MAX
#define sIG TCP_CONNTRACK_IGNORE
84
/*
 * What TCP flags are set from RST/SYN/FIN/ACK.
 *
 * The enumerator order matters: the value returned by
 * get_conntrack_index() is used as the row index (second dimension)
 * of the tcp_conntracks transition table.
 */
enum tcp_bit_set {
        TCP_SYN_SET,            /* SYN without ACK */
        TCP_SYNACK_SET,         /* SYN and ACK */
        TCP_FIN_SET,            /* FIN (no RST, no SYN) */
        TCP_ACK_SET,            /* ACK only (no RST, SYN or FIN) */
        TCP_RST_SET,            /* RST, regardless of other flags */
        TCP_NONE_SET,           /* none of RST/SYN/FIN/ACK */
};
94
95 /*
96  * The TCP state transition table needs a few words...
97  *
98  * We are the man in the middle. All the packets go through us
99  * but might get lost in transit to the destination.
100  * It is assumed that the destinations can't receive segments
101  * we haven't seen.
102  *
103  * The checked segment is in window, but our windows are *not*
104  * equivalent with the ones of the sender/receiver. We always
105  * try to guess the state of the current sender.
106  *
 * The meanings of the states are:
108  *
109  * NONE:        initial state
110  * SYN_SENT:    SYN-only packet seen
111  * SYN_SENT2:   SYN-only packet seen from reply dir, simultaneous open
112  * SYN_RECV:    SYN-ACK packet seen
113  * ESTABLISHED: ACK packet seen
114  * FIN_WAIT:    FIN packet seen
115  * CLOSE_WAIT:  ACK seen (after FIN)
116  * LAST_ACK:    FIN seen (after FIN)
117  * TIME_WAIT:   last ACK seen
118  * CLOSE:       closed connection (RST)
119  *
120  * Packets marked as IGNORED (sIG):
121  *      if they may be either invalid or valid
122  *      and the receiver may send back a connection
123  *      closing RST or a SYN/ACK.
124  *
125  * Packets marked as INVALID (sIV):
126  *      if we regard them as truly invalid packets
127  */
/*
 * State transition table, indexed as
 *      tcp_conntracks[direction][enum tcp_bit_set][current state]
 * where direction 0 is the ORIGINAL and 1 the REPLY direction. Each entry
 * is the new state, or sIV (invalid packet) / sIG (ignore packet).
 */
static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
        {
/* ORIGINAL */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*syn*/    { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 },
/*
 *      sNO -> sSS      Initialize a new connection
 *      sSS -> sSS      Retransmitted SYN
 *      sS2 -> sS2      Late retransmitted SYN
 *      sSR -> sIG
 *      sES -> sIG      Error: SYNs in window outside the SYN_SENT state
 *                      are errors. Receiver will reply with RST
 *                      and close the connection.
 *                      Or we are not in sync and hold a dead connection.
 *      sFW -> sIG
 *      sCW -> sIG
 *      sLA -> sIG
 *      sTW -> sSS      Reopened connection (RFC 1122).
 *      sCL -> sSS
 */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR },
/*
 *      sNO -> sIV      Too late and no reason to do anything
 *      sSS -> sIV      Client can't send SYN and then SYN/ACK
 *      sS2 -> sSR      SYN/ACK sent to SYN2 in simultaneous open
 *      sSR -> sSR      Late retransmitted SYN/ACK in simultaneous open
 *      sES -> sIV      Invalid SYN/ACK packets sent by the client
 *      sFW -> sIV
 *      sCW -> sIV
 *      sLA -> sIV
 *      sTW -> sIV
 *      sCL -> sIV
 */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
/*
 *      sNO -> sIV      Too late and no reason to do anything...
 *      sSS -> sIV      Client might not send FIN in this state:
 *                      we enforce waiting for a SYN/ACK reply first.
 *      sS2 -> sIV
 *      sSR -> sFW      Close started.
 *      sES -> sFW
 *      sFW -> sLA      FIN seen in both directions, waiting for
 *                      the last ACK.
 *                      Might be a retransmitted FIN as well...
 *      sCW -> sLA
 *      sLA -> sLA      Retransmitted FIN. Remain in the same state.
 *      sTW -> sTW
 *      sCL -> sCL
 */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*ack*/    { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
/*
 *      sNO -> sES      Assumed.
 *      sSS -> sIV      ACK is invalid: we haven't seen a SYN/ACK yet.
 *      sS2 -> sIV
 *      sSR -> sES      Established state is reached.
 *      sES -> sES      :-)
 *      sFW -> sCW      Normal close request answered by ACK.
 *      sCW -> sCW
 *      sLA -> sTW      Last ACK detected (RFC5961 challenged)
 *      sTW -> sTW      Retransmitted last ACK. Remain in the same state.
 *      sCL -> sCL
 */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
/*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
        },
        {
/* REPLY */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*syn*/    { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sSS, sIV, sS2 },
/*
 *      sNO -> sIV      Never reached.
 *      sSS -> sS2      Simultaneous open
 *      sS2 -> sS2      Retransmitted simultaneous SYN
 *      sSR -> sIV      Invalid SYN packets sent by the server
 *      sES -> sIV
 *      sFW -> sIV
 *      sCW -> sIV
 *      sLA -> sIV
 *      sTW -> sSS      Reopened connection, but server may have switched role
 *      sCL -> sIV
 */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
/*
 *      sSS -> sSR      Standard open.
 *      sS2 -> sSR      Simultaneous open
 *      sSR -> sIG      Retransmitted SYN/ACK, ignore it.
 *      sES -> sIG      Late retransmitted SYN/ACK?
 *      sFW -> sIG      Might be SYN/ACK answering ignored SYN
 *      sCW -> sIG
 *      sLA -> sIG
 *      sTW -> sIG
 *      sCL -> sIG
 */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
/*
 *      sSS -> sIV      Server might not send FIN in this state.
 *      sS2 -> sIV
 *      sSR -> sFW      Close started.
 *      sES -> sFW
 *      sFW -> sLA      FIN seen in both directions.
 *      sCW -> sLA
 *      sLA -> sLA      Retransmitted FIN.
 *      sTW -> sTW
 *      sCL -> sCL
 */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*ack*/    { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG },
/*
 *      sSS -> sIG      Might be a half-open connection.
 *      sS2 -> sIG
 *      sSR -> sSR      Might answer late resent SYN.
 *      sES -> sES      :-)
 *      sFW -> sCW      Normal close request answered by ACK.
 *      sCW -> sCW
 *      sLA -> sTW      Last ACK detected (RFC5961 challenged)
 *      sTW -> sTW      Retransmitted last ACK.
 *      sCL -> sCL
 */
/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
/*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
/*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
        }
};
257
#ifdef CONFIG_NF_CONNTRACK_PROCFS
/*
 * Emit the TCP-private part of a conntrack entry: the name of the current
 * TCP state followed by a space. Entries with IPS_OFFLOAD_BIT set in their
 * status print nothing.
 */
static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
{
        if (!test_bit(IPS_OFFLOAD_BIT, &ct->status))
                seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]);
}
#endif
268
269 static unsigned int get_conntrack_index(const struct tcphdr *tcph)
270 {
271         if (tcph->rst) return TCP_RST_SET;
272         else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
273         else if (tcph->fin) return TCP_FIN_SET;
274         else if (tcph->ack) return TCP_ACK_SET;
275         else return TCP_NONE_SET;
276 }
277
278 /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
279    in IP Filter' by Guido van Rooij.
280
281    http://www.sane.nl/events/sane2000/papers.html
282    http://www.darkart.com/mirrors/www.obfuscation.org/ipf/
283
284    The boundaries and the conditions are changed according to RFC793:
285    the packet must intersect the window (i.e. segments may be
286    after the right or before the left edge) and thus receivers may ACK
287    segments after the right edge of the window.
288
289         td_maxend = max(sack + max(win,1)) seen in reply packets
290         td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
291         td_maxwin += seq + len - sender.td_maxend
292                         if seq + len > sender.td_maxend
293         td_end    = max(seq + len) seen in sent packets
294
295    I.   Upper bound for valid data:     seq <= sender.td_maxend
296    II.  Lower bound for valid data:     seq + len >= sender.td_end - receiver.td_maxwin
297    III. Upper bound for valid (s)ack:   sack <= receiver.td_end
298    IV.  Lower bound for valid (s)ack:   sack >= receiver.td_end - MAXACKWINDOW
299
300    where sack is the highest right edge of sack block found in the packet
301    or ack in the case of packet without SACK option.
302
   The upper bound limit for a valid (s)ack is not ignored -
   we don't have to deal with fragments.
305 */
306
307 static inline __u32 segment_seq_plus_len(__u32 seq,
308                                          size_t len,
309                                          unsigned int dataoff,
310                                          const struct tcphdr *tcph)
311 {
312         /* XXX Should I use payload length field in IP/IPv6 header ?
313          * - YK */
314         return (seq + len - dataoff - tcph->doff*4
315                 + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
316 }
317
/*
 * MAXACKWINDOW(sender) is how far below receiver->td_end a (s)ack may lie
 * and still be accepted (condition IV in tcp_in_window()): the larger of
 * the sender's td_maxwin and the constant MAXACKWINCONST.
 * Note that the macro evaluates its argument more than once.
 *
 * Fixme: what about big packets?
 */
#define MAXACKWINCONST                  66000
#define MAXACKWINDOW(sender)                                            \
        ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin     \
                                              : MAXACKWINCONST)
323
324 /*
325  * Simplified tcp_parse_options routine from tcp_input.c
326  */
327 static void tcp_options(const struct sk_buff *skb,
328                         unsigned int dataoff,
329                         const struct tcphdr *tcph,
330                         struct ip_ct_tcp_state *state)
331 {
332         unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
333         const unsigned char *ptr;
334         int length = (tcph->doff*4) - sizeof(struct tcphdr);
335
336         if (!length)
337                 return;
338
339         ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
340                                  length, buff);
341         BUG_ON(ptr == NULL);
342
343         state->td_scale =
344         state->flags = 0;
345
346         while (length > 0) {
347                 int opcode=*ptr++;
348                 int opsize;
349
350                 switch (opcode) {
351                 case TCPOPT_EOL:
352                         return;
353                 case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
354                         length--;
355                         continue;
356                 default:
357                         if (length < 2)
358                                 return;
359                         opsize=*ptr++;
360                         if (opsize < 2) /* "silly options" */
361                                 return;
362                         if (opsize > length)
363                                 return; /* don't parse partial options */
364
365                         if (opcode == TCPOPT_SACK_PERM
366                             && opsize == TCPOLEN_SACK_PERM)
367                                 state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
368                         else if (opcode == TCPOPT_WINDOW
369                                  && opsize == TCPOLEN_WINDOW) {
370                                 state->td_scale = *(u_int8_t *)ptr;
371
372                                 if (state->td_scale > TCP_MAX_WSCALE)
373                                         state->td_scale = TCP_MAX_WSCALE;
374
375                                 state->flags |=
376                                         IP_CT_TCP_FLAG_WINDOW_SCALE;
377                         }
378                         ptr += opsize - 2;
379                         length -= opsize;
380                 }
381         }
382 }
383
384 static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
385                      const struct tcphdr *tcph, __u32 *sack)
386 {
387         unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
388         const unsigned char *ptr;
389         int length = (tcph->doff*4) - sizeof(struct tcphdr);
390         __u32 tmp;
391
392         if (!length)
393                 return;
394
395         ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
396                                  length, buff);
397         BUG_ON(ptr == NULL);
398
399         /* Fast path for timestamp-only option */
400         if (length == TCPOLEN_TSTAMP_ALIGNED
401             && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24)
402                                        | (TCPOPT_NOP << 16)
403                                        | (TCPOPT_TIMESTAMP << 8)
404                                        | TCPOLEN_TIMESTAMP))
405                 return;
406
407         while (length > 0) {
408                 int opcode = *ptr++;
409                 int opsize, i;
410
411                 switch (opcode) {
412                 case TCPOPT_EOL:
413                         return;
414                 case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
415                         length--;
416                         continue;
417                 default:
418                         if (length < 2)
419                                 return;
420                         opsize = *ptr++;
421                         if (opsize < 2) /* "silly options" */
422                                 return;
423                         if (opsize > length)
424                                 return; /* don't parse partial options */
425
426                         if (opcode == TCPOPT_SACK
427                             && opsize >= (TCPOLEN_SACK_BASE
428                                           + TCPOLEN_SACK_PERBLOCK)
429                             && !((opsize - TCPOLEN_SACK_BASE)
430                                  % TCPOLEN_SACK_PERBLOCK)) {
431                                 for (i = 0;
432                                      i < (opsize - TCPOLEN_SACK_BASE);
433                                      i += TCPOLEN_SACK_PERBLOCK) {
434                                         tmp = get_unaligned_be32((__be32 *)(ptr+i)+1);
435
436                                         if (after(tmp, *sack))
437                                                 *sack = tmp;
438                                 }
439                                 return;
440                         }
441                         ptr += opsize - 2;
442                         length -= opsize;
443                 }
444         }
445 }
446
/*
 * Check a segment against the tracked TCP windows of its connection and,
 * when it is acceptable, update the per-direction tracking data.
 *
 * @ct:      the conntrack entry
 * @state:   TCP tracking state of @ct
 * @dir:     direction of the packet; state->seen[dir] is the sender,
 *           state->seen[!dir] the receiver
 * @index:   flag class of the packet (enum tcp_bit_set)
 * @skb:     packet buffer
 * @dataoff: offset of the TCP header inside @skb
 * @tcph:    the TCP header
 *
 * Returns true if the segment satisfies conditions I-IV described in the
 * comment above segment_seq_plus_len(), or if it does not but liberal
 * mode is enabled (IP_CT_TCP_FLAG_BE_LIBERAL on the sender or
 * tn->tcp_be_liberal). Returns true as well for a SYN without ACK that
 * initializes the sender data (simultaneous open). Otherwise the reason
 * is logged through nf_ct_l4proto_log_invalid() and false is returned.
 */
static bool tcp_in_window(const struct nf_conn *ct,
                          struct ip_ct_tcp *state,
                          enum ip_conntrack_dir dir,
                          unsigned int index,
                          const struct sk_buff *skb,
                          unsigned int dataoff,
                          const struct tcphdr *tcph)
{
        struct net *net = nf_ct_net(ct);
        struct nf_tcp_net *tn = nf_tcp_pernet(net);
        struct ip_ct_tcp_state *sender = &state->seen[dir];
        struct ip_ct_tcp_state *receiver = &state->seen[!dir];
        const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
        __u32 seq, ack, sack, end, win, swin;
        u16 win_raw;
        s32 receiver_offset;
        bool res, in_recv_win;

        /*
         * Get the required data from the packet.
         */
        seq = ntohl(tcph->seq);
        ack = sack = ntohl(tcph->ack_seq);
        win_raw = ntohs(tcph->window);
        win = win_raw;
        end = segment_seq_plus_len(seq, skb->len, dataoff, tcph);

        /* Only look at SACK blocks if the receiver announced SACK support */
        if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
                tcp_sack(skb, dataoff, tcph, &sack);

        /* Take into account NAT sequence number mangling */
        receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1);
        ack -= receiver_offset;
        sack -= receiver_offset;

        pr_debug("tcp_in_window: START\n");
        pr_debug("tcp_in_window: ");
        nf_ct_dump_tuple(tuple);
        pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
                 seq, ack, receiver_offset, sack, receiver_offset, win, end);
        pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
                 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
                 sender->td_end, sender->td_maxend, sender->td_maxwin,
                 sender->td_scale,
                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
                 receiver->td_scale);

        /* td_maxwin == 0 means no data has been recorded for the sender yet */
        if (sender->td_maxwin == 0) {
                /*
                 * Initialize sender data.
                 */
                if (tcph->syn) {
                        /*
                         * SYN-ACK in reply to a SYN
                         * or SYN from reply direction in simultaneous open.
                         */
                        sender->td_end =
                        sender->td_maxend = end;
                        sender->td_maxwin = (win == 0 ? 1 : win);

                        tcp_options(skb, dataoff, tcph, sender);
                        /*
                         * RFC 1323:
                         * Both sides must send the Window Scale option
                         * to enable window scaling in either direction.
                         */
                        if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
                              && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
                                sender->td_scale =
                                receiver->td_scale = 0;
                        if (!tcph->ack)
                                /* Simultaneous open */
                                return true;
                } else {
                        /*
                         * We are in the middle of a connection,
                         * its history is lost for us.
                         * Let's try to use the data from the packet.
                         */
                        sender->td_end = end;
                        swin = win << sender->td_scale;
                        sender->td_maxwin = (swin == 0 ? 1 : swin);
                        sender->td_maxend = end + sender->td_maxwin;
                        if (receiver->td_maxwin == 0) {
                                /* We haven't seen traffic in the other
                                 * direction yet but we have to tweak window
                                 * tracking to pass III and IV until that
                                 * happens.
                                 */
                                receiver->td_end = receiver->td_maxend = sack;
                        } else if (sack == receiver->td_end + 1) {
                                /* Likely a reply to a keepalive.
                                 * Needed for III.
                                 */
                                receiver->td_end++;
                        }

                }
        } else if (((state->state == TCP_CONNTRACK_SYN_SENT
                     && dir == IP_CT_DIR_ORIGINAL)
                   || (state->state == TCP_CONNTRACK_SYN_RECV
                     && dir == IP_CT_DIR_REPLY))
                   && after(end, sender->td_end)) {
                /*
                 * RFC 793: "if a TCP is reinitialized ... then it need
                 * not wait at all; it must only be sure to use sequence
                 * numbers larger than those recently used."
                 */
                sender->td_end =
                sender->td_maxend = end;
                sender->td_maxwin = (win == 0 ? 1 : win);

                tcp_options(skb, dataoff, tcph, sender);
        }

        if (!(tcph->ack)) {
                /*
                 * If there is no ACK, just pretend it was set and OK.
                 */
                ack = sack = receiver->td_end;
        } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
                    (TCP_FLAG_ACK|TCP_FLAG_RST))
                   && (ack == 0)) {
                /*
                 * Broken TCP stacks, that set ACK in RST packets as well
                 * with zero ack value.
                 */
                ack = sack = receiver->td_end;
        }

        if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)
                /*
                 * RST sent answering SYN.
                 */
                seq = end = sender->td_end;

        pr_debug("tcp_in_window: ");
        nf_ct_dump_tuple(tuple);
        pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
                 seq, ack, receiver_offset, sack, receiver_offset, win, end);
        pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
                 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
                 sender->td_end, sender->td_maxend, sender->td_maxwin,
                 sender->td_scale,
                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
                 receiver->td_scale);

        /* Is the ending sequence in the receive window (if available)? */
        in_recv_win = !receiver->td_maxwin ||
                      after(end, sender->td_end - receiver->td_maxwin - 1);

        pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
                 before(seq, sender->td_maxend + 1),
                 (in_recv_win ? 1 : 0),
                 before(sack, receiver->td_end + 1),
                 after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1));

        /*
         * Conditions I-IV, in that order:
         * I   seq  <= sender.td_maxend
         * II  end within the receive window (in_recv_win)
         * III sack <= receiver.td_end
         * IV  sack >= receiver.td_end - MAXACKWINDOW(sender)
         */
        if (before(seq, sender->td_maxend + 1) &&
            in_recv_win &&
            before(sack, receiver->td_end + 1) &&
            after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {
                /*
                 * Take into account window scaling (RFC 1323).
                 */
                if (!tcph->syn)
                        win <<= sender->td_scale;

                /*
                 * Update sender data.
                 */
                swin = win + (sack - ack);
                if (sender->td_maxwin < swin)
                        sender->td_maxwin = swin;
                if (after(end, sender->td_end)) {
                        sender->td_end = end;
                        sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
                }
                if (tcph->ack) {
                        /* Track the highest ack value sent by this side */
                        if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
                                sender->td_maxack = ack;
                                sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
                        } else if (after(ack, sender->td_maxack))
                                sender->td_maxack = ack;
                }

                /*
                 * Update receiver data.
                 */
                if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
                        receiver->td_maxwin += end - sender->td_maxend;
                if (after(sack + win, receiver->td_maxend - 1)) {
                        receiver->td_maxend = sack + win;
                        if (win == 0)
                                receiver->td_maxend++;
                }
                /* Everything the receiver sent so far has been acked */
                if (ack == receiver->td_end)
                        receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;

                /*
                 * Check retransmissions.
                 */
                if (index == TCP_ACK_SET) {
                        /* An ACK identical to the last recorded one
                         * (same direction, seq, ack, end and raw window)
                         * counts as a retransmission.
                         */
                        if (state->last_dir == dir
                            && state->last_seq == seq
                            && state->last_ack == ack
                            && state->last_end == end
                            && state->last_win == win_raw)
                                state->retrans++;
                        else {
                                state->last_dir = dir;
                                state->last_seq = seq;
                                state->last_ack = ack;
                                state->last_end = end;
                                state->last_win = win_raw;
                                state->retrans = 0;
                        }
                }
                res = true;
        } else {
                /* Out of window: accept anyway only in liberal mode */
                res = false;
                if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
                    tn->tcp_be_liberal)
                        res = true;
                if (!res) {
                        /* Report the first of conditions I-IV that failed;
                         * "BUG" would mean none of them did.
                         */
                        nf_ct_l4proto_log_invalid(skb, ct,
                        "%s",
                        before(seq, sender->td_maxend + 1) ?
                        in_recv_win ?
                        before(sack, receiver->td_end + 1) ?
                        after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG"
                        : "ACK is under the lower bound (possible overly delayed ACK)"
                        : "ACK is over the upper bound (ACKed data not seen yet)"
                        : "SEQ is under the lower bound (already ACKed data retransmitted)"
                        : "SEQ is over the upper bound (over the window of the receiver)");
                }
        }

        pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u "
                 "receiver end=%u maxend=%u maxwin=%u\n",
                 res, sender->td_end, sender->td_maxend, sender->td_maxwin,
                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin);

        return res;
}
691
/*
 * table of valid flag combinations - PUSH, ECE and CWR are always valid
 *
 * Indexed by the TCP flag byte with PSH, ECE and CWR masked out (see
 * tcp_error()); a non-zero entry means the combination is accepted.
 * Combinations not listed here are zero, i.e. rejected.
 */
static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
                                 TCPHDR_URG) + 1] =
{
        [TCPHDR_SYN]                            = 1,
        [TCPHDR_SYN|TCPHDR_URG]                 = 1,
        [TCPHDR_SYN|TCPHDR_ACK]                 = 1,
        [TCPHDR_RST]                            = 1,
        [TCPHDR_RST|TCPHDR_ACK]                 = 1,
        [TCPHDR_FIN|TCPHDR_ACK]                 = 1,
        [TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG]      = 1,
        [TCPHDR_ACK]                            = 1,
        [TCPHDR_ACK|TCPHDR_URG]                 = 1,
};
706
707 static void tcp_error_log(const struct sk_buff *skb,
708                           const struct nf_hook_state *state,
709                           const char *msg)
710 {
711         nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_TCP, "%s", msg);
712 }
713
714 /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c.  */
715 static bool tcp_error(const struct tcphdr *th,
716                       struct sk_buff *skb,
717                       unsigned int dataoff,
718                       const struct nf_hook_state *state)
719 {
720         unsigned int tcplen = skb->len - dataoff;
721         u8 tcpflags;
722
723         /* Not whole TCP header or malformed packet */
724         if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
725                 tcp_error_log(skb, state, "truncated packet");
726                 return true;
727         }
728
729         /* Checksum invalid? Ignore.
730          * We skip checking packets on the outgoing path
731          * because the checksum is assumed to be correct.
732          */
733         /* FIXME: Source route IP option packets --RR */
734         if (state->net->ct.sysctl_checksum &&
735             state->hook == NF_INET_PRE_ROUTING &&
736             nf_checksum(skb, state->hook, dataoff, IPPROTO_TCP, state->pf)) {
737                 tcp_error_log(skb, state, "bad checksum");
738                 return true;
739         }
740
741         /* Check TCP flags. */
742         tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
743         if (!tcp_valid_flags[tcpflags]) {
744                 tcp_error_log(skb, state, "invalid tcp flag combination");
745                 return true;
746         }
747
748         return false;
749 }
750
751 static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
752                              unsigned int dataoff,
753                              const struct tcphdr *th)
754 {
755         enum tcp_conntrack new_state;
756         struct net *net = nf_ct_net(ct);
757         const struct nf_tcp_net *tn = nf_tcp_pernet(net);
758         const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
759         const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
760
761         /* Don't need lock here: this conntrack not in circulation yet */
762         new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
763
764         /* Invalid: delete conntrack */
765         if (new_state >= TCP_CONNTRACK_MAX) {
766                 pr_debug("nf_ct_tcp: invalid new deleting.\n");
767                 return false;
768         }
769
770         if (new_state == TCP_CONNTRACK_SYN_SENT) {
771                 memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
772                 /* SYN packet */
773                 ct->proto.tcp.seen[0].td_end =
774                         segment_seq_plus_len(ntohl(th->seq), skb->len,
775                                              dataoff, th);
776                 ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
777                 if (ct->proto.tcp.seen[0].td_maxwin == 0)
778                         ct->proto.tcp.seen[0].td_maxwin = 1;
779                 ct->proto.tcp.seen[0].td_maxend =
780                         ct->proto.tcp.seen[0].td_end;
781
782                 tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
783         } else if (tn->tcp_loose == 0) {
784                 /* Don't try to pick up connections. */
785                 return false;
786         } else {
787                 memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
788                 /*
789                  * We are in the middle of a connection,
790                  * its history is lost for us.
791                  * Let's try to use the data from the packet.
792                  */
793                 ct->proto.tcp.seen[0].td_end =
794                         segment_seq_plus_len(ntohl(th->seq), skb->len,
795                                              dataoff, th);
796                 ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
797                 if (ct->proto.tcp.seen[0].td_maxwin == 0)
798                         ct->proto.tcp.seen[0].td_maxwin = 1;
799                 ct->proto.tcp.seen[0].td_maxend =
800                         ct->proto.tcp.seen[0].td_end +
801                         ct->proto.tcp.seen[0].td_maxwin;
802
803                 /* We assume SACK and liberal window checking to handle
804                  * window scaling */
805                 ct->proto.tcp.seen[0].flags =
806                 ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
807                                               IP_CT_TCP_FLAG_BE_LIBERAL;
808         }
809
810         /* tcp_packet will set them */
811         ct->proto.tcp.last_index = TCP_NONE_SET;
812
813         pr_debug("%s: sender end=%u maxend=%u maxwin=%u scale=%i "
814                  "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
815                  __func__,
816                  sender->td_end, sender->td_maxend, sender->td_maxwin,
817                  sender->td_scale,
818                  receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
819                  receiver->td_scale);
820         return true;
821 }
822
823 /* Returns verdict for packet, or -1 for invalid. */
824 int nf_conntrack_tcp_packet(struct nf_conn *ct,
825                             struct sk_buff *skb,
826                             unsigned int dataoff,
827                             enum ip_conntrack_info ctinfo,
828                             const struct nf_hook_state *state)
829 {
830         struct net *net = nf_ct_net(ct);
831         struct nf_tcp_net *tn = nf_tcp_pernet(net);
832         struct nf_conntrack_tuple *tuple;
833         enum tcp_conntrack new_state, old_state;
834         unsigned int index, *timeouts;
835         enum ip_conntrack_dir dir;
836         const struct tcphdr *th;
837         struct tcphdr _tcph;
838         unsigned long timeout;
839
840         th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
841         if (th == NULL)
842                 return -NF_ACCEPT;
843
844         if (tcp_error(th, skb, dataoff, state))
845                 return -NF_ACCEPT;
846
847         if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th))
848                 return -NF_ACCEPT;
849
850         spin_lock_bh(&ct->lock);
851         old_state = ct->proto.tcp.state;
852         dir = CTINFO2DIR(ctinfo);
853         index = get_conntrack_index(th);
854         new_state = tcp_conntracks[dir][index][old_state];
855         tuple = &ct->tuplehash[dir].tuple;
856
857         switch (new_state) {
858         case TCP_CONNTRACK_SYN_SENT:
859                 if (old_state < TCP_CONNTRACK_TIME_WAIT)
860                         break;
861                 /* RFC 1122: "When a connection is closed actively,
862                  * it MUST linger in TIME-WAIT state for a time 2xMSL
863                  * (Maximum Segment Lifetime). However, it MAY accept
864                  * a new SYN from the remote TCP to reopen the connection
865                  * directly from TIME-WAIT state, if..."
866                  * We ignore the conditions because we are in the
867                  * TIME-WAIT state anyway.
868                  *
869                  * Handle aborted connections: we and the server
870                  * think there is an existing connection but the client
871                  * aborts it and starts a new one.
872                  */
873                 if (((ct->proto.tcp.seen[dir].flags
874                       | ct->proto.tcp.seen[!dir].flags)
875                      & IP_CT_TCP_FLAG_CLOSE_INIT)
876                     || (ct->proto.tcp.last_dir == dir
877                         && ct->proto.tcp.last_index == TCP_RST_SET)) {
878                         /* Attempt to reopen a closed/aborted connection.
879                          * Delete this connection and look up again. */
880                         spin_unlock_bh(&ct->lock);
881
882                         /* Only repeat if we can actually remove the timer.
883                          * Destruction may already be in progress in process
884                          * context and we must give it a chance to terminate.
885                          */
886                         if (nf_ct_kill(ct))
887                                 return -NF_REPEAT;
888                         return NF_DROP;
889                 }
890                 fallthrough;
891         case TCP_CONNTRACK_IGNORE:
892                 /* Ignored packets:
893                  *
894                  * Our connection entry may be out of sync, so ignore
895                  * packets which may signal the real connection between
896                  * the client and the server.
897                  *
898                  * a) SYN in ORIGINAL
899                  * b) SYN/ACK in REPLY
900                  * c) ACK in reply direction after initial SYN in original.
901                  *
902                  * If the ignored packet is invalid, the receiver will send
903                  * a RST we'll catch below.
904                  */
905                 if (index == TCP_SYNACK_SET
906                     && ct->proto.tcp.last_index == TCP_SYN_SET
907                     && ct->proto.tcp.last_dir != dir
908                     && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
909                         /* b) This SYN/ACK acknowledges a SYN that we earlier
910                          * ignored as invalid. This means that the client and
911                          * the server are both in sync, while the firewall is
912                          * not. We get in sync from the previously annotated
913                          * values.
914                          */
915                         old_state = TCP_CONNTRACK_SYN_SENT;
916                         new_state = TCP_CONNTRACK_SYN_RECV;
917                         ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end =
918                                 ct->proto.tcp.last_end;
919                         ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend =
920                                 ct->proto.tcp.last_end;
921                         ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin =
922                                 ct->proto.tcp.last_win == 0 ?
923                                         1 : ct->proto.tcp.last_win;
924                         ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
925                                 ct->proto.tcp.last_wscale;
926                         ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
927                         ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
928                                 ct->proto.tcp.last_flags;
929                         memset(&ct->proto.tcp.seen[dir], 0,
930                                sizeof(struct ip_ct_tcp_state));
931                         break;
932                 }
933                 ct->proto.tcp.last_index = index;
934                 ct->proto.tcp.last_dir = dir;
935                 ct->proto.tcp.last_seq = ntohl(th->seq);
936                 ct->proto.tcp.last_end =
937                     segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
938                 ct->proto.tcp.last_win = ntohs(th->window);
939
940                 /* a) This is a SYN in ORIGINAL. The client and the server
941                  * may be in sync but we are not. In that case, we annotate
942                  * the TCP options and let the packet go through. If it is a
943                  * valid SYN packet, the server will reply with a SYN/ACK, and
944                  * then we'll get in sync. Otherwise, the server potentially
945                  * responds with a challenge ACK if implementing RFC5961.
946                  */
947                 if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
948                         struct ip_ct_tcp_state seen = {};
949
950                         ct->proto.tcp.last_flags =
951                         ct->proto.tcp.last_wscale = 0;
952                         tcp_options(skb, dataoff, th, &seen);
953                         if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
954                                 ct->proto.tcp.last_flags |=
955                                         IP_CT_TCP_FLAG_WINDOW_SCALE;
956                                 ct->proto.tcp.last_wscale = seen.td_scale;
957                         }
958                         if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) {
959                                 ct->proto.tcp.last_flags |=
960                                         IP_CT_TCP_FLAG_SACK_PERM;
961                         }
962                         /* Mark the potential for RFC5961 challenge ACK,
963                          * this pose a special problem for LAST_ACK state
964                          * as ACK is intrepretated as ACKing last FIN.
965                          */
966                         if (old_state == TCP_CONNTRACK_LAST_ACK)
967                                 ct->proto.tcp.last_flags |=
968                                         IP_CT_EXP_CHALLENGE_ACK;
969                 }
970                 spin_unlock_bh(&ct->lock);
971                 nf_ct_l4proto_log_invalid(skb, ct,
972                                           "packet (index %d) in dir %d ignored, state %s",
973                                           index, dir,
974                                           tcp_conntrack_names[old_state]);
975                 return NF_ACCEPT;
976         case TCP_CONNTRACK_MAX:
977                 /* Special case for SYN proxy: when the SYN to the server or
978                  * the SYN/ACK from the server is lost, the client may transmit
979                  * a keep-alive packet while in SYN_SENT state. This needs to
980                  * be associated with the original conntrack entry in order to
981                  * generate a new SYN with the correct sequence number.
982                  */
983                 if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT &&
984                     index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL &&
985                     ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL &&
986                     ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) {
987                         pr_debug("nf_ct_tcp: SYN proxy client keep alive\n");
988                         spin_unlock_bh(&ct->lock);
989                         return NF_ACCEPT;
990                 }
991
992                 /* Invalid packet */
993                 pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
994                          dir, get_conntrack_index(th), old_state);
995                 spin_unlock_bh(&ct->lock);
996                 nf_ct_l4proto_log_invalid(skb, ct, "invalid state");
997                 return -NF_ACCEPT;
998         case TCP_CONNTRACK_TIME_WAIT:
999                 /* RFC5961 compliance cause stack to send "challenge-ACK"
1000                  * e.g. in response to spurious SYNs.  Conntrack MUST
1001                  * not believe this ACK is acking last FIN.
1002                  */
1003                 if (old_state == TCP_CONNTRACK_LAST_ACK &&
1004                     index == TCP_ACK_SET &&
1005                     ct->proto.tcp.last_dir != dir &&
1006                     ct->proto.tcp.last_index == TCP_SYN_SET &&
1007                     (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) {
1008                         /* Detected RFC5961 challenge ACK */
1009                         ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
1010                         spin_unlock_bh(&ct->lock);
1011                         nf_ct_l4proto_log_invalid(skb, ct, "challenge-ack ignored");
1012                         return NF_ACCEPT; /* Don't change state */
1013                 }
1014                 break;
1015         case TCP_CONNTRACK_SYN_SENT2:
1016                 /* tcp_conntracks table is not smart enough to handle
1017                  * simultaneous open.
1018                  */
1019                 ct->proto.tcp.last_flags |= IP_CT_TCP_SIMULTANEOUS_OPEN;
1020                 break;
1021         case TCP_CONNTRACK_SYN_RECV:
1022                 if (dir == IP_CT_DIR_REPLY && index == TCP_ACK_SET &&
1023                     ct->proto.tcp.last_flags & IP_CT_TCP_SIMULTANEOUS_OPEN)
1024                         new_state = TCP_CONNTRACK_ESTABLISHED;
1025                 break;
1026         case TCP_CONNTRACK_CLOSE:
1027                 if (index != TCP_RST_SET)
1028                         break;
1029
1030                 if (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) {
1031                         u32 seq = ntohl(th->seq);
1032
1033                         if (before(seq, ct->proto.tcp.seen[!dir].td_maxack)) {
1034                                 /* Invalid RST  */
1035                                 spin_unlock_bh(&ct->lock);
1036                                 nf_ct_l4proto_log_invalid(skb, ct, "invalid rst");
1037                                 return -NF_ACCEPT;
1038                         }
1039
1040                         if (!nf_conntrack_tcp_established(ct) ||
1041                             seq == ct->proto.tcp.seen[!dir].td_maxack)
1042                                 break;
1043
1044                         /* Check if rst is part of train, such as
1045                          *   foo:80 > bar:4379: P, 235946583:235946602(19) ack 42
1046                          *   foo:80 > bar:4379: R, 235946602:235946602(0)  ack 42
1047                          */
1048                         if (ct->proto.tcp.last_index == TCP_ACK_SET &&
1049                             ct->proto.tcp.last_dir == dir &&
1050                             seq == ct->proto.tcp.last_end)
1051                                 break;
1052
1053                         /* ... RST sequence number doesn't match exactly, keep
1054                          * established state to allow a possible challenge ACK.
1055                          */
1056                         new_state = old_state;
1057                 }
1058                 if (((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
1059                          && ct->proto.tcp.last_index == TCP_SYN_SET)
1060                         || (!test_bit(IPS_ASSURED_BIT, &ct->status)
1061                             && ct->proto.tcp.last_index == TCP_ACK_SET))
1062                     && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
1063                         /* RST sent to invalid SYN or ACK we had let through
1064                          * at a) and c) above:
1065                          *
1066                          * a) SYN was in window then
1067                          * c) we hold a half-open connection.
1068                          *
1069                          * Delete our connection entry.
1070                          * We skip window checking, because packet might ACK
1071                          * segments we ignored. */
1072                         goto in_window;
1073                 }
1074                 break;
1075         default:
1076                 /* Keep compilers happy. */
1077                 break;
1078         }
1079
1080         if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,
1081                            skb, dataoff, th)) {
1082                 spin_unlock_bh(&ct->lock);
1083                 return -NF_ACCEPT;
1084         }
1085      in_window:
1086         /* From now on we have got in-window packets */
1087         ct->proto.tcp.last_index = index;
1088         ct->proto.tcp.last_dir = dir;
1089
1090         pr_debug("tcp_conntracks: ");
1091         nf_ct_dump_tuple(tuple);
1092         pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
1093                  (th->syn ? 1 : 0), (th->ack ? 1 : 0),
1094                  (th->fin ? 1 : 0), (th->rst ? 1 : 0),
1095                  old_state, new_state);
1096
1097         ct->proto.tcp.state = new_state;
1098         if (old_state != new_state
1099             && new_state == TCP_CONNTRACK_FIN_WAIT)
1100                 ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
1101
1102         timeouts = nf_ct_timeout_lookup(ct);
1103         if (!timeouts)
1104                 timeouts = tn->timeouts;
1105
1106         if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&
1107             timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
1108                 timeout = timeouts[TCP_CONNTRACK_RETRANS];
1109         else if (unlikely(index == TCP_RST_SET))
1110                 timeout = timeouts[TCP_CONNTRACK_CLOSE];
1111         else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &
1112                  IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED &&
1113                  timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK])
1114                 timeout = timeouts[TCP_CONNTRACK_UNACK];
1115         else if (ct->proto.tcp.last_win == 0 &&
1116                  timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
1117                 timeout = timeouts[TCP_CONNTRACK_RETRANS];
1118         else
1119                 timeout = timeouts[new_state];
1120         spin_unlock_bh(&ct->lock);
1121
1122         if (new_state != old_state)
1123                 nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
1124
1125         if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1126                 /* If only reply is a RST, we can consider ourselves not to
1127                    have an established connection: this is a fairly common
1128                    problem case, so we can delete the conntrack
1129                    immediately.  --RR */
1130                 if (th->rst) {
1131                         nf_ct_kill_acct(ct, ctinfo, skb);
1132                         return NF_ACCEPT;
1133                 }
1134                 /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
1135                  * pickup with loose=1. Avoid large ESTABLISHED timeout.
1136                  */
1137                 if (new_state == TCP_CONNTRACK_ESTABLISHED &&
1138                     timeout > timeouts[TCP_CONNTRACK_UNACK])
1139                         timeout = timeouts[TCP_CONNTRACK_UNACK];
1140         } else if (!test_bit(IPS_ASSURED_BIT, &ct->status)
1141                    && (old_state == TCP_CONNTRACK_SYN_RECV
1142                        || old_state == TCP_CONNTRACK_ESTABLISHED)
1143                    && new_state == TCP_CONNTRACK_ESTABLISHED) {
1144                 /* Set ASSURED if we see valid ack in ESTABLISHED
1145                    after SYN_RECV or a valid answer for a picked up
1146                    connection. */
1147                 set_bit(IPS_ASSURED_BIT, &ct->status);
1148                 nf_conntrack_event_cache(IPCT_ASSURED, ct);
1149         }
1150         nf_ct_refresh_acct(ct, ctinfo, skb, timeout);
1151
1152         return NF_ACCEPT;
1153 }
1154
1155 static bool tcp_can_early_drop(const struct nf_conn *ct)
1156 {
1157         switch (ct->proto.tcp.state) {
1158         case TCP_CONNTRACK_FIN_WAIT:
1159         case TCP_CONNTRACK_LAST_ACK:
1160         case TCP_CONNTRACK_TIME_WAIT:
1161         case TCP_CONNTRACK_CLOSE:
1162         case TCP_CONNTRACK_CLOSE_WAIT:
1163                 return true;
1164         default:
1165                 break;
1166         }
1167
1168         return false;
1169 }
1170
1171 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1172
1173 #include <linux/netfilter/nfnetlink.h>
1174 #include <linux/netfilter/nfnetlink_conntrack.h>
1175
1176 static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
1177                          struct nf_conn *ct, bool destroy)
1178 {
1179         struct nlattr *nest_parms;
1180         struct nf_ct_tcp_flags tmp = {};
1181
1182         spin_lock_bh(&ct->lock);
1183         nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP);
1184         if (!nest_parms)
1185                 goto nla_put_failure;
1186
1187         if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state))
1188                 goto nla_put_failure;
1189
1190         if (destroy)
1191                 goto skip_state;
1192
1193         if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
1194                        ct->proto.tcp.seen[0].td_scale) ||
1195             nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
1196                        ct->proto.tcp.seen[1].td_scale))
1197                 goto nla_put_failure;
1198
1199         tmp.flags = ct->proto.tcp.seen[0].flags;
1200         if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL,
1201                     sizeof(struct nf_ct_tcp_flags), &tmp))
1202                 goto nla_put_failure;
1203
1204         tmp.flags = ct->proto.tcp.seen[1].flags;
1205         if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
1206                     sizeof(struct nf_ct_tcp_flags), &tmp))
1207                 goto nla_put_failure;
1208 skip_state:
1209         spin_unlock_bh(&ct->lock);
1210         nla_nest_end(skb, nest_parms);
1211
1212         return 0;
1213
1214 nla_put_failure:
1215         spin_unlock_bh(&ct->lock);
1216         return -1;
1217 }
1218
1219 static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = {
1220         [CTA_PROTOINFO_TCP_STATE]           = { .type = NLA_U8 },
1221         [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 },
1222         [CTA_PROTOINFO_TCP_WSCALE_REPLY]    = { .type = NLA_U8 },
1223         [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]  = { .len = sizeof(struct nf_ct_tcp_flags) },
1224         [CTA_PROTOINFO_TCP_FLAGS_REPLY]     = { .len = sizeof(struct nf_ct_tcp_flags) },
1225 };
1226
/* Aligned space for two one-byte attributes plus two attributes carrying
 * a struct nf_ct_tcp_flags each.
 */
#define TCP_NLATTR_SIZE ( \
        2 * NLA_ALIGN(NLA_HDRLEN + 1) + \
        2 * NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)))
1232
1233 static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
1234 {
1235         struct nlattr *pattr = cda[CTA_PROTOINFO_TCP];
1236         struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1];
1237         int err;
1238
1239         /* updates could not contain anything about the private
1240          * protocol info, in that case skip the parsing */
1241         if (!pattr)
1242                 return 0;
1243
1244         err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_TCP_MAX, pattr,
1245                                           tcp_nla_policy, NULL);
1246         if (err < 0)
1247                 return err;
1248
1249         if (tb[CTA_PROTOINFO_TCP_STATE] &&
1250             nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
1251                 return -EINVAL;
1252
1253         spin_lock_bh(&ct->lock);
1254         if (tb[CTA_PROTOINFO_TCP_STATE])
1255                 ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
1256
1257         if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) {
1258                 struct nf_ct_tcp_flags *attr =
1259                         nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]);
1260                 ct->proto.tcp.seen[0].flags &= ~attr->mask;
1261                 ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask;
1262         }
1263
1264         if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) {
1265                 struct nf_ct_tcp_flags *attr =
1266                         nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]);
1267                 ct->proto.tcp.seen[1].flags &= ~attr->mask;
1268                 ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask;
1269         }
1270
1271         if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] &&
1272             tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] &&
1273             ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
1274             ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
1275                 ct->proto.tcp.seen[0].td_scale =
1276                         nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]);
1277                 ct->proto.tcp.seen[1].td_scale =
1278                         nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
1279         }
1280         spin_unlock_bh(&ct->lock);
1281
1282         return 0;
1283 }
1284
1285 static unsigned int tcp_nlattr_tuple_size(void)
1286 {
1287         static unsigned int size __read_mostly;
1288
1289         if (!size)
1290                 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1291
1292         return size;
1293 }
1294 #endif
1295
1296 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
1297
1298 #include <linux/netfilter/nfnetlink.h>
1299 #include <linux/netfilter/nfnetlink_cttimeout.h>
1300
1301 static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
1302                                      struct net *net, void *data)
1303 {
1304         struct nf_tcp_net *tn = nf_tcp_pernet(net);
1305         unsigned int *timeouts = data;
1306         int i;
1307
1308         if (!timeouts)
1309                 timeouts = tn->timeouts;
1310         /* set default TCP timeouts. */
1311         for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++)
1312                 timeouts[i] = tn->timeouts[i];
1313
1314         if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) {
1315                 timeouts[TCP_CONNTRACK_SYN_SENT] =
1316                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ;
1317         }
1318
1319         if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) {
1320                 timeouts[TCP_CONNTRACK_SYN_RECV] =
1321                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ;
1322         }
1323         if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) {
1324                 timeouts[TCP_CONNTRACK_ESTABLISHED] =
1325                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ;
1326         }
1327         if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) {
1328                 timeouts[TCP_CONNTRACK_FIN_WAIT] =
1329                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ;
1330         }
1331         if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) {
1332                 timeouts[TCP_CONNTRACK_CLOSE_WAIT] =
1333                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ;
1334         }
1335         if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) {
1336                 timeouts[TCP_CONNTRACK_LAST_ACK] =
1337                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ;
1338         }
1339         if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) {
1340                 timeouts[TCP_CONNTRACK_TIME_WAIT] =
1341                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ;
1342         }
1343         if (tb[CTA_TIMEOUT_TCP_CLOSE]) {
1344                 timeouts[TCP_CONNTRACK_CLOSE] =
1345                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ;
1346         }
1347         if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) {
1348                 timeouts[TCP_CONNTRACK_SYN_SENT2] =
1349                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ;
1350         }
1351         if (tb[CTA_TIMEOUT_TCP_RETRANS]) {
1352                 timeouts[TCP_CONNTRACK_RETRANS] =
1353                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ;
1354         }
1355         if (tb[CTA_TIMEOUT_TCP_UNACK]) {
1356                 timeouts[TCP_CONNTRACK_UNACK] =
1357                         ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ;
1358         }
1359
1360         timeouts[CTA_TIMEOUT_TCP_UNSPEC] = timeouts[CTA_TIMEOUT_TCP_SYN_SENT];
1361         return 0;
1362 }
1363
1364 static int
1365 tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
1366 {
1367         const unsigned int *timeouts = data;
1368
1369         if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT,
1370                         htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) ||
1371             nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV,
1372                          htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) ||
1373             nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED,
1374                          htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) ||
1375             nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT,
1376                          htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) ||
1377             nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT,
1378                          htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) ||
1379             nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK,
1380                          htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) ||
1381             nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT,
1382                          htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) ||
1383             nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE,
1384                          htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) ||
1385             nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2,
1386                          htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) ||
1387             nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS,
1388                          htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) ||
1389             nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK,
1390                          htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ)))
1391                 goto nla_put_failure;
1392         return 0;
1393
1394 nla_put_failure:
1395         return -ENOSPC;
1396 }
1397
1398 static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = {
1399         [CTA_TIMEOUT_TCP_SYN_SENT]      = { .type = NLA_U32 },
1400         [CTA_TIMEOUT_TCP_SYN_RECV]      = { .type = NLA_U32 },
1401         [CTA_TIMEOUT_TCP_ESTABLISHED]   = { .type = NLA_U32 },
1402         [CTA_TIMEOUT_TCP_FIN_WAIT]      = { .type = NLA_U32 },
1403         [CTA_TIMEOUT_TCP_CLOSE_WAIT]    = { .type = NLA_U32 },
1404         [CTA_TIMEOUT_TCP_LAST_ACK]      = { .type = NLA_U32 },
1405         [CTA_TIMEOUT_TCP_TIME_WAIT]     = { .type = NLA_U32 },
1406         [CTA_TIMEOUT_TCP_CLOSE]         = { .type = NLA_U32 },
1407         [CTA_TIMEOUT_TCP_SYN_SENT2]     = { .type = NLA_U32 },
1408         [CTA_TIMEOUT_TCP_RETRANS]       = { .type = NLA_U32 },
1409         [CTA_TIMEOUT_TCP_UNACK]         = { .type = NLA_U32 },
1410 };
1411 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
1412
1413 void nf_conntrack_tcp_init_net(struct net *net)
1414 {
1415         struct nf_tcp_net *tn = nf_tcp_pernet(net);
1416         int i;
1417
1418         for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
1419                 tn->timeouts[i] = tcp_timeouts[i];
1420
1421         /* timeouts[0] is unused, make it same as SYN_SENT so
1422          * ->timeouts[0] contains 'new' timeout, like udp or icmp.
1423          */
1424         tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT];
1425
1426         /* If it is set to zero, we disable picking up already established
1427          * connections.
1428          */
1429         tn->tcp_loose = 1;
1430
1431         /* "Be conservative in what you do,
1432          *  be liberal in what you accept from others."
1433          * If it's non-zero, we mark only out of window RST segments as INVALID.
1434          */
1435         tn->tcp_be_liberal = 0;
1436
1437         /* Max number of the retransmitted packets without receiving an (acceptable)
1438          * ACK from the destination. If this number is reached, a shorter timer
1439          * will be started.
1440          */
1441         tn->tcp_max_retrans = 3;
1442 }
1443
1444 const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp =
1445 {
1446         .l4proto                = IPPROTO_TCP,
1447 #ifdef CONFIG_NF_CONNTRACK_PROCFS
1448         .print_conntrack        = tcp_print_conntrack,
1449 #endif
1450         .can_early_drop         = tcp_can_early_drop,
1451 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1452         .to_nlattr              = tcp_to_nlattr,
1453         .from_nlattr            = nlattr_to_tcp,
1454         .tuple_to_nlattr        = nf_ct_port_tuple_to_nlattr,
1455         .nlattr_to_tuple        = nf_ct_port_nlattr_to_tuple,
1456         .nlattr_tuple_size      = tcp_nlattr_tuple_size,
1457         .nlattr_size            = TCP_NLATTR_SIZE,
1458         .nla_policy             = nf_ct_port_nla_policy,
1459 #endif
1460 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
1461         .ctnl_timeout           = {
1462                 .nlattr_to_obj  = tcp_timeout_nlattr_to_obj,
1463                 .obj_to_nlattr  = tcp_timeout_obj_to_nlattr,
1464                 .nlattr_max     = CTA_TIMEOUT_TCP_MAX,
1465                 .obj_size       = sizeof(unsigned int) *
1466                                         TCP_CONNTRACK_TIMEOUT_MAX,
1467                 .nla_policy     = tcp_timeout_nla_policy,
1468         },
1469 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
1470 };