netfilter: conntrack: fix crash due to confirmed bit load reordering
net/netfilter/nf_conntrack_core.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Connection state tracking for netfilter.  This is separated from,
3    but required by, the NAT layer; it can also be used by an iptables
4    extension. */
5
6 /* (C) 1999-2001 Paul `Rusty' Russell
7  * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
8  * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
9  * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
10  */
11
12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
14 #include <linux/types.h>
15 #include <linux/netfilter.h>
16 #include <linux/module.h>
17 #include <linux/sched.h>
18 #include <linux/skbuff.h>
19 #include <linux/proc_fs.h>
20 #include <linux/vmalloc.h>
21 #include <linux/stddef.h>
22 #include <linux/slab.h>
23 #include <linux/random.h>
24 #include <linux/siphash.h>
25 #include <linux/err.h>
26 #include <linux/percpu.h>
27 #include <linux/moduleparam.h>
28 #include <linux/notifier.h>
29 #include <linux/kernel.h>
30 #include <linux/netdevice.h>
31 #include <linux/socket.h>
32 #include <linux/mm.h>
33 #include <linux/nsproxy.h>
34 #include <linux/rculist_nulls.h>
35
36 #include <net/netfilter/nf_conntrack.h>
37 #include <net/netfilter/nf_conntrack_bpf.h>
38 #include <net/netfilter/nf_conntrack_l4proto.h>
39 #include <net/netfilter/nf_conntrack_expect.h>
40 #include <net/netfilter/nf_conntrack_helper.h>
41 #include <net/netfilter/nf_conntrack_core.h>
42 #include <net/netfilter/nf_conntrack_extend.h>
43 #include <net/netfilter/nf_conntrack_acct.h>
44 #include <net/netfilter/nf_conntrack_ecache.h>
45 #include <net/netfilter/nf_conntrack_zones.h>
46 #include <net/netfilter/nf_conntrack_timestamp.h>
47 #include <net/netfilter/nf_conntrack_timeout.h>
48 #include <net/netfilter/nf_conntrack_labels.h>
49 #include <net/netfilter/nf_conntrack_synproxy.h>
50 #include <net/netfilter/nf_nat.h>
51 #include <net/netfilter/nf_nat_helper.h>
52 #include <net/netns/hash.h>
53 #include <net/ip.h>
54
55 #include "nf_internals.h"
56
57 __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
58 EXPORT_SYMBOL_GPL(nf_conntrack_locks);
59
60 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
61 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
62
63 struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
64 EXPORT_SYMBOL_GPL(nf_conntrack_hash);
65
66 struct conntrack_gc_work {
67         struct delayed_work     dwork;
68         u32                     next_bucket;
69         u32                     avg_timeout;
70         u32                     start_time;
71         bool                    exiting;
72         bool                    early_drop;
73 };
74
75 static __read_mostly struct kmem_cache *nf_conntrack_cachep;
76 static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
77 static __read_mostly bool nf_conntrack_locks_all;
78
79 /* serialize hash resizes and nf_ct_iterate_cleanup */
80 static DEFINE_MUTEX(nf_conntrack_mutex);
81
82 #define GC_SCAN_INTERVAL_MAX    (60ul * HZ)
83 #define GC_SCAN_INTERVAL_MIN    (1ul * HZ)
84
85 /* clamp timeouts to this value (TCP unacked) */
86 #define GC_SCAN_INTERVAL_CLAMP  (300ul * HZ)
87
88 /* large initial bias so that we don't scan often just because we have
89  * three entries with a 1s timeout.
90  */
91 #define GC_SCAN_INTERVAL_INIT   INT_MAX
92
93 #define GC_SCAN_MAX_DURATION    msecs_to_jiffies(10)
94 #define GC_SCAN_EXPIRED_MAX     (64000u / HZ)
95
96 #define MIN_CHAINLEN    8u
97 #define MAX_CHAINLEN    (32u - MIN_CHAINLEN)
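/* Illustrative arithmetic, assuming HZ=250 (other HZ values scale
 * accordingly): GC_SCAN_MAX_DURATION is about 3 jiffies (10ms),
 * GC_SCAN_EXPIRED_MAX works out to 64000 / 250 = 256 expired entries
 * reaped per gc run, and the randomized chain limit used further down,
 * MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN), falls in the range
 * 8..31 entries.
 */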
98
99 static struct conntrack_gc_work conntrack_gc_work;
100
101 void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
102 {
103         /* 1) Acquire the lock */
104         spin_lock(lock);
105
106         /* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
107          * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
108          */
109         if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
110                 return;
111
112         /* fast path failed, unlock */
113         spin_unlock(lock);
114
115         /* Slow path 1) get global lock */
116         spin_lock(&nf_conntrack_locks_all_lock);
117
118         /* Slow path 2) get the lock we want */
119         spin_lock(lock);
120
121         /* Slow path 3) release the global lock */
122         spin_unlock(&nf_conntrack_locks_all_lock);
123 }
124 EXPORT_SYMBOL_GPL(nf_conntrack_lock);
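/* Illustrative sketch of a typical (hypothetical) caller: lock the bucket
 * a hash maps to, work on its chains, then release with a plain
 * spin_unlock():
 *
 *	unsigned int b = hash % CONNTRACK_LOCKS;
 *
 *	nf_conntrack_lock(&nf_conntrack_locks[b]);
 *	... walk or modify nf_conntrack_hash chains protected by bucket b ...
 *	spin_unlock(&nf_conntrack_locks[b]);
 *
 * The slow path above is only taken while a resize/cleanup has serialized
 * everything via nf_conntrack_locks_all_lock.
 */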
125
126 static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
127 {
128         h1 %= CONNTRACK_LOCKS;
129         h2 %= CONNTRACK_LOCKS;
130         spin_unlock(&nf_conntrack_locks[h1]);
131         if (h1 != h2)
132                 spin_unlock(&nf_conntrack_locks[h2]);
133 }
134
135 /* return true if we need to recompute hashes (in case hash table was resized) */
136 static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
137                                      unsigned int h2, unsigned int sequence)
138 {
139         h1 %= CONNTRACK_LOCKS;
140         h2 %= CONNTRACK_LOCKS;
141         if (h1 <= h2) {
142                 nf_conntrack_lock(&nf_conntrack_locks[h1]);
143                 if (h1 != h2)
144                         spin_lock_nested(&nf_conntrack_locks[h2],
145                                          SINGLE_DEPTH_NESTING);
146         } else {
147                 nf_conntrack_lock(&nf_conntrack_locks[h2]);
148                 spin_lock_nested(&nf_conntrack_locks[h1],
149                                  SINGLE_DEPTH_NESTING);
150         }
151         if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
152                 nf_conntrack_double_unlock(h1, h2);
153                 return true;
154         }
155         return false;
156 }
157
158 static void nf_conntrack_all_lock(void)
159         __acquires(&nf_conntrack_locks_all_lock)
160 {
161         int i;
162
163         spin_lock(&nf_conntrack_locks_all_lock);
164
165         /* For nf_conntrack_locks_all, only the latest time when another
166          * CPU will see an update is controlled by the "release" of the
167          * spin_lock below.
168          * The earliest time is not controlled, and thus KCSAN could detect
169          * a race when nf_conntrack_lock() reads the variable.
170          * WRITE_ONCE() is used to ensure the compiler will not
171          * optimize the write.
172          */
173         WRITE_ONCE(nf_conntrack_locks_all, true);
174
175         for (i = 0; i < CONNTRACK_LOCKS; i++) {
176                 spin_lock(&nf_conntrack_locks[i]);
177
178                 /* This spin_unlock provides the "release" to ensure that
179                  * nf_conntrack_locks_all==true is visible to everyone that
180                  * acquired spin_lock(&nf_conntrack_locks[]).
181                  */
182                 spin_unlock(&nf_conntrack_locks[i]);
183         }
184 }
185
186 static void nf_conntrack_all_unlock(void)
187         __releases(&nf_conntrack_locks_all_lock)
188 {
189         /* All prior stores must be complete before we clear
190          * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
191          * might observe the false value but not the entire
192          * critical section.
193          * It pairs with the smp_load_acquire() in nf_conntrack_lock()
194          */
195         smp_store_release(&nf_conntrack_locks_all, false);
196         spin_unlock(&nf_conntrack_locks_all_lock);
197 }
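/* Illustrative sketch: the resize/cleanup side brackets its exclusive
 * section with the two helpers above,
 *
 *	nf_conntrack_all_lock();
 *	... operate on all hash buckets exclusively ...
 *	nf_conntrack_all_unlock();
 *
 * Ordering recap: the spin_unlock() inside nf_conntrack_all_lock() makes
 * the WRITE_ONCE(nf_conntrack_locks_all, true) visible to any CPU that
 * subsequently takes the same bucket lock, and the smp_store_release()
 * above pairs with the smp_load_acquire() in nf_conntrack_lock() so that
 * lockers taking the fast path again also see the stores done under the
 * global lock.
 */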
198
199 unsigned int nf_conntrack_htable_size __read_mostly;
200 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
201
202 unsigned int nf_conntrack_max __read_mostly;
203 EXPORT_SYMBOL_GPL(nf_conntrack_max);
204 seqcount_spinlock_t nf_conntrack_generation __read_mostly;
205 static siphash_aligned_key_t nf_conntrack_hash_rnd;
206
207 static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
208                               unsigned int zoneid,
209                               const struct net *net)
210 {
211         struct {
212                 struct nf_conntrack_man src;
213                 union nf_inet_addr dst_addr;
214                 unsigned int zone;
215                 u32 net_mix;
216                 u16 dport;
217                 u16 proto;
218         } __aligned(SIPHASH_ALIGNMENT) combined;
219
220         get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
221
222         memset(&combined, 0, sizeof(combined));
223
224         /* The direction must be ignored, so handle usable members manually. */
225         combined.src = tuple->src;
226         combined.dst_addr = tuple->dst.u3;
227         combined.zone = zoneid;
228         combined.net_mix = net_hash_mix(net);
229         combined.dport = (__force __u16)tuple->dst.u.all;
230         combined.proto = tuple->dst.protonum;
231
232         return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd);
233 }
234
235 static u32 scale_hash(u32 hash)
236 {
237         return reciprocal_scale(hash, nf_conntrack_htable_size);
238 }
239
240 static u32 __hash_conntrack(const struct net *net,
241                             const struct nf_conntrack_tuple *tuple,
242                             unsigned int zoneid,
243                             unsigned int size)
244 {
245         return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size);
246 }
247
248 static u32 hash_conntrack(const struct net *net,
249                           const struct nf_conntrack_tuple *tuple,
250                           unsigned int zoneid)
251 {
252         return scale_hash(hash_conntrack_raw(tuple, zoneid, net));
253 }
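/* Illustrative sketch: the raw 32-bit siphash over (tuple, zone, netns) is
 * folded into a bucket index with reciprocal_scale(), e.g.
 *
 *	u32 raw  = hash_conntrack_raw(tuple, zone_id, net);
 *	u32 slot = reciprocal_scale(raw, nf_conntrack_htable_size);
 *
 * which is what hash_conntrack()/scale_hash() do; __hash_conntrack() takes
 * an explicit size so a caller can hash against a table of a different
 * size (e.g. while resizing).
 */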
254
255 static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
256                                   unsigned int dataoff,
257                                   struct nf_conntrack_tuple *tuple)
258 {       struct {
259                 __be16 sport;
260                 __be16 dport;
261         } _inet_hdr, *inet_hdr;
262
263         /* Actually only need first 4 bytes to get ports. */
264         inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
265         if (!inet_hdr)
266                 return false;
267
268         tuple->src.u.udp.port = inet_hdr->sport;
269         tuple->dst.u.udp.port = inet_hdr->dport;
270         return true;
271 }
272
273 static bool
274 nf_ct_get_tuple(const struct sk_buff *skb,
275                 unsigned int nhoff,
276                 unsigned int dataoff,
277                 u_int16_t l3num,
278                 u_int8_t protonum,
279                 struct net *net,
280                 struct nf_conntrack_tuple *tuple)
281 {
282         unsigned int size;
283         const __be32 *ap;
284         __be32 _addrs[8];
285
286         memset(tuple, 0, sizeof(*tuple));
287
288         tuple->src.l3num = l3num;
289         switch (l3num) {
290         case NFPROTO_IPV4:
291                 nhoff += offsetof(struct iphdr, saddr);
292                 size = 2 * sizeof(__be32);
293                 break;
294         case NFPROTO_IPV6:
295                 nhoff += offsetof(struct ipv6hdr, saddr);
296                 size = sizeof(_addrs);
297                 break;
298         default:
299                 return true;
300         }
301
302         ap = skb_header_pointer(skb, nhoff, size, _addrs);
303         if (!ap)
304                 return false;
305
306         switch (l3num) {
307         case NFPROTO_IPV4:
308                 tuple->src.u3.ip = ap[0];
309                 tuple->dst.u3.ip = ap[1];
310                 break;
311         case NFPROTO_IPV6:
312                 memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
313                 memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
314                 break;
315         }
316
317         tuple->dst.protonum = protonum;
318         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
319
320         switch (protonum) {
321 #if IS_ENABLED(CONFIG_IPV6)
322         case IPPROTO_ICMPV6:
323                 return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple);
324 #endif
325         case IPPROTO_ICMP:
326                 return icmp_pkt_to_tuple(skb, dataoff, net, tuple);
327 #ifdef CONFIG_NF_CT_PROTO_GRE
328         case IPPROTO_GRE:
329                 return gre_pkt_to_tuple(skb, dataoff, net, tuple);
330 #endif
331         case IPPROTO_TCP:
332         case IPPROTO_UDP: /* fallthrough */
333                 return nf_ct_get_tuple_ports(skb, dataoff, tuple);
334 #ifdef CONFIG_NF_CT_PROTO_UDPLITE
335         case IPPROTO_UDPLITE:
336                 return nf_ct_get_tuple_ports(skb, dataoff, tuple);
337 #endif
338 #ifdef CONFIG_NF_CT_PROTO_SCTP
339         case IPPROTO_SCTP:
340                 return nf_ct_get_tuple_ports(skb, dataoff, tuple);
341 #endif
342 #ifdef CONFIG_NF_CT_PROTO_DCCP
343         case IPPROTO_DCCP:
344                 return nf_ct_get_tuple_ports(skb, dataoff, tuple);
345 #endif
346         default:
347                 break;
348         }
349
350         return true;
351 }
352
353 static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
354                             u_int8_t *protonum)
355 {
356         int dataoff = -1;
357         const struct iphdr *iph;
358         struct iphdr _iph;
359
360         iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
361         if (!iph)
362                 return -1;
363
364         /* Conntrack defragments packets; we might still see fragments
365          * inside ICMP packets though.
366          */
367         if (iph->frag_off & htons(IP_OFFSET))
368                 return -1;
369
370         dataoff = nhoff + (iph->ihl << 2);
371         *protonum = iph->protocol;
372
373         /* Check bogus IP headers */
374         if (dataoff > skb->len) {
375                 pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
376                          nhoff, iph->ihl << 2, skb->len);
377                 return -1;
378         }
379         return dataoff;
380 }
381
382 #if IS_ENABLED(CONFIG_IPV6)
383 static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
384                             u8 *protonum)
385 {
386         int protoff = -1;
387         unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
388         __be16 frag_off;
389         u8 nexthdr;
390
391         if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
392                           &nexthdr, sizeof(nexthdr)) != 0) {
393                 pr_debug("can't get nexthdr\n");
394                 return -1;
395         }
396         protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
397         /*
398          * (protoff == skb->len) means the packet has no data, just the
399          * IPv6 header and possibly extension headers, but it is tracked anyway
400          */
401         if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
402                 pr_debug("can't find proto in pkt\n");
403                 return -1;
404         }
405
406         *protonum = nexthdr;
407         return protoff;
408 }
409 #endif
410
411 static int get_l4proto(const struct sk_buff *skb,
412                        unsigned int nhoff, u8 pf, u8 *l4num)
413 {
414         switch (pf) {
415         case NFPROTO_IPV4:
416                 return ipv4_get_l4proto(skb, nhoff, l4num);
417 #if IS_ENABLED(CONFIG_IPV6)
418         case NFPROTO_IPV6:
419                 return ipv6_get_l4proto(skb, nhoff, l4num);
420 #endif
421         default:
422                 *l4num = 0;
423                 break;
424         }
425         return -1;
426 }
427
428 bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
429                        u_int16_t l3num,
430                        struct net *net, struct nf_conntrack_tuple *tuple)
431 {
432         u8 protonum;
433         int protoff;
434
435         protoff = get_l4proto(skb, nhoff, l3num, &protonum);
436         if (protoff <= 0)
437                 return false;
438
439         return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
440 }
441 EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
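/* Illustrative sketch of a hypothetical caller that only has an skb and the
 * network header offset:
 *
 *	struct nf_conntrack_tuple tuple;
 *
 *	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
 *			       NFPROTO_IPV4, net, &tuple))
 *		return;		// header truncated or bogus packet
 *
 * On success the tuple holds the addresses, ports (or ICMP id/type/code)
 * and protocol number of the ORIGINAL direction.
 */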
442
443 bool
444 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
445                    const struct nf_conntrack_tuple *orig)
446 {
447         memset(inverse, 0, sizeof(*inverse));
448
449         inverse->src.l3num = orig->src.l3num;
450
451         switch (orig->src.l3num) {
452         case NFPROTO_IPV4:
453                 inverse->src.u3.ip = orig->dst.u3.ip;
454                 inverse->dst.u3.ip = orig->src.u3.ip;
455                 break;
456         case NFPROTO_IPV6:
457                 inverse->src.u3.in6 = orig->dst.u3.in6;
458                 inverse->dst.u3.in6 = orig->src.u3.in6;
459                 break;
460         default:
461                 break;
462         }
463
464         inverse->dst.dir = !orig->dst.dir;
465
466         inverse->dst.protonum = orig->dst.protonum;
467
468         switch (orig->dst.protonum) {
469         case IPPROTO_ICMP:
470                 return nf_conntrack_invert_icmp_tuple(inverse, orig);
471 #if IS_ENABLED(CONFIG_IPV6)
472         case IPPROTO_ICMPV6:
473                 return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
474 #endif
475         }
476
477         inverse->src.u.all = orig->dst.u.all;
478         inverse->dst.u.all = orig->src.u.all;
479         return true;
480 }
481 EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
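/* Illustrative example: for a plain UDP flow 10.0.0.1:1000 -> 10.0.0.2:53
 * the inverted tuple becomes 10.0.0.2:53 -> 10.0.0.1:1000 with dst.dir set
 * to IP_CT_DIR_REPLY.  ICMP/ICMPv6 need their own helpers because a reply
 * carries the matching id but a different type/code rather than swapped
 * ports.
 */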
482
483 /* Generate an almost-unique pseudo-id for a given conntrack.
484  *
485  * This intentionally doesn't re-use any of the seeds used for hash
486  * table location; we assume the id gets exposed to userspace.
487  *
488  * Following nf_conn items do not change throughout lifetime
489  * of the nf_conn:
490  *
491  * 1. nf_conn address
492  * 2. nf_conn->master address (normally NULL)
493  * 3. the associated net namespace
494  * 4. the original direction tuple
495  */
496 u32 nf_ct_get_id(const struct nf_conn *ct)
497 {
498         static siphash_aligned_key_t ct_id_seed;
499         unsigned long a, b, c, d;
500
501         net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));
502
503         a = (unsigned long)ct;
504         b = (unsigned long)ct->master;
505         c = (unsigned long)nf_ct_net(ct);
506         d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
507                                    sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
508                                    &ct_id_seed);
509 #ifdef CONFIG_64BIT
510         return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
511 #else
512         return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
513 #endif
514 }
515 EXPORT_SYMBOL_GPL(nf_ct_get_id);
516
517 static void
518 clean_from_lists(struct nf_conn *ct)
519 {
520         pr_debug("clean_from_lists(%p)\n", ct);
521         hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
522         hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
523
524         /* Destroy all pending expectations */
525         nf_ct_remove_expectations(ct);
526 }
527
528 #define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)
529
530 /* Released via nf_ct_destroy() */
531 struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
532                                  const struct nf_conntrack_zone *zone,
533                                  gfp_t flags)
534 {
535         struct nf_conn *tmpl, *p;
536
537         if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
538                 tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
539                 if (!tmpl)
540                         return NULL;
541
542                 p = tmpl;
543                 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
544                 if (tmpl != p) {
545                         tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
546                         tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
547                 }
548         } else {
549                 tmpl = kzalloc(sizeof(*tmpl), flags);
550                 if (!tmpl)
551                         return NULL;
552         }
553
554         tmpl->status = IPS_TEMPLATE;
555         write_pnet(&tmpl->ct_net, net);
556         nf_ct_zone_add(tmpl, zone);
557         refcount_set(&tmpl->ct_general.use, 1);
558
559         return tmpl;
560 }
561 EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
562
563 void nf_ct_tmpl_free(struct nf_conn *tmpl)
564 {
565         kfree(tmpl->ext);
566
567         if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
568                 kfree((char *)tmpl - tmpl->proto.tmpl_padto);
569         else
570                 kfree(tmpl);
571 }
572 EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
573
574 static void destroy_gre_conntrack(struct nf_conn *ct)
575 {
576 #ifdef CONFIG_NF_CT_PROTO_GRE
577         struct nf_conn *master = ct->master;
578
579         if (master)
580                 nf_ct_gre_keymap_destroy(master);
581 #endif
582 }
583
584 void nf_ct_destroy(struct nf_conntrack *nfct)
585 {
586         struct nf_conn *ct = (struct nf_conn *)nfct;
587
588         pr_debug("%s(%p)\n", __func__, ct);
589         WARN_ON(refcount_read(&nfct->use) != 0);
590
591         if (unlikely(nf_ct_is_template(ct))) {
592                 nf_ct_tmpl_free(ct);
593                 return;
594         }
595
596         if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
597                 destroy_gre_conntrack(ct);
598
599         /* Expectations will have been removed in clean_from_lists,
600          * except TFTP can create an expectation on the first packet,
601          * before the connection is in the list, so we need to clean here,
602          * too.
603          */
604         nf_ct_remove_expectations(ct);
605
606         if (ct->master)
607                 nf_ct_put(ct->master);
608
609         pr_debug("%s: returning ct=%p to slab\n", __func__, ct);
610         nf_conntrack_free(ct);
611 }
612 EXPORT_SYMBOL(nf_ct_destroy);
613
614 static void __nf_ct_delete_from_lists(struct nf_conn *ct)
615 {
616         struct net *net = nf_ct_net(ct);
617         unsigned int hash, reply_hash;
618         unsigned int sequence;
619
620         do {
621                 sequence = read_seqcount_begin(&nf_conntrack_generation);
622                 hash = hash_conntrack(net,
623                                       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
624                                       nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
625                 reply_hash = hash_conntrack(net,
626                                            &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
627                                            nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
628         } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
629
630         clean_from_lists(ct);
631         nf_conntrack_double_unlock(hash, reply_hash);
632 }
633
634 static void nf_ct_delete_from_lists(struct nf_conn *ct)
635 {
636         nf_ct_helper_destroy(ct);
637         local_bh_disable();
638
639         __nf_ct_delete_from_lists(ct);
640
641         local_bh_enable();
642 }
643
644 static void nf_ct_add_to_ecache_list(struct nf_conn *ct)
645 {
646 #ifdef CONFIG_NF_CONNTRACK_EVENTS
647         struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct));
648
649         spin_lock(&cnet->ecache.dying_lock);
650         hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
651                                  &cnet->ecache.dying_list);
652         spin_unlock(&cnet->ecache.dying_lock);
653 #endif
654 }
655
656 bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
657 {
658         struct nf_conn_tstamp *tstamp;
659         struct net *net;
660
661         if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
662                 return false;
663
664         tstamp = nf_conn_tstamp_find(ct);
665         if (tstamp) {
666                 s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;
667
668                 tstamp->stop = ktime_get_real_ns();
669                 if (timeout < 0)
670                         tstamp->stop -= jiffies_to_nsecs(-timeout);
671         }
672
673         if (nf_conntrack_event_report(IPCT_DESTROY, ct,
674                                     portid, report) < 0) {
675                 /* destroy event was not delivered. nf_ct_put will
676                  * be done by event cache worker on redelivery.
677                  */
678                 nf_ct_helper_destroy(ct);
679                 local_bh_disable();
680                 __nf_ct_delete_from_lists(ct);
681                 nf_ct_add_to_ecache_list(ct);
682                 local_bh_enable();
683
684                 nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
685                 return false;
686         }
687
688         net = nf_ct_net(ct);
689         if (nf_conntrack_ecache_dwork_pending(net))
690                 nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT);
691         nf_ct_delete_from_lists(ct);
692         nf_ct_put(ct);
693         return true;
694 }
695 EXPORT_SYMBOL_GPL(nf_ct_delete);
696
697 static inline bool
698 nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
699                 const struct nf_conntrack_tuple *tuple,
700                 const struct nf_conntrack_zone *zone,
701                 const struct net *net)
702 {
703         struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
704
705         /* A conntrack can be recreated with an equal tuple,
706          * so we need to check that the conntrack is confirmed
707          */
708         return nf_ct_tuple_equal(tuple, &h->tuple) &&
709                nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
710                nf_ct_is_confirmed(ct) &&
711                net_eq(net, nf_ct_net(ct));
712 }
713
714 static inline bool
715 nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
716 {
717         return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
718                                  &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
719                nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
720                                  &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
721                nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
722                nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
723                net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
724 }
725
726 /* caller must hold rcu readlock and none of the nf_conntrack_locks */
727 static void nf_ct_gc_expired(struct nf_conn *ct)
728 {
729         if (!refcount_inc_not_zero(&ct->ct_general.use))
730                 return;
731
732         /* load ->status after refcount increase */
733         smp_acquire__after_ctrl_dep();
734
735         if (nf_ct_should_gc(ct))
736                 nf_ct_kill(ct);
737
738         nf_ct_put(ct);
739 }
740
741 /*
742  * Warning:
743  * - Caller must take a reference on returned object
744  *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
745  */
746 static struct nf_conntrack_tuple_hash *
747 ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
748                       const struct nf_conntrack_tuple *tuple, u32 hash)
749 {
750         struct nf_conntrack_tuple_hash *h;
751         struct hlist_nulls_head *ct_hash;
752         struct hlist_nulls_node *n;
753         unsigned int bucket, hsize;
754
755 begin:
756         nf_conntrack_get_ht(&ct_hash, &hsize);
757         bucket = reciprocal_scale(hash, hsize);
758
759         hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
760                 struct nf_conn *ct;
761
762                 ct = nf_ct_tuplehash_to_ctrack(h);
763                 if (nf_ct_is_expired(ct)) {
764                         nf_ct_gc_expired(ct);
765                         continue;
766                 }
767
768                 if (nf_ct_key_equal(h, tuple, zone, net))
769                         return h;
770         }
771         /*
772          * if the nulls value we got at the end of this lookup is
773          * not the expected one, we must restart the lookup.
774          * We probably met an item that was moved to another chain.
775          */
776         if (get_nulls_value(n) != bucket) {
777                 NF_CT_STAT_INC_ATOMIC(net, search_restart);
778                 goto begin;
779         }
780
781         return NULL;
782 }
783
784 /* Find a connection corresponding to a tuple. */
785 static struct nf_conntrack_tuple_hash *
786 __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
787                         const struct nf_conntrack_tuple *tuple, u32 hash)
788 {
789         struct nf_conntrack_tuple_hash *h;
790         struct nf_conn *ct;
791
792         rcu_read_lock();
793
794         h = ____nf_conntrack_find(net, zone, tuple, hash);
795         if (h) {
796                 /* We have a candidate that matches the tuple we're interested
797                  * in, try to obtain a reference and re-check tuple
798                  */
799                 ct = nf_ct_tuplehash_to_ctrack(h);
800                 if (likely(refcount_inc_not_zero(&ct->ct_general.use))) {
801                         /* re-check key after refcount */
802                         smp_acquire__after_ctrl_dep();
803
804                         if (likely(nf_ct_key_equal(h, tuple, zone, net)))
805                                 goto found;
806
807                         /* TYPESAFE_BY_RCU recycled the candidate */
808                         nf_ct_put(ct);
809                 }
810
811                 h = NULL;
812         }
813 found:
814         rcu_read_unlock();
815
816         return h;
817 }
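/* Illustrative sketch of why the reference-then-recheck dance above is
 * needed with SLAB_TYPESAFE_BY_RCU:
 *
 *	CPU A (lookup)				CPU B
 *	hlist walk finds 'ct'
 *						nf_ct_put(): refcount hits 0
 *						kmem_cache_free(ct)
 *						object recycled for a new flow
 *	refcount_inc_not_zero(ct) succeeds
 *	smp_acquire__after_ctrl_dep()
 *	nf_ct_key_equal() recheck fails
 *	nf_ct_put(ct), h = NULL
 *
 * Without the acquire barrier the ->status and tuple loads could be
 * satisfied before the refcount check, i.e. against the old incarnation
 * of the object, defeating the recheck.
 */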
818
819 struct nf_conntrack_tuple_hash *
820 nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
821                       const struct nf_conntrack_tuple *tuple)
822 {
823         unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
824         struct nf_conntrack_tuple_hash *thash;
825
826         thash = __nf_conntrack_find_get(net, zone, tuple,
827                                         hash_conntrack_raw(tuple, zone_id, net));
828
829         if (thash)
830                 return thash;
831
832         rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
833         if (rid != zone_id)
834                 return __nf_conntrack_find_get(net, zone, tuple,
835                                                hash_conntrack_raw(tuple, rid, net));
836         return thash;
837 }
838 EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
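/* Illustrative sketch of a hypothetical caller: a successful lookup returns
 * a tuple hash with an elevated refcount that must be dropped when done.
 *
 *	struct nf_conntrack_tuple_hash *h;
 *	struct nf_conn *ct;
 *
 *	h = nf_conntrack_find_get(net, zone, &tuple);
 *	if (!h)
 *		return;
 *	ct = nf_ct_tuplehash_to_ctrack(h);
 *	... use ct ...
 *	nf_ct_put(ct);
 */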
839
840 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
841                                        unsigned int hash,
842                                        unsigned int reply_hash)
843 {
844         hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
845                            &nf_conntrack_hash[hash]);
846         hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
847                            &nf_conntrack_hash[reply_hash]);
848 }
849
850 static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext)
851 {
852         /* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions
853          * may contain stale pointers to e.g. a helper that has been removed.
854          *
855          * The helper can't clear this because the nf_conn object isn't in
856          * any hash and synchronize_rcu() isn't enough because the associated skb
857          * might sit in a queue.
858          */
859         return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid);
860 }
861
862 static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext)
863 {
864         if (!ext)
865                 return true;
866
867         if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid))
868                 return false;
869
870         /* inserted into conntrack table, nf_ct_iterate_cleanup()
871          * will find it.  Disable nf_ct_ext_find() id check.
872          */
873         WRITE_ONCE(ext->gen_id, 0);
874         return true;
875 }
876
877 int
878 nf_conntrack_hash_check_insert(struct nf_conn *ct)
879 {
880         const struct nf_conntrack_zone *zone;
881         struct net *net = nf_ct_net(ct);
882         unsigned int hash, reply_hash;
883         struct nf_conntrack_tuple_hash *h;
884         struct hlist_nulls_node *n;
885         unsigned int max_chainlen;
886         unsigned int chainlen = 0;
887         unsigned int sequence;
888         int err = -EEXIST;
889
890         zone = nf_ct_zone(ct);
891
892         if (!nf_ct_ext_valid_pre(ct->ext)) {
893                 NF_CT_STAT_INC(net, insert_failed);
894                 return -ETIMEDOUT;
895         }
896
897         local_bh_disable();
898         do {
899                 sequence = read_seqcount_begin(&nf_conntrack_generation);
900                 hash = hash_conntrack(net,
901                                       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
902                                       nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
903                 reply_hash = hash_conntrack(net,
904                                            &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
905                                            nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
906         } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
907
908         max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN);
909
910         /* See if there's one in the list already, including reverse */
911         hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
912                 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
913                                     zone, net))
914                         goto out;
915
916                 if (chainlen++ > max_chainlen)
917                         goto chaintoolong;
918         }
919
920         chainlen = 0;
921
922         hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
923                 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
924                                     zone, net))
925                         goto out;
926                 if (chainlen++ > max_chainlen)
927                         goto chaintoolong;
928         }
929
930         smp_wmb();
931         /* The caller holds a reference to this object */
932         refcount_set(&ct->ct_general.use, 2);
933         __nf_conntrack_hash_insert(ct, hash, reply_hash);
934         nf_conntrack_double_unlock(hash, reply_hash);
935         NF_CT_STAT_INC(net, insert);
936         local_bh_enable();
937
938         if (!nf_ct_ext_valid_post(ct->ext)) {
939                 nf_ct_kill(ct);
940                 NF_CT_STAT_INC(net, drop);
941                 return -ETIMEDOUT;
942         }
943
944         return 0;
945 chaintoolong:
946         NF_CT_STAT_INC(net, chaintoolong);
947         err = -ENOSPC;
948 out:
949         nf_conntrack_double_unlock(hash, reply_hash);
950         local_bh_enable();
951         return err;
952 }
953 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
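/* Illustrative sketch: entries created outside the packet path (e.g. via
 * ctnetlink) are inserted with the helper above; a hypothetical caller
 * handles its error codes like so:
 *
 *	err = nf_conntrack_hash_check_insert(ct);
 *	switch (err) {
 *	case 0:			// inserted, table holds its own reference
 *		break;
 *	case -EEXIST:		// identical tuple already present
 *	case -ENOSPC:		// hash chain longer than max_chainlen
 *	case -ETIMEDOUT:	// extension area went stale
 *		goto err_put;	// hypothetical error label
 *	}
 */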
954
955 void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
956                     unsigned int bytes)
957 {
958         struct nf_conn_acct *acct;
959
960         acct = nf_conn_acct_find(ct);
961         if (acct) {
962                 struct nf_conn_counter *counter = acct->counter;
963
964                 atomic64_add(packets, &counter[dir].packets);
965                 atomic64_add(bytes, &counter[dir].bytes);
966         }
967 }
968 EXPORT_SYMBOL_GPL(nf_ct_acct_add);
969
970 static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
971                              const struct nf_conn *loser_ct)
972 {
973         struct nf_conn_acct *acct;
974
975         acct = nf_conn_acct_find(loser_ct);
976         if (acct) {
977                 struct nf_conn_counter *counter = acct->counter;
978                 unsigned int bytes;
979
980                 /* u32 should be fine since we must have seen one packet. */
981                 bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
982                 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
983         }
984 }
985
986 static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
987 {
988         struct nf_conn_tstamp *tstamp;
989
990         refcount_inc(&ct->ct_general.use);
991
992         /* set conntrack timestamp, if enabled. */
993         tstamp = nf_conn_tstamp_find(ct);
994         if (tstamp)
995                 tstamp->start = ktime_get_real_ns();
996 }
997
998 /* caller must hold locks to prevent concurrent changes */
999 static int __nf_ct_resolve_clash(struct sk_buff *skb,
1000                                  struct nf_conntrack_tuple_hash *h)
1001 {
1002         /* This is the conntrack entry already in hashes that won race. */
1003         struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1004         enum ip_conntrack_info ctinfo;
1005         struct nf_conn *loser_ct;
1006
1007         loser_ct = nf_ct_get(skb, &ctinfo);
1008
1009         if (nf_ct_is_dying(ct))
1010                 return NF_DROP;
1011
1012         if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
1013             nf_ct_match(ct, loser_ct)) {
1014                 struct net *net = nf_ct_net(ct);
1015
1016                 nf_conntrack_get(&ct->ct_general);
1017
1018                 nf_ct_acct_merge(ct, ctinfo, loser_ct);
1019                 nf_ct_put(loser_ct);
1020                 nf_ct_set(skb, ct, ctinfo);
1021
1022                 NF_CT_STAT_INC(net, clash_resolve);
1023                 return NF_ACCEPT;
1024         }
1025
1026         return NF_DROP;
1027 }
1028
1029 /**
1030  * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
1031  *
1032  * @skb: skb that causes the collision
1033  * @repl_idx: hash slot for reply direction
1034  *
1035  * Called when origin or reply direction had a clash.
1036  * The skb can be handled without packet drop provided the reply direction
1037  * is unique or there the existing entry has the identical tuple in both
1038  * directions.
1039  *
1040  * Caller must hold conntrack table locks to prevent concurrent updates.
1041  *
1042  * Returns NF_DROP if the clash could not be handled.
1043  */
1044 static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
1045 {
1046         struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
1047         const struct nf_conntrack_zone *zone;
1048         struct nf_conntrack_tuple_hash *h;
1049         struct hlist_nulls_node *n;
1050         struct net *net;
1051
1052         zone = nf_ct_zone(loser_ct);
1053         net = nf_ct_net(loser_ct);
1054
1055         /* Reply direction must never result in a clash, unless both origin
1056          * and reply tuples are identical.
1057          */
1058         hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
1059                 if (nf_ct_key_equal(h,
1060                                     &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
1061                                     zone, net))
1062                         return __nf_ct_resolve_clash(skb, h);
1063         }
1064
1065         /* We want the clashing entry to go away real soon: 1 second timeout. */
1066         WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);
1067
1068         /* IPS_NAT_CLASH removes the entry automatically on the first
1069          * reply.  Also prevents UDP tracker from moving the entry to
1070          * ASSURED state, i.e. the entry can always be evicted under
1071          * pressure.
1072          */
1073         loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;
1074
1075         __nf_conntrack_insert_prepare(loser_ct);
1076
1077         /* fake add for ORIGINAL dir: we want lookups to only find the entry
1078          * already in the table.  This also hides the clashing entry from
1079          * ctnetlink iteration, i.e. conntrack -L won't show them.
1080          */
1081         hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
1082
1083         hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
1084                                  &nf_conntrack_hash[repl_idx]);
1085
1086         NF_CT_STAT_INC(net, clash_resolve);
1087         return NF_ACCEPT;
1088 }
1089
1090 /**
1091  * nf_ct_resolve_clash - attempt to handle clash without packet drop
1092  *
1093  * @skb: skb that causes the clash
1094  * @h: tuplehash of the clashing entry already in table
1095  * @reply_hash: hash slot for reply direction
1096  *
1097  * A conntrack entry can be inserted to the connection tracking table
1098  * if there is no existing entry with an identical tuple.
1099  *
1100  * If there is one, @skb (and the associated, unconfirmed conntrack) has
1101  * to be dropped.  In case @skb is retransmitted, next conntrack lookup
1102  * will find the already-existing entry.
1103  *
1104  * The major problem with such packet drop is the extra delay added by
1105  * the packet loss -- it will take some time for a retransmit to occur
1106  * (or the sender to time out when waiting for a reply).
1107  *
1108  * This function attempts to handle the situation without packet drop.
1109  *
1110  * If @skb has no NAT transformation or if the colliding entries are
1111  * exactly the same, only the to-be-confirmed conntrack entry is discarded
1112  * and @skb is associated with the conntrack entry already in the table.
1113  *
1114  * Failing that, the new, unconfirmed conntrack is still added to the table
1115  * provided that the collision only occurs in the ORIGINAL direction.
1116  * The new entry will be added only in the non-clashing REPLY direction,
1117  * so packets in the ORIGINAL direction will continue to match the existing
1118  * entry.  The new entry will also have a fixed timeout so it expires --
1119  * due to the collision, it will only see reply traffic.
1120  *
1121  * Returns NF_DROP if the clash could not be resolved.
1122  */
1123 static __cold noinline int
1124 nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
1125                     u32 reply_hash)
1126 {
1127         /* This is the conntrack entry already in hashes that won race. */
1128         struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1129         const struct nf_conntrack_l4proto *l4proto;
1130         enum ip_conntrack_info ctinfo;
1131         struct nf_conn *loser_ct;
1132         struct net *net;
1133         int ret;
1134
1135         loser_ct = nf_ct_get(skb, &ctinfo);
1136         net = nf_ct_net(loser_ct);
1137
1138         l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
1139         if (!l4proto->allow_clash)
1140                 goto drop;
1141
1142         ret = __nf_ct_resolve_clash(skb, h);
1143         if (ret == NF_ACCEPT)
1144                 return ret;
1145
1146         ret = nf_ct_resolve_clash_harder(skb, reply_hash);
1147         if (ret == NF_ACCEPT)
1148                 return ret;
1149
1150 drop:
1151         NF_CT_STAT_INC(net, drop);
1152         NF_CT_STAT_INC(net, insert_failed);
1153         return NF_DROP;
1154 }
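/* Illustrative timeline for the clash handling above, e.g. two threads
 * sending over the same unconnected UDP socket at once:
 *
 *	CPU A					CPU B
 *	creates unconfirmed ct A		creates unconfirmed ct B
 *	confirms A (wins the race)
 *						__nf_conntrack_confirm() finds A
 *						__nf_ct_resolve_clash(): no NAT,
 *						or identical tuples: drop B,
 *						attach skb to A, NF_ACCEPT
 *
 * Only if that fails, and only the ORIGINAL direction clashes, is B
 * inserted reply-only by the "harder" variant with a one second
 * IPS_NAT_CLASH timeout.
 */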
1155
1156 /* Confirm a connection given skb; places it in hash table */
1157 int
1158 __nf_conntrack_confirm(struct sk_buff *skb)
1159 {
1160         unsigned int chainlen = 0, sequence, max_chainlen;
1161         const struct nf_conntrack_zone *zone;
1162         unsigned int hash, reply_hash;
1163         struct nf_conntrack_tuple_hash *h;
1164         struct nf_conn *ct;
1165         struct nf_conn_help *help;
1166         struct hlist_nulls_node *n;
1167         enum ip_conntrack_info ctinfo;
1168         struct net *net;
1169         int ret = NF_DROP;
1170
1171         ct = nf_ct_get(skb, &ctinfo);
1172         net = nf_ct_net(ct);
1173
1174         /* ipt_REJECT uses nf_conntrack_attach to attach related
1175            ICMP/TCP RST packets in other direction.  Actual packet
1176            which created connection will be IP_CT_NEW or for an
1177            expected connection, IP_CT_RELATED. */
1178         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
1179                 return NF_ACCEPT;
1180
1181         zone = nf_ct_zone(ct);
1182         local_bh_disable();
1183
1184         do {
1185                 sequence = read_seqcount_begin(&nf_conntrack_generation);
1186                 /* reuse the hash saved before */
1187                 hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
1188                 hash = scale_hash(hash);
1189                 reply_hash = hash_conntrack(net,
1190                                            &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
1191                                            nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
1192         } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
1193
1194         /* We're not in the hash table, and we refuse to set up related
1195          * connections for unconfirmed conns.  But packet copies and
1196          * REJECT will give spurious warnings here.
1197          */
1198
1199         /* Another skb with the same unconfirmed conntrack may
1200          * win the race. This may happen with bridge (br_flood)
1201          * or broadcast/multicast packets, which are skb_clone()d
1202          * while the conntrack is still unconfirmed.
1203          */
1204         if (unlikely(nf_ct_is_confirmed(ct))) {
1205                 WARN_ON_ONCE(1);
1206                 nf_conntrack_double_unlock(hash, reply_hash);
1207                 local_bh_enable();
1208                 return NF_DROP;
1209         }
1210
1211         if (!nf_ct_ext_valid_pre(ct->ext)) {
1212                 NF_CT_STAT_INC(net, insert_failed);
1213                 goto dying;
1214         }
1215
1216         pr_debug("Confirming conntrack %p\n", ct);
1217         /* We have to check the DYING flag after unlink to prevent
1218          * a race against nf_ct_get_next_corpse() possibly called from
1219          * user context, else we insert an already 'dead' hash, blocking
1220          * further use of that particular connection -JM.
1221          */
1222         ct->status |= IPS_CONFIRMED;
1223
1224         if (unlikely(nf_ct_is_dying(ct))) {
1225                 NF_CT_STAT_INC(net, insert_failed);
1226                 goto dying;
1227         }
1228
1229         max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN);
1230         /* See if there's one in the list already, including reverse:
1231            NAT could have grabbed it without realizing, since we're
1232            not in the hash.  If there is, we lost the race. */
1233         hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
1234                 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
1235                                     zone, net))
1236                         goto out;
1237                 if (chainlen++ > max_chainlen)
1238                         goto chaintoolong;
1239         }
1240
1241         chainlen = 0;
1242         hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
1243                 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
1244                                     zone, net))
1245                         goto out;
1246                 if (chainlen++ > max_chainlen) {
1247 chaintoolong:
1248                         NF_CT_STAT_INC(net, chaintoolong);
1249                         NF_CT_STAT_INC(net, insert_failed);
1250                         ret = NF_DROP;
1251                         goto dying;
1252                 }
1253         }
1254
1255         /* Timer relative to confirmation time, not original
1256            setting time, otherwise we'd get timer wrap in
1257            weird delay cases. */
1258         ct->timeout += nfct_time_stamp;
1259
1260         __nf_conntrack_insert_prepare(ct);
1261
1262         /* Since the lookup is lockless, hash insertion must be done after
1263          * starting the timer and setting the CONFIRMED bit. The RCU barriers
1264          * guarantee that no other CPU can find the conntrack before the above
1265          * stores are visible.
1266          */
1267         __nf_conntrack_hash_insert(ct, hash, reply_hash);
1268         nf_conntrack_double_unlock(hash, reply_hash);
1269         local_bh_enable();
1270
1271         /* ext area is still valid (rcu read lock is held),
1272          * but will go out of scope soon; if it went stale we
1273          * need to remove this conntrack again.
1274          */
1275         if (!nf_ct_ext_valid_post(ct->ext)) {
1276                 nf_ct_kill(ct);
1277                 NF_CT_STAT_INC(net, drop);
1278                 return NF_DROP;
1279         }
1280
1281         help = nfct_help(ct);
1282         if (help && help->helper)
1283                 nf_conntrack_event_cache(IPCT_HELPER, ct);
1284
1285         nf_conntrack_event_cache(master_ct(ct) ?
1286                                  IPCT_RELATED : IPCT_NEW, ct);
1287         return NF_ACCEPT;
1288
1289 out:
1290         ret = nf_ct_resolve_clash(skb, h, reply_hash);
1291 dying:
1292         nf_conntrack_double_unlock(hash, reply_hash);
1293         local_bh_enable();
1294         return ret;
1295 }
1296 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
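/* Ordering recap for the function above: IPS_CONFIRMED and the absolute
 * timeout are stored before __nf_conntrack_hash_insert() publishes the
 * entry, so a lockless lookup that finds the node and takes a reference
 * (with the acquire barrier after refcount_inc_not_zero()) also observes
 * the entry as confirmed; nf_ct_key_equal() deliberately rejects
 * unconfirmed entries to filter out recycled or half-initialized objects.
 */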
1297
1298 /* Returns true if a connection corresponds to the tuple (required
1299    for NAT). */
1300 int
1301 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
1302                          const struct nf_conn *ignored_conntrack)
1303 {
1304         struct net *net = nf_ct_net(ignored_conntrack);
1305         const struct nf_conntrack_zone *zone;
1306         struct nf_conntrack_tuple_hash *h;
1307         struct hlist_nulls_head *ct_hash;
1308         unsigned int hash, hsize;
1309         struct hlist_nulls_node *n;
1310         struct nf_conn *ct;
1311
1312         zone = nf_ct_zone(ignored_conntrack);
1313
1314         rcu_read_lock();
1315  begin:
1316         nf_conntrack_get_ht(&ct_hash, &hsize);
1317         hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);
1318
1319         hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
1320                 ct = nf_ct_tuplehash_to_ctrack(h);
1321
1322                 if (ct == ignored_conntrack)
1323                         continue;
1324
1325                 if (nf_ct_is_expired(ct)) {
1326                         nf_ct_gc_expired(ct);
1327                         continue;
1328                 }
1329
1330                 if (nf_ct_key_equal(h, tuple, zone, net)) {
1331                         /* Tuple is taken already, so caller will need to find
1332                          * a new source port to use.
1333                          *
1334                          * Only exception:
1335                          * If the *original tuples* are identical, then both
1336                          * conntracks refer to the same flow.
1337                          * This is a rare situation, it can occur e.g. when
1338                          * more than one UDP packet is sent from same socket
1339                          * in different threads.
1340                          *
1341                          * Let nf_ct_resolve_clash() deal with this later.
1342                          */
1343                         if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
1344                                               &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
1345                                               nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
1346                                 continue;
1347
1348                         NF_CT_STAT_INC_ATOMIC(net, found);
1349                         rcu_read_unlock();
1350                         return 1;
1351                 }
1352         }
1353
1354         if (get_nulls_value(n) != hash) {
1355                 NF_CT_STAT_INC_ATOMIC(net, search_restart);
1356                 goto begin;
1357         }
1358
1359         rcu_read_unlock();
1360
1361         return 0;
1362 }
1363 EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
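/* Illustrative sketch: NAT-style users probe candidate reply tuples until
 * they find a free one, roughly along these (hypothetical) lines:
 *
 *	for (port = low; port <= high; port++) {
 *		tuple.dst.u.udp.port = htons(port);
 *		if (!nf_conntrack_tuple_taken(&tuple, ct))
 *			return port;	// unique, safe to use
 *	}
 *
 * where 'tuple' is the would-be REPLY direction tuple and 'ct' is the entry
 * being NATed, so that it does not collide with itself.
 */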
1364
1365 #define NF_CT_EVICTION_RANGE    8
1366
1367 /* There's a small race here where we may free a just-assured
1368    connection.  Too bad: we're in trouble anyway. */
1369 static unsigned int early_drop_list(struct net *net,
1370                                     struct hlist_nulls_head *head)
1371 {
1372         struct nf_conntrack_tuple_hash *h;
1373         struct hlist_nulls_node *n;
1374         unsigned int drops = 0;
1375         struct nf_conn *tmp;
1376
1377         hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
1378                 tmp = nf_ct_tuplehash_to_ctrack(h);
1379
1380                 if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
1381                         continue;
1382
1383                 if (nf_ct_is_expired(tmp)) {
1384                         nf_ct_gc_expired(tmp);
1385                         continue;
1386                 }
1387
1388                 if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
1389                     !net_eq(nf_ct_net(tmp), net) ||
1390                     nf_ct_is_dying(tmp))
1391                         continue;
1392
1393                 if (!refcount_inc_not_zero(&tmp->ct_general.use))
1394                         continue;
1395
1396                 /* load ->ct_net and ->status after refcount increase */
1397                 smp_acquire__after_ctrl_dep();
1398
1399                 /* kill only if still in same netns -- might have moved due to
1400                  * SLAB_TYPESAFE_BY_RCU rules.
1401                  *
1402                  * We steal the timer reference.  If that fails, the timer has
1403                  * already fired or someone else deleted it. Just drop the ref
1404                  * and move to next entry.
1405                  */
1406                 if (net_eq(nf_ct_net(tmp), net) &&
1407                     nf_ct_is_confirmed(tmp) &&
1408                     nf_ct_delete(tmp, 0, 0))
1409                         drops++;
1410
1411                 nf_ct_put(tmp);
1412         }
1413
1414         return drops;
1415 }
1416
1417 static noinline int early_drop(struct net *net, unsigned int hash)
1418 {
1419         unsigned int i, bucket;
1420
1421         for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
1422                 struct hlist_nulls_head *ct_hash;
1423                 unsigned int hsize, drops;
1424
1425                 rcu_read_lock();
1426                 nf_conntrack_get_ht(&ct_hash, &hsize);
1427                 if (!i)
1428                         bucket = reciprocal_scale(hash, hsize);
1429                 else
1430                         bucket = (bucket + 1) % hsize;
1431
1432                 drops = early_drop_list(net, &ct_hash[bucket]);
1433                 rcu_read_unlock();
1434
1435                 if (drops) {
1436                         NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
1437                         return true;
1438                 }
1439         }
1440
1441         return false;
1442 }
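/* Illustrative note: early_drop() is the "table full" fallback; when a new
 * entry would push the conntrack count past nf_conntrack_max, the
 * allocation path typically tries to evict one not-yet-assured entry from
 * up to NF_CT_EVICTION_RANGE buckets near the new entry's hash before the
 * packet has to be dropped.
 */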
1443
1444 static bool gc_worker_skip_ct(const struct nf_conn *ct)
1445 {
1446         return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
1447 }
1448
1449 static bool gc_worker_can_early_drop(const struct nf_conn *ct)
1450 {
1451         const struct nf_conntrack_l4proto *l4proto;
1452
1453         if (!test_bit(IPS_ASSURED_BIT, &ct->status))
1454                 return true;
1455
1456         l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
1457         if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
1458                 return true;
1459
1460         return false;
1461 }
1462
1463 static void gc_worker(struct work_struct *work)
1464 {
1465         unsigned int i, hashsz, nf_conntrack_max95 = 0;
1466         u32 end_time, start_time = nfct_time_stamp;
1467         struct conntrack_gc_work *gc_work;
1468         unsigned int expired_count = 0;
1469         unsigned long next_run;
1470         s32 delta_time;
1471
1472         gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
1473
1474         i = gc_work->next_bucket;
1475         if (gc_work->early_drop)
1476                 nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
1477
1478         if (i == 0) {
1479                 gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
1480                 gc_work->start_time = start_time;
1481         }
1482
1483         next_run = gc_work->avg_timeout;
1484
1485         end_time = start_time + GC_SCAN_MAX_DURATION;
1486
1487         do {
1488                 struct nf_conntrack_tuple_hash *h;
1489                 struct hlist_nulls_head *ct_hash;
1490                 struct hlist_nulls_node *n;
1491                 struct nf_conn *tmp;
1492
1493                 rcu_read_lock();
1494
1495                 nf_conntrack_get_ht(&ct_hash, &hashsz);
1496                 if (i >= hashsz) {
1497                         rcu_read_unlock();
1498                         break;
1499                 }
1500
1501                 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
1502                         struct nf_conntrack_net *cnet;
1503                         unsigned long expires;
1504                         struct net *net;
1505
1506                         tmp = nf_ct_tuplehash_to_ctrack(h);
1507
1508                         if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
1509                                 nf_ct_offload_timeout(tmp);
1510                                 continue;
1511                         }
1512
1513                         if (expired_count > GC_SCAN_EXPIRED_MAX) {
1514                                 rcu_read_unlock();
1515
1516                                 gc_work->next_bucket = i;
1517                                 gc_work->avg_timeout = next_run;
1518
1519                                 delta_time = nfct_time_stamp - gc_work->start_time;
1520
1521                                 /* re-sched immediately if total cycle time is exceeded */
1522                                 next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX;
1523                                 goto early_exit;
1524                         }
1525
1526                         if (nf_ct_is_expired(tmp)) {
1527                                 nf_ct_gc_expired(tmp);
1528                                 expired_count++;
1529                                 continue;
1530                         }
1531
1532                         expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
1533                         next_run += expires;
1534                         next_run /= 2u;
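                        /* Running average: each non-expired entry moves
                         * next_run halfway toward its own clamped remaining
                         * lifetime, so the most recently scanned entries
                         * dominate the interval chosen for the next scan.
                         */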
1535
1536                         if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
1537                                 continue;
1538
1539                         net = nf_ct_net(tmp);
1540                         cnet = nf_ct_pernet(net);
1541                         if (atomic_read(&cnet->count) < nf_conntrack_max95)
1542                                 continue;
1543
1544                         /* need to take reference to avoid possible races */
1545                         if (!refcount_inc_not_zero(&tmp->ct_general.use))
1546                                 continue;
1547
1548                         /* load ->status after refcount increase */
1549                         smp_acquire__after_ctrl_dep();
1550
1551                         if (gc_worker_skip_ct(tmp)) {
1552                                 nf_ct_put(tmp);
1553                                 continue;
1554                         }
1555
1556                         if (gc_worker_can_early_drop(tmp)) {
1557                                 nf_ct_kill(tmp);
1558                                 expired_count++;
1559                         }
1560
1561                         nf_ct_put(tmp);
1562                 }
1563
1564                 /* We could check get_nulls_value() here and restart if the ct
1565                  * was moved to another chain.  But given that gc is best-effort,
1566                  * we just continue with the next hash slot.
1567                  */
1568                 rcu_read_unlock();
1569                 cond_resched();
1570                 i++;
1571
1572                 delta_time = nfct_time_stamp - end_time;
1573                 if (delta_time > 0 && i < hashsz) {
1574                         gc_work->avg_timeout = next_run;
1575                         gc_work->next_bucket = i;
1576                         next_run = 0;
1577                         goto early_exit;
1578                 }
1579         } while (i < hashsz);
1580
1581         gc_work->next_bucket = 0;
1582
1583         next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX);
1584
1585         delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1);
1586         if (next_run > (unsigned long)delta_time)
1587                 next_run -= delta_time;
1588         else
1589                 next_run = 1;
1590
1591 early_exit:
1592         if (gc_work->exiting)
1593                 return;
1594
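        /* Clear early-drop mode whenever the next scan is deferred at all
         * (next_run != 0); __nf_conntrack_alloc() sets it again if the table
         * fills up and early_drop() fails.
         */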
1595         if (next_run)
1596                 gc_work->early_drop = false;
1597
1598         queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
1599 }
1600
1601 static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
1602 {
1603         INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
1604         gc_work->exiting = false;
1605 }
1606
1607 static struct nf_conn *
1608 __nf_conntrack_alloc(struct net *net,
1609                      const struct nf_conntrack_zone *zone,
1610                      const struct nf_conntrack_tuple *orig,
1611                      const struct nf_conntrack_tuple *repl,
1612                      gfp_t gfp, u32 hash)
1613 {
1614         struct nf_conntrack_net *cnet = nf_ct_pernet(net);
1615         unsigned int ct_count;
1616         struct nf_conn *ct;
1617
1618         /* We don't want any race condition at early drop stage */
1619         ct_count = atomic_inc_return(&cnet->count);
1620
1621         if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
1622                 if (!early_drop(net, hash)) {
1623                         if (!conntrack_gc_work.early_drop)
1624                                 conntrack_gc_work.early_drop = true;
1625                         atomic_dec(&cnet->count);
1626                         net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
1627                         return ERR_PTR(-ENOMEM);
1628                 }
1629         }
1630
1631         /*
1632          * Do not use kmem_cache_zalloc(), as this cache uses
1633          * SLAB_TYPESAFE_BY_RCU.
1634          */
1635         ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
1636         if (ct == NULL)
1637                 goto out;
1638
1639         spin_lock_init(&ct->lock);
1640         ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
1641         ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
1642         ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
1643         /* save hash for reusing when confirming */
1644         *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
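        /* The reply-direction node is not linked into any list yet, so its
         * pprev slot is free to carry the precomputed hash until the entry
         * is confirmed.
         */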
1645         ct->status = 0;
1646         WRITE_ONCE(ct->timeout, 0);
1647         write_pnet(&ct->ct_net, net);
1648         memset_after(ct, 0, __nfct_init_offset);
1649
1650         nf_ct_zone_add(ct, zone);
1651
1652         /* Because we use RCU lookups, we set ct_general.use to zero before
1653          * this is inserted in any list.
1654          */
1655         refcount_set(&ct->ct_general.use, 0);
1656         return ct;
1657 out:
1658         atomic_dec(&cnet->count);
1659         return ERR_PTR(-ENOMEM);
1660 }
1661
1662 struct nf_conn *nf_conntrack_alloc(struct net *net,
1663                                    const struct nf_conntrack_zone *zone,
1664                                    const struct nf_conntrack_tuple *orig,
1665                                    const struct nf_conntrack_tuple *repl,
1666                                    gfp_t gfp)
1667 {
1668         return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
1669 }
1670 EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
1671
1672 void nf_conntrack_free(struct nf_conn *ct)
1673 {
1674         struct net *net = nf_ct_net(ct);
1675         struct nf_conntrack_net *cnet;
1676
1677         /* A freed object has refcnt == 0; that's
1678          * the golden rule for SLAB_TYPESAFE_BY_RCU.
1679          */
1680         WARN_ON(refcount_read(&ct->ct_general.use) != 0);
1681
1682         if (ct->status & IPS_SRC_NAT_DONE) {
1683                 const struct nf_nat_hook *nat_hook;
1684
1685                 rcu_read_lock();
1686                 nat_hook = rcu_dereference(nf_nat_hook);
1687                 if (nat_hook)
1688                         nat_hook->remove_nat_bysrc(ct);
1689                 rcu_read_unlock();
1690         }
1691
1692         kfree(ct->ext);
1693         kmem_cache_free(nf_conntrack_cachep, ct);
1694         cnet = nf_ct_pernet(net);
1695
1696         smp_mb__before_atomic();
1697         atomic_dec(&cnet->count);
1698 }
1699 EXPORT_SYMBOL_GPL(nf_conntrack_free);
1700
1701
1702 /* Allocate a new conntrack: we return -ENOMEM if classification
1703    failed due to stress.  Otherwise it really is unclassifiable. */
1704 static noinline struct nf_conntrack_tuple_hash *
1705 init_conntrack(struct net *net, struct nf_conn *tmpl,
1706                const struct nf_conntrack_tuple *tuple,
1707                struct sk_buff *skb,
1708                unsigned int dataoff, u32 hash)
1709 {
1710         struct nf_conn *ct;
1711         struct nf_conn_help *help;
1712         struct nf_conntrack_tuple repl_tuple;
1713 #ifdef CONFIG_NF_CONNTRACK_EVENTS
1714         struct nf_conntrack_ecache *ecache;
1715 #endif
1716         struct nf_conntrack_expect *exp = NULL;
1717         const struct nf_conntrack_zone *zone;
1718         struct nf_conn_timeout *timeout_ext;
1719         struct nf_conntrack_zone tmp;
1720         struct nf_conntrack_net *cnet;
1721
1722         if (!nf_ct_invert_tuple(&repl_tuple, tuple)) {
1723                 pr_debug("Can't invert tuple.\n");
1724                 return NULL;
1725         }
1726
1727         zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1728         ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
1729                                   hash);
1730         if (IS_ERR(ct))
1731                 return (struct nf_conntrack_tuple_hash *)ct;
1732
1733         if (!nf_ct_add_synproxy(ct, tmpl)) {
1734                 nf_conntrack_free(ct);
1735                 return ERR_PTR(-ENOMEM);
1736         }
1737
1738         timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
1739
1740         if (timeout_ext)
1741                 nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
1742                                       GFP_ATOMIC);
1743
1744         nf_ct_acct_ext_add(ct, GFP_ATOMIC);
1745         nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
1746         nf_ct_labels_ext_add(ct);
1747
1748 #ifdef CONFIG_NF_CONNTRACK_EVENTS
1749         ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
1750
1751         if ((ecache || net->ct.sysctl_events) &&
1752             !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
1753                                   ecache ? ecache->expmask : 0,
1754                                   GFP_ATOMIC)) {
1755                 nf_conntrack_free(ct);
1756                 return ERR_PTR(-ENOMEM);
1757         }
1758 #endif
1759
1760         cnet = nf_ct_pernet(net);
1761         if (cnet->expect_count) {
1762                 spin_lock_bh(&nf_conntrack_expect_lock);
1763                 exp = nf_ct_find_expectation(net, zone, tuple);
1764                 if (exp) {
1765                         pr_debug("expectation arrives ct=%p exp=%p\n",
1766                                  ct, exp);
1767                         /* Welcome, Mr. Bond.  We've been expecting you... */
1768                         __set_bit(IPS_EXPECTED_BIT, &ct->status);
1769                         /* exp->master safe, refcnt bumped in nf_ct_find_expectation */
1770                         ct->master = exp->master;
1771                         if (exp->helper) {
1772                                 help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
1773                                 if (help)
1774                                         rcu_assign_pointer(help->helper, exp->helper);
1775                         }
1776
1777 #ifdef CONFIG_NF_CONNTRACK_MARK
1778                         ct->mark = exp->master->mark;
1779 #endif
1780 #ifdef CONFIG_NF_CONNTRACK_SECMARK
1781                         ct->secmark = exp->master->secmark;
1782 #endif
1783                         NF_CT_STAT_INC(net, expect_new);
1784                 }
1785                 spin_unlock_bh(&nf_conntrack_expect_lock);
1786         }
1787         if (!exp)
1788                 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
1789
1790         /* Another CPU might have obtained a pointer to this object before it was
1791          * released.  Because the refcount is 0, refcount_inc_not_zero() will fail.
1792          *
1793          * After refcount_set(1) it will succeed; ensure that the zeroing of
1794          * ct->status and the correct ct->net pointer are visible; else another
1795          * core might observe the CONFIRMED bit, which would mean the entry is
1796          * valid and in the hash table when it's not (anymore).
1797          */
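        /* The smp_wmb() below pairs with the smp_acquire__after_ctrl_dep()
         * that readers such as early_drop_list() and gc_worker() issue after
         * a successful refcount_inc_not_zero().
         */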
1798         smp_wmb();
1799
1800         /* Now it is going to be associated with an sk_buff; set the refcount to 1. */
1801         refcount_set(&ct->ct_general.use, 1);
1802
1803         if (exp) {
1804                 if (exp->expectfn)
1805                         exp->expectfn(ct, exp);
1806                 nf_ct_expect_put(exp);
1807         }
1808
1809         return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
1810 }
1811
1812 /* On success, returns 0, sets skb->_nfct | ctinfo */
1813 static int
1814 resolve_normal_ct(struct nf_conn *tmpl,
1815                   struct sk_buff *skb,
1816                   unsigned int dataoff,
1817                   u_int8_t protonum,
1818                   const struct nf_hook_state *state)
1819 {
1820         const struct nf_conntrack_zone *zone;
1821         struct nf_conntrack_tuple tuple;
1822         struct nf_conntrack_tuple_hash *h;
1823         enum ip_conntrack_info ctinfo;
1824         struct nf_conntrack_zone tmp;
1825         u32 hash, zone_id, rid;
1826         struct nf_conn *ct;
1827
1828         if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
1829                              dataoff, state->pf, protonum, state->net,
1830                              &tuple)) {
1831                 pr_debug("Can't get tuple\n");
1832                 return 0;
1833         }
1834
1835         /* look for tuple match */
1836         zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1837
1838         zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
1839         hash = hash_conntrack_raw(&tuple, zone_id, state->net);
1840         h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);
1841
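        /* Zones can use different ids per direction; if so, the entry may
         * have been hashed under the reply-direction id, so retry the lookup
         * with that id before concluding this is a new connection.
         */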
1842         if (!h) {
1843                 rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
1844                 if (zone_id != rid) {
1845                         u32 tmp = hash_conntrack_raw(&tuple, rid, state->net);
1846
1847                         h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp);
1848                 }
1849         }
1850
1851         if (!h) {
1852                 h = init_conntrack(state->net, tmpl, &tuple,
1853                                    skb, dataoff, hash);
1854                 if (!h)
1855                         return 0;
1856                 if (IS_ERR(h))
1857                         return PTR_ERR(h);
1858         }
1859         ct = nf_ct_tuplehash_to_ctrack(h);
1860
1861         /* It exists; we have (non-exclusive) reference. */
1862         if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1863                 ctinfo = IP_CT_ESTABLISHED_REPLY;
1864         } else {
1865                 /* Once we've had two way comms, always ESTABLISHED. */
1866                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1867                         pr_debug("normal packet for %p\n", ct);
1868                         ctinfo = IP_CT_ESTABLISHED;
1869                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1870                         pr_debug("related packet for %p\n", ct);
1871                         ctinfo = IP_CT_RELATED;
1872                 } else {
1873                         pr_debug("new packet for %p\n", ct);
1874                         ctinfo = IP_CT_NEW;
1875                 }
1876         }
1877         nf_ct_set(skb, ct, ctinfo);
1878         return 0;
1879 }
1880
1881 /*
1882  * ICMP packets need special treatment to handle error messages that are
1883  * related to a connection.
1884  *
1885  * Callers need to check if the skb has a conntrack assigned when this
1886  * helper returns; in that case the skb belongs to an already known connection.
1887  */
1888 static unsigned int __cold
1889 nf_conntrack_handle_icmp(struct nf_conn *tmpl,
1890                          struct sk_buff *skb,
1891                          unsigned int dataoff,
1892                          u8 protonum,
1893                          const struct nf_hook_state *state)
1894 {
1895         int ret;
1896
1897         if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP)
1898                 ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state);
1899 #if IS_ENABLED(CONFIG_IPV6)
1900         else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6)
1901                 ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state);
1902 #endif
1903         else
1904                 return NF_ACCEPT;
1905
1906         if (ret <= 0)
1907                 NF_CT_STAT_INC_ATOMIC(state->net, error);
1908
1909         return ret;
1910 }
1911
1912 static int generic_packet(struct nf_conn *ct, struct sk_buff *skb,
1913                           enum ip_conntrack_info ctinfo)
1914 {
1915         const unsigned int *timeout = nf_ct_timeout_lookup(ct);
1916
1917         if (!timeout)
1918                 timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout;
1919
1920         nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
1921         return NF_ACCEPT;
1922 }
1923
1924 /* Returns verdict for packet, or -1 for invalid. */
1925 static int nf_conntrack_handle_packet(struct nf_conn *ct,
1926                                       struct sk_buff *skb,
1927                                       unsigned int dataoff,
1928                                       enum ip_conntrack_info ctinfo,
1929                                       const struct nf_hook_state *state)
1930 {
1931         switch (nf_ct_protonum(ct)) {
1932         case IPPROTO_TCP:
1933                 return nf_conntrack_tcp_packet(ct, skb, dataoff,
1934                                                ctinfo, state);
1935         case IPPROTO_UDP:
1936                 return nf_conntrack_udp_packet(ct, skb, dataoff,
1937                                                ctinfo, state);
1938         case IPPROTO_ICMP:
1939                 return nf_conntrack_icmp_packet(ct, skb, ctinfo, state);
1940 #if IS_ENABLED(CONFIG_IPV6)
1941         case IPPROTO_ICMPV6:
1942                 return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state);
1943 #endif
1944 #ifdef CONFIG_NF_CT_PROTO_UDPLITE
1945         case IPPROTO_UDPLITE:
1946                 return nf_conntrack_udplite_packet(ct, skb, dataoff,
1947                                                    ctinfo, state);
1948 #endif
1949 #ifdef CONFIG_NF_CT_PROTO_SCTP
1950         case IPPROTO_SCTP:
1951                 return nf_conntrack_sctp_packet(ct, skb, dataoff,
1952                                                 ctinfo, state);
1953 #endif
1954 #ifdef CONFIG_NF_CT_PROTO_DCCP
1955         case IPPROTO_DCCP:
1956                 return nf_conntrack_dccp_packet(ct, skb, dataoff,
1957                                                 ctinfo, state);
1958 #endif
1959 #ifdef CONFIG_NF_CT_PROTO_GRE
1960         case IPPROTO_GRE:
1961                 return nf_conntrack_gre_packet(ct, skb, dataoff,
1962                                                ctinfo, state);
1963 #endif
1964         }
1965
1966         return generic_packet(ct, skb, ctinfo);
1967 }
1968
1969 unsigned int
1970 nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
1971 {
1972         enum ip_conntrack_info ctinfo;
1973         struct nf_conn *ct, *tmpl;
1974         u_int8_t protonum;
1975         int dataoff, ret;
1976
1977         tmpl = nf_ct_get(skb, &ctinfo);
1978         if (tmpl || ctinfo == IP_CT_UNTRACKED) {
1979                 /* Previously seen (loopback or untracked)?  Ignore. */
1980                 if ((tmpl && !nf_ct_is_template(tmpl)) ||
1981                      ctinfo == IP_CT_UNTRACKED)
1982                         return NF_ACCEPT;
1983                 skb->_nfct = 0;
1984         }
1985
1986         /* rcu_read_lock()ed by nf_hook_thresh */
1987         dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
1988         if (dataoff <= 0) {
1989                 pr_debug("not prepared to track yet or error occurred\n");
1990                 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
1991                 ret = NF_ACCEPT;
1992                 goto out;
1993         }
1994
1995         if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
1996                 ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
1997                                                protonum, state);
1998                 if (ret <= 0) {
1999                         ret = -ret;
2000                         goto out;
2001                 }
2002                 /* ICMP[v6] protocol trackers may assign one conntrack. */
2003                 if (skb->_nfct)
2004                         goto out;
2005         }
2006 repeat:
2007         ret = resolve_normal_ct(tmpl, skb, dataoff,
2008                                 protonum, state);
2009         if (ret < 0) {
2010                 /* Too stressed to deal. */
2011                 NF_CT_STAT_INC_ATOMIC(state->net, drop);
2012                 ret = NF_DROP;
2013                 goto out;
2014         }
2015
2016         ct = nf_ct_get(skb, &ctinfo);
2017         if (!ct) {
2018                 /* Not valid part of a connection */
2019                 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
2020                 ret = NF_ACCEPT;
2021                 goto out;
2022         }
2023
2024         ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state);
2025         if (ret <= 0) {
2026                 /* Invalid: inverse of the return code tells
2027                  * the netfilter core what to do */
2028                 pr_debug("nf_conntrack_in: Can't track with proto module\n");
2029                 nf_ct_put(ct);
2030                 skb->_nfct = 0;
2031                 /* Special case: TCP tracker reports an attempt to reopen a
2032                  * closed/aborted connection. We have to go back and create a
2033                  * fresh conntrack.
2034                  */
2035                 if (ret == -NF_REPEAT)
2036                         goto repeat;
2037
2038                 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
2039                 if (ret == -NF_DROP)
2040                         NF_CT_STAT_INC_ATOMIC(state->net, drop);
2041
2042                 ret = -ret;
2043                 goto out;
2044         }
2045
2046         if (ctinfo == IP_CT_ESTABLISHED_REPLY &&
2047             !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
2048                 nf_conntrack_event_cache(IPCT_REPLY, ct);
2049 out:
2050         if (tmpl)
2051                 nf_ct_put(tmpl);
2052
2053         return ret;
2054 }
2055 EXPORT_SYMBOL_GPL(nf_conntrack_in);
2056
2057 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
2058    implicitly racy: see __nf_conntrack_confirm */
2059 void nf_conntrack_alter_reply(struct nf_conn *ct,
2060                               const struct nf_conntrack_tuple *newreply)
2061 {
2062         struct nf_conn_help *help = nfct_help(ct);
2063
2064         /* Should be unconfirmed, so not in hash table yet */
2065         WARN_ON(nf_ct_is_confirmed(ct));
2066
2067         pr_debug("Altering reply tuple of %p to ", ct);
2068         nf_ct_dump_tuple(newreply);
2069
2070         ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
2071         if (ct->master || (help && !hlist_empty(&help->expectations)))
2072                 return;
2073
2074         rcu_read_lock();
2075         __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
2076         rcu_read_unlock();
2077 }
2078 EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
2079
2080 /* Refresh conntrack for this many jiffies and do accounting if do_acct is true */
2081 void __nf_ct_refresh_acct(struct nf_conn *ct,
2082                           enum ip_conntrack_info ctinfo,
2083                           const struct sk_buff *skb,
2084                           u32 extra_jiffies,
2085                           bool do_acct)
2086 {
2087         /* Only update if this is not a fixed timeout */
2088         if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
2089                 goto acct;
2090
2091         /* If not in hash table, timer will not be active yet */
2092         if (nf_ct_is_confirmed(ct))
2093                 extra_jiffies += nfct_time_stamp;
2094
2095         if (READ_ONCE(ct->timeout) != extra_jiffies)
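        /* Only write the timeout when it actually changes; this helps avoid
         * dirtying the conntrack's cacheline on every packet of a long-lived
         * flow.
         */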
2096                 WRITE_ONCE(ct->timeout, extra_jiffies);
2097 acct:
2098         if (do_acct)
2099                 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
2100 }
2101 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
2102
2103 bool nf_ct_kill_acct(struct nf_conn *ct,
2104                      enum ip_conntrack_info ctinfo,
2105                      const struct sk_buff *skb)
2106 {
2107         nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
2108
2109         return nf_ct_delete(ct, 0, 0);
2110 }
2111 EXPORT_SYMBOL_GPL(nf_ct_kill_acct);
2112
2113 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
2114
2115 #include <linux/netfilter/nfnetlink.h>
2116 #include <linux/netfilter/nfnetlink_conntrack.h>
2117 #include <linux/mutex.h>
2118
2119 /* Generic function for tcp/udp/sctp/dccp and alike. */
2120 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
2121                                const struct nf_conntrack_tuple *tuple)
2122 {
2123         if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
2124             nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
2125                 goto nla_put_failure;
2126         return 0;
2127
2128 nla_put_failure:
2129         return -1;
2130 }
2131 EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
2132
2133 const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
2134         [CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
2135         [CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
2136 };
2137 EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
2138
2139 int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
2140                                struct nf_conntrack_tuple *t,
2141                                u_int32_t flags)
2142 {
2143         if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) {
2144                 if (!tb[CTA_PROTO_SRC_PORT])
2145                         return -EINVAL;
2146
2147                 t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
2148         }
2149
2150         if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) {
2151                 if (!tb[CTA_PROTO_DST_PORT])
2152                         return -EINVAL;
2153
2154                 t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
2155         }
2156
2157         return 0;
2158 }
2159 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
2160
2161 unsigned int nf_ct_port_nlattr_tuple_size(void)
2162 {
2163         static unsigned int size __read_mostly;
2164
2165         if (!size)
2166                 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
2167
2168         return size;
2169 }
2170 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
2171 #endif
2172
2173 /* Used by ipt_REJECT and ip6t_REJECT. */
2174 static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
2175 {
2176         struct nf_conn *ct;
2177         enum ip_conntrack_info ctinfo;
2178
2179         /* This ICMP is in reverse direction to the packet which caused it */
2180         ct = nf_ct_get(skb, &ctinfo);
2181         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
2182                 ctinfo = IP_CT_RELATED_REPLY;
2183         else
2184                 ctinfo = IP_CT_RELATED;
2185
2186         /* Attach to new skbuff, and increment count */
2187         nf_ct_set(nskb, ct, ctinfo);
2188         nf_conntrack_get(skb_nfct(nskb));
2189 }
2190
2191 static int __nf_conntrack_update(struct net *net, struct sk_buff *skb,
2192                                  struct nf_conn *ct,
2193                                  enum ip_conntrack_info ctinfo)
2194 {
2195         const struct nf_nat_hook *nat_hook;
2196         struct nf_conntrack_tuple_hash *h;
2197         struct nf_conntrack_tuple tuple;
2198         unsigned int status;
2199         int dataoff;
2200         u16 l3num;
2201         u8 l4num;
2202
2203         l3num = nf_ct_l3num(ct);
2204
2205         dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
2206         if (dataoff <= 0)
2207                 return -1;
2208
2209         if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
2210                              l4num, net, &tuple))
2211                 return -1;
2212
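        /* Undo any NAT mangling on the extracted tuple: restore the original
         * direction's source and/or destination so the lookup below finds the
         * conntrack this packet really belongs to.
         */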
2213         if (ct->status & IPS_SRC_NAT) {
2214                 memcpy(tuple.src.u3.all,
2215                        ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
2216                        sizeof(tuple.src.u3.all));
2217                 tuple.src.u.all =
2218                         ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
2219         }
2220
2221         if (ct->status & IPS_DST_NAT) {
2222                 memcpy(tuple.dst.u3.all,
2223                        ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
2224                        sizeof(tuple.dst.u3.all));
2225                 tuple.dst.u.all =
2226                         ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
2227         }
2228
2229         h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
2230         if (!h)
2231                 return 0;
2232
2233         /* Store the status bits of the clashing conntrack so we can re-do the
2234          * NAT mangling according to what has already been done to this packet.
2235          */
2236         status = ct->status;
2237
2238         nf_ct_put(ct);
2239         ct = nf_ct_tuplehash_to_ctrack(h);
2240         nf_ct_set(skb, ct, ctinfo);
2241
2242         nat_hook = rcu_dereference(nf_nat_hook);
2243         if (!nat_hook)
2244                 return 0;
2245
2246         if (status & IPS_SRC_NAT &&
2247             nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC,
2248                                 IP_CT_DIR_ORIGINAL) == NF_DROP)
2249                 return -1;
2250
2251         if (status & IPS_DST_NAT &&
2252             nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST,
2253                                 IP_CT_DIR_ORIGINAL) == NF_DROP)
2254                 return -1;
2255
2256         return 0;
2257 }
2258
2259 /* This packet is coming from userspace via nf_queue; complete the packet
2260  * processing after the helper invocation in nf_confirm().
2261  */
2262 static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct,
2263                                enum ip_conntrack_info ctinfo)
2264 {
2265         const struct nf_conntrack_helper *helper;
2266         const struct nf_conn_help *help;
2267         int protoff;
2268
2269         help = nfct_help(ct);
2270         if (!help)
2271                 return 0;
2272
2273         helper = rcu_dereference(help->helper);
2274         if (!(helper->flags & NF_CT_HELPER_F_USERSPACE))
2275                 return 0;
2276
2277         switch (nf_ct_l3num(ct)) {
2278         case NFPROTO_IPV4:
2279                 protoff = skb_network_offset(skb) + ip_hdrlen(skb);
2280                 break;
2281 #if IS_ENABLED(CONFIG_IPV6)
2282         case NFPROTO_IPV6: {
2283                 __be16 frag_off;
2284                 u8 pnum;
2285
2286                 pnum = ipv6_hdr(skb)->nexthdr;
2287                 protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
2288                                            &frag_off);
2289                 if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
2290                         return 0;
2291                 break;
2292         }
2293 #endif
2294         default:
2295                 return 0;
2296         }
2297
2298         if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
2299             !nf_is_loopback_packet(skb)) {
2300                 if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
2301                         NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
2302                         return -1;
2303                 }
2304         }
2305
2306         /* We've seen it coming out the other side: confirm it */
2307         return nf_conntrack_confirm(skb) == NF_DROP ? -1 : 0;
2308 }
2309
2310 static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
2311 {
2312         enum ip_conntrack_info ctinfo;
2313         struct nf_conn *ct;
2314         int err;
2315
2316         ct = nf_ct_get(skb, &ctinfo);
2317         if (!ct)
2318                 return 0;
2319
2320         if (!nf_ct_is_confirmed(ct)) {
2321                 err = __nf_conntrack_update(net, skb, ct, ctinfo);
2322                 if (err < 0)
2323                         return err;
2324
2325                 ct = nf_ct_get(skb, &ctinfo);
2326         }
2327
2328         return nf_confirm_cthelper(skb, ct, ctinfo);
2329 }
2330
2331 static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
2332                                        const struct sk_buff *skb)
2333 {
2334         const struct nf_conntrack_tuple *src_tuple;
2335         const struct nf_conntrack_tuple_hash *hash;
2336         struct nf_conntrack_tuple srctuple;
2337         enum ip_conntrack_info ctinfo;
2338         struct nf_conn *ct;
2339
2340         ct = nf_ct_get(skb, &ctinfo);
2341         if (ct) {
2342                 src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
2343                 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
2344                 return true;
2345         }
2346
2347         if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
2348                                NFPROTO_IPV4, dev_net(skb->dev),
2349                                &srctuple))
2350                 return false;
2351
2352         hash = nf_conntrack_find_get(dev_net(skb->dev),
2353                                      &nf_ct_zone_dflt,
2354                                      &srctuple);
2355         if (!hash)
2356                 return false;
2357
2358         ct = nf_ct_tuplehash_to_ctrack(hash);
2359         src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
2360         memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
2361         nf_ct_put(ct);
2362
2363         return true;
2364 }
2365
2366 /* Bring out ya dead! */
2367 static struct nf_conn *
2368 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
2369                 const struct nf_ct_iter_data *iter_data, unsigned int *bucket)
2370 {
2371         struct nf_conntrack_tuple_hash *h;
2372         struct nf_conn *ct;
2373         struct hlist_nulls_node *n;
2374         spinlock_t *lockp;
2375
2376         for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
2377                 struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];
2378
2379                 if (hlist_nulls_empty(hslot))
2380                         continue;
2381
2382                 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
2383                 local_bh_disable();
2384                 nf_conntrack_lock(lockp);
2385                 hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
2386                         if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
2387                                 continue;
2388                         /* All nf_conn objects are added to the hash table twice: once
2389                          * for the original direction tuple, once for the reply tuple.
2390                          *
2391                          * Exception: In the IPS_NAT_CLASH case, only the reply
2392                          * tuple is added (the original tuple already existed for
2393                          * a different object).
2394                          *
2395                          * We only need to call the iterator once for each
2396                          * conntrack, so we just use the 'reply' direction
2397                          * tuple while iterating.
2398                          */
2399                         ct = nf_ct_tuplehash_to_ctrack(h);
2400
2401                         if (iter_data->net &&
2402                             !net_eq(iter_data->net, nf_ct_net(ct)))
2403                                 continue;
2404
2405                         if (iter(ct, iter_data->data))
2406                                 goto found;
2407                 }
2408                 spin_unlock(lockp);
2409                 local_bh_enable();
2410                 cond_resched();
2411         }
2412
2413         return NULL;
2414 found:
2415         refcount_inc(&ct->ct_general.use);
2416         spin_unlock(lockp);
2417         local_bh_enable();
2418         return ct;
2419 }
2420
2421 static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
2422                                   const struct nf_ct_iter_data *iter_data)
2423 {
2424         unsigned int bucket = 0;
2425         struct nf_conn *ct;
2426
2427         might_sleep();
2428
2429         mutex_lock(&nf_conntrack_mutex);
2430         while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) {
2431                 /* Time to push up daisies... */
2432
2433                 nf_ct_delete(ct, iter_data->portid, iter_data->report);
2434                 nf_ct_put(ct);
2435                 cond_resched();
2436         }
2437         mutex_unlock(&nf_conntrack_mutex);
2438 }
2439
2440 void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
2441                                const struct nf_ct_iter_data *iter_data)
2442 {
2443         struct net *net = iter_data->net;
2444         struct nf_conntrack_net *cnet = nf_ct_pernet(net);
2445
2446         might_sleep();
2447
2448         if (atomic_read(&cnet->count) == 0)
2449                 return;
2450
2451         nf_ct_iterate_cleanup(iter, iter_data);
2452 }
2453 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
2454
2455 /**
2456  * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
2457  * @iter: callback to invoke for each conntrack
2458  * @data: data to pass to @iter
2459  *
2460  * Like nf_ct_iterate_cleanup, but first marks conntracks on the
2461  * unconfirmed list as dying (so they will not be inserted into
2462  * the main table).
2463  *
2464  * Can only be called from the module exit path.
2465  */
2466 void
2467 nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
2468 {
2469         struct nf_ct_iter_data iter_data = {};
2470         struct net *net;
2471
2472         down_read(&net_rwsem);
2473         for_each_net(net) {
2474                 struct nf_conntrack_net *cnet = nf_ct_pernet(net);
2475
2476                 if (atomic_read(&cnet->count) == 0)
2477                         continue;
2478                 nf_queue_nf_hook_drop(net);
2479         }
2480         up_read(&net_rwsem);
2481
2482         /* Need to wait for the netns cleanup worker to finish, if it's
2483          * running -- it might have deleted a net namespace from
2484          * the global list, so hook drop above might not have
2485          * affected all namespaces.
2486          */
2487         net_ns_barrier();
2488
2489         /* An skb with an unconfirmed conntrack could have been reinjected just
2490          * before we called nf_queue_nf_hook_drop().
2491          *
2492          * This makes sure it is inserted into the conntrack table.
2493          */
2494         synchronize_net();
2495
2496         nf_ct_ext_bump_genid();
2497         iter_data.data = data;
2498         nf_ct_iterate_cleanup(iter, &iter_data);
2499
2500         /* Another CPU might be in an RCU read-side section with an
2501          * RCU-protected pointer cleared in the iter callback
2502          * or hidden via nf_ct_ext_bump_genid() above.
2503          *
2504          * Wait until those are done.
2505          */
2506         synchronize_rcu();
2507 }
2508 EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);
2509
2510 static int kill_all(struct nf_conn *i, void *data)
2511 {
2512         return 1;
2513 }
2514
2515 void nf_conntrack_cleanup_start(void)
2516 {
2517         conntrack_gc_work.exiting = true;
2518 }
2519
2520 void nf_conntrack_cleanup_end(void)
2521 {
2522         RCU_INIT_POINTER(nf_ct_hook, NULL);
2523         cancel_delayed_work_sync(&conntrack_gc_work.dwork);
2524         kvfree(nf_conntrack_hash);
2525
2526         nf_conntrack_proto_fini();
2527         nf_conntrack_helper_fini();
2528         nf_conntrack_expect_fini();
2529
2530         kmem_cache_destroy(nf_conntrack_cachep);
2531 }
2532
2533 /*
2534  * Mishearing the voices in his head, our hero wonders how he's
2535  * supposed to kill the mall.
2536  */
2537 void nf_conntrack_cleanup_net(struct net *net)
2538 {
2539         LIST_HEAD(single);
2540
2541         list_add(&net->exit_list, &single);
2542         nf_conntrack_cleanup_net_list(&single);
2543 }
2544
2545 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
2546 {
2547         struct nf_ct_iter_data iter_data = {};
2548         struct net *net;
2549         int busy;
2550
2551         /*
2552          * This makes sure all current packets have passed through
2553          * the netfilter framework.  Roll on, two-stage module
2554          * delete...
2555          */
2556         synchronize_net();
2557 i_see_dead_people:
2558         busy = 0;
2559         list_for_each_entry(net, net_exit_list, exit_list) {
2560                 struct nf_conntrack_net *cnet = nf_ct_pernet(net);
2561
2562                 iter_data.net = net;
2563                 nf_ct_iterate_cleanup_net(kill_all, &iter_data);
2564                 if (atomic_read(&cnet->count) != 0)
2565                         busy = 1;
2566         }
2567         if (busy) {
2568                 schedule();
2569                 goto i_see_dead_people;
2570         }
2571
2572         list_for_each_entry(net, net_exit_list, exit_list) {
2573                 nf_conntrack_ecache_pernet_fini(net);
2574                 nf_conntrack_expect_pernet_fini(net);
2575                 free_percpu(net->ct.stat);
2576         }
2577 }
2578
2579 void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
2580 {
2581         struct hlist_nulls_head *hash;
2582         unsigned int nr_slots, i;
2583
2584         if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
2585                 return NULL;
2586
2587         BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
2588         nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
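        /* Round the bucket count up to a whole number of pages worth of list
         * heads: e.g. with 4 KiB pages and 8-byte hlist_nulls_heads, sizes
         * are rounded up to a multiple of 512 buckets.
         */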
2589
2590         hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL);
2591
2592         if (hash && nulls)
2593                 for (i = 0; i < nr_slots; i++)
2594                         INIT_HLIST_NULLS_HEAD(&hash[i], i);
2595
2596         return hash;
2597 }
2598 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
2599
2600 int nf_conntrack_hash_resize(unsigned int hashsize)
2601 {
2602         int i, bucket;
2603         unsigned int old_size;
2604         struct hlist_nulls_head *hash, *old_hash;
2605         struct nf_conntrack_tuple_hash *h;
2606         struct nf_conn *ct;
2607
2608         if (!hashsize)
2609                 return -EINVAL;
2610
2611         hash = nf_ct_alloc_hashtable(&hashsize, 1);
2612         if (!hash)
2613                 return -ENOMEM;
2614
2615         mutex_lock(&nf_conntrack_mutex);
2616         old_size = nf_conntrack_htable_size;
2617         if (old_size == hashsize) {
2618                 mutex_unlock(&nf_conntrack_mutex);
2619                 kvfree(hash);
2620                 return 0;
2621         }
2622
2623         local_bh_disable();
2624         nf_conntrack_all_lock();
2625         write_seqcount_begin(&nf_conntrack_generation);
2626
2627         /* Lookups in the old hash might happen in parallel, which means we
2628          * might get false negatives during connection lookup. New connections
2629          * created because of a false negative won't make it into the hash
2630          * though, since that requires taking the locks.
2631          */
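        /* Lookups sample nf_conntrack_generation (see nf_conntrack_get_ht()
         * and __nf_conntrack_find_get()), so a reader that races with this
         * resize can detect it and retry against the new table.
         */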
2632
2633         for (i = 0; i < nf_conntrack_htable_size; i++) {
2634                 while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
2635                         unsigned int zone_id;
2636
2637                         h = hlist_nulls_entry(nf_conntrack_hash[i].first,
2638                                               struct nf_conntrack_tuple_hash, hnnode);
2639                         ct = nf_ct_tuplehash_to_ctrack(h);
2640                         hlist_nulls_del_rcu(&h->hnnode);
2641
2642                         zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h));
2643                         bucket = __hash_conntrack(nf_ct_net(ct),
2644                                                   &h->tuple, zone_id, hashsize);
2645                         hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
2646                 }
2647         }
2648         old_hash = nf_conntrack_hash;
2649
2650         nf_conntrack_hash = hash;
2651         nf_conntrack_htable_size = hashsize;
2652
2653         write_seqcount_end(&nf_conntrack_generation);
2654         nf_conntrack_all_unlock();
2655         local_bh_enable();
2656
2657         mutex_unlock(&nf_conntrack_mutex);
2658
2659         synchronize_net();
2660         kvfree(old_hash);
2661         return 0;
2662 }
2663
2664 int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
2665 {
2666         unsigned int hashsize;
2667         int rc;
2668
2669         if (current->nsproxy->net_ns != &init_net)
2670                 return -EOPNOTSUPP;
2671
2672         /* On boot, we can set this without any fancy locking. */
2673         if (!nf_conntrack_hash)
2674                 return param_set_uint(val, kp);
2675
2676         rc = kstrtouint(val, 0, &hashsize);
2677         if (rc)
2678                 return rc;
2679
2680         return nf_conntrack_hash_resize(hashsize);
2681 }
2682
2683 int nf_conntrack_init_start(void)
2684 {
2685         unsigned long nr_pages = totalram_pages();
2686         int max_factor = 8;
2687         int ret = -ENOMEM;
2688         int i;
2689
2690         seqcount_spinlock_init(&nf_conntrack_generation,
2691                                &nf_conntrack_locks_all_lock);
2692
2693         for (i = 0; i < CONNTRACK_LOCKS; i++)
2694                 spin_lock_init(&nf_conntrack_locks[i]);
2695
2696         if (!nf_conntrack_htable_size) {
2697                 nf_conntrack_htable_size
2698                         = (((nr_pages << PAGE_SHIFT) / 16384)
2699                            / sizeof(struct hlist_head));
2700                 if (BITS_PER_LONG >= 64 &&
2701                     nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
2702                         nf_conntrack_htable_size = 262144;
2703                 else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
2704                         nf_conntrack_htable_size = 65536;
2705
2706                 if (nf_conntrack_htable_size < 1024)
2707                         nf_conntrack_htable_size = 1024;
2708                 /* Use a max. factor of one by default to keep the average
2709                  * hash chain length at 2 entries.  Each entry has to be added
2710                  * twice (once for the original direction, once for the reply).
2711                  * When a table size is given, we use the old value of 8 to
2712                  * avoid implicit reduction of the max entries setting.
2713                  */
2714                 max_factor = 1;
2715         }
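        /* Example with the auto-sizing above: a 64-bit machine with more than
         * 4 GiB of RAM gets 262144 buckets, and with max_factor == 1 that also
         * becomes the default nf_conntrack_max of 262144 entries.
         */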
2716
2717         nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
2718         if (!nf_conntrack_hash)
2719                 return -ENOMEM;
2720
2721         nf_conntrack_max = max_factor * nf_conntrack_htable_size;
2722
2723         nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
2724                                                 sizeof(struct nf_conn),
2725                                                 NFCT_INFOMASK + 1,
2726                                                 SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
2727         if (!nf_conntrack_cachep)
2728                 goto err_cachep;
2729
2730         ret = nf_conntrack_expect_init();
2731         if (ret < 0)
2732                 goto err_expect;
2733
2734         ret = nf_conntrack_helper_init();
2735         if (ret < 0)
2736                 goto err_helper;
2737
2738         ret = nf_conntrack_proto_init();
2739         if (ret < 0)
2740                 goto err_proto;
2741
2742         conntrack_gc_work_init(&conntrack_gc_work);
2743         queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);
2744
2745         ret = register_nf_conntrack_bpf();
2746         if (ret < 0)
2747                 goto err_kfunc;
2748
2749         return 0;
2750
2751 err_kfunc:
2752         cancel_delayed_work_sync(&conntrack_gc_work.dwork);
2753         nf_conntrack_proto_fini();
2754 err_proto:
2755         nf_conntrack_helper_fini();
2756 err_helper:
2757         nf_conntrack_expect_fini();
2758 err_expect:
2759         kmem_cache_destroy(nf_conntrack_cachep);
2760 err_cachep:
2761         kvfree(nf_conntrack_hash);
2762         return ret;
2763 }
2764
2765 static const struct nf_ct_hook nf_conntrack_hook = {
2766         .update         = nf_conntrack_update,
2767         .destroy        = nf_ct_destroy,
2768         .get_tuple_skb  = nf_conntrack_get_tuple_skb,
2769         .attach         = nf_conntrack_attach,
2770 };
2771
2772 void nf_conntrack_init_end(void)
2773 {
2774         RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
2775 }
2776
2777 /*
2778  * We need to use special "null" values, not used in hash table
2779  */
2780 #define UNCONFIRMED_NULLS_VAL   ((1<<30)+0)
2781
2782 int nf_conntrack_init_net(struct net *net)
2783 {
2784         struct nf_conntrack_net *cnet = nf_ct_pernet(net);
2785         int ret = -ENOMEM;
2786
2787         BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
2788         BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
2789         atomic_set(&cnet->count, 0);
2790
2791         net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
2792         if (!net->ct.stat)
2793                 return ret;
2794
2795         ret = nf_conntrack_expect_pernet_init(net);
2796         if (ret < 0)
2797                 goto err_expect;
2798
2799         nf_conntrack_acct_pernet_init(net);
2800         nf_conntrack_tstamp_pernet_init(net);
2801         nf_conntrack_ecache_pernet_init(net);
2802         nf_conntrack_helper_pernet_init(net);
2803         nf_conntrack_proto_pernet_init(net);
2804
2805         return 0;
2806
2807 err_expect:
2808         free_percpu(net->ct.stat);
2809         return ret;
2810 }