netfilter: nf_tables: add destroy_clone expression
[linux-2.6-microblaze.git] / net / netfilter / nf_tables_api.c
index a5f3743..0a6eafa 100644 (file)
@@ -28,6 +28,28 @@ static LIST_HEAD(nf_tables_objects);
 static LIST_HEAD(nf_tables_flowtables);
 static u64 table_handle;
 
+enum {
+       NFT_VALIDATE_SKIP       = 0,
+       NFT_VALIDATE_NEED,
+       NFT_VALIDATE_DO,
+};
+
+static void nft_validate_state_update(struct net *net, u8 new_validate_state)
+{
+       switch (net->nft.validate_state) {
+       case NFT_VALIDATE_SKIP:
+               WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO);
+               break;
+       case NFT_VALIDATE_NEED:
+               break;
+       case NFT_VALIDATE_DO:
+               if (new_validate_state == NFT_VALIDATE_NEED)
+                       return;
+       }
+
+       net->nft.validate_state = new_validate_state;
+}
+
 static void nft_ctx_init(struct nft_ctx *ctx,
                         struct net *net,
                         const struct sk_buff *skb,
@@ -74,88 +96,43 @@ static void nft_trans_destroy(struct nft_trans *trans)
        kfree(trans);
 }
 
-/* removal requests are queued in the commit_list, but not acted upon
- * until after all new rules are in place.
- *
- * Therefore, nf_register_net_hook(net, &nat_hook) runs before pending
- * nf_unregister_net_hook().
- *
- * nf_register_net_hook thus fails if a nat hook is already in place
- * even if the conflicting hook is about to be removed.
- *
- * If collision is detected, search commit_log for DELCHAIN matching
- * the new nat hooknum; if we find one collision is temporary:
- *
- * Either transaction is aborted (new/colliding hook is removed), or
- * transaction is committed (old hook is removed).
- */
-static bool nf_tables_allow_nat_conflict(const struct net *net,
-                                        const struct nf_hook_ops *ops)
-{
-       const struct nft_trans *trans;
-       bool ret = false;
-
-       if (!ops->nat_hook)
-               return false;
-
-       list_for_each_entry(trans, &net->nft.commit_list, list) {
-               const struct nf_hook_ops *pending_ops;
-               const struct nft_chain *pending;
-
-               if (trans->msg_type != NFT_MSG_NEWCHAIN &&
-                   trans->msg_type != NFT_MSG_DELCHAIN)
-                       continue;
-
-               pending = trans->ctx.chain;
-               if (!nft_is_base_chain(pending))
-                       continue;
-
-               pending_ops = &nft_base_chain(pending)->ops;
-               if (pending_ops->nat_hook &&
-                   pending_ops->pf == ops->pf &&
-                   pending_ops->hooknum == ops->hooknum) {
-                       /* other hook registration already pending? */
-                       if (trans->msg_type == NFT_MSG_NEWCHAIN)
-                               return false;
-
-                       ret = true;
-               }
-       }
-
-       return ret;
-}
-
 static int nf_tables_register_hook(struct net *net,
                                   const struct nft_table *table,
                                   struct nft_chain *chain)
 {
-       struct nf_hook_ops *ops;
-       int ret;
+       const struct nft_base_chain *basechain;
+       const struct nf_hook_ops *ops;
 
        if (table->flags & NFT_TABLE_F_DORMANT ||
            !nft_is_base_chain(chain))
                return 0;
 
-       ops = &nft_base_chain(chain)->ops;
-       ret = nf_register_net_hook(net, ops);
-       if (ret == -EBUSY && nf_tables_allow_nat_conflict(net, ops)) {
-               ops->nat_hook = false;
-               ret = nf_register_net_hook(net, ops);
-               ops->nat_hook = true;
-       }
+       basechain = nft_base_chain(chain);
+       ops = &basechain->ops;
 
-       return ret;
+       if (basechain->type->ops_register)
+               return basechain->type->ops_register(net, ops);
+
+       return nf_register_net_hook(net, ops);
 }
 
 static void nf_tables_unregister_hook(struct net *net,
                                      const struct nft_table *table,
                                      struct nft_chain *chain)
 {
+       const struct nft_base_chain *basechain;
+       const struct nf_hook_ops *ops;
+
        if (table->flags & NFT_TABLE_F_DORMANT ||
            !nft_is_base_chain(chain))
                return;
+       basechain = nft_base_chain(chain);
+       ops = &basechain->ops;
 
-       nf_unregister_net_hook(net, &nft_base_chain(chain)->ops);
+       if (basechain->type->ops_unregister)
+               return basechain->type->ops_unregister(net, ops);
+
+       nf_unregister_net_hook(net, ops);
 }
 
 static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
@@ -418,7 +395,7 @@ static struct nft_table *nft_table_lookup(const struct net *net,
        if (nla == NULL)
                return ERR_PTR(-EINVAL);
 
-       list_for_each_entry(table, &net->nft.tables, list) {
+       list_for_each_entry_rcu(table, &net->nft.tables, list) {
                if (!nla_strcmp(nla, table->name) &&
                    table->family == family &&
                    nft_active_genmask(table, genmask))
@@ -591,6 +568,24 @@ done:
        return skb->len;
 }
 
+static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb,
+                                     const struct nlmsghdr *nlh,
+                                     struct netlink_dump_control *c)
+{
+       int err;
+
+       if (!try_module_get(THIS_MODULE))
+               return -EINVAL;
+
+       rcu_read_unlock();
+       err = netlink_dump_start(nlsk, skb, nlh, c);
+       rcu_read_lock();
+       module_put(THIS_MODULE);
+
+       return err;
+}
+
+/* called with rcu_read_lock held */
 static int nf_tables_gettable(struct net *net, struct sock *nlsk,
                              struct sk_buff *skb, const struct nlmsghdr *nlh,
                              const struct nlattr * const nla[],
@@ -606,8 +601,10 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
        if (nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .dump = nf_tables_dump_tables,
+                       .module = THIS_MODULE,
                };
-               return netlink_dump_start(nlsk, skb, nlh, &c);
+
+               return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
        }
 
        table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask);
@@ -616,7 +613,7 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
                return PTR_ERR(table);
        }
 
-       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!skb2)
                return -ENOMEM;
 
@@ -978,7 +975,7 @@ static struct nft_chain *nft_chain_lookup(const struct nft_table *table,
        if (nla == NULL)
                return ERR_PTR(-EINVAL);
 
-       list_for_each_entry(chain, &table->chains, list) {
+       list_for_each_entry_rcu(chain, &table->chains, list) {
                if (!nla_strcmp(nla, chain->name) &&
                    nft_active_genmask(chain, genmask))
                        return chain;
@@ -1180,6 +1177,7 @@ done:
        return skb->len;
 }
 
+/* called with rcu_read_lock held */
 static int nf_tables_getchain(struct net *net, struct sock *nlsk,
                              struct sk_buff *skb, const struct nlmsghdr *nlh,
                              const struct nlattr * const nla[],
@@ -1196,8 +1194,10 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
        if (nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .dump = nf_tables_dump_chains,
+                       .module = THIS_MODULE,
                };
-               return netlink_dump_start(nlsk, skb, nlh, &c);
+
+               return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
        }
 
        table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
@@ -1212,7 +1212,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
                return PTR_ERR(chain);
        }
 
-       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!skb2)
                return -ENOMEM;
 
@@ -1282,17 +1282,32 @@ static void nft_chain_stats_replace(struct nft_base_chain *chain,
                rcu_assign_pointer(chain->stats, newstats);
 }
 
+static void nf_tables_chain_free_chain_rules(struct nft_chain *chain)
+{
+       struct nft_rule **g0 = rcu_dereference_raw(chain->rules_gen_0);
+       struct nft_rule **g1 = rcu_dereference_raw(chain->rules_gen_1);
+
+       if (g0 != g1)
+               kvfree(g1);
+       kvfree(g0);
+
+       /* should be NULL either via abort or via successful commit */
+       WARN_ON_ONCE(chain->rules_next);
+       kvfree(chain->rules_next);
+}
+
 static void nf_tables_chain_destroy(struct nft_ctx *ctx)
 {
        struct nft_chain *chain = ctx->chain;
 
        BUG_ON(chain->use > 0);
 
+       /* no concurrent access possible anymore */
+       nf_tables_chain_free_chain_rules(chain);
+
        if (nft_is_base_chain(chain)) {
                struct nft_base_chain *basechain = nft_base_chain(chain);
 
-               if (basechain->type->free)
-                       basechain->type->free(ctx);
                module_put(basechain->type->owner);
                free_percpu(basechain->stats);
                if (basechain->stats)
@@ -1382,6 +1397,27 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook)
        module_put(hook->type->owner);
 }
 
+struct nft_rules_old {
+       struct rcu_head h;
+       struct nft_rule **start;
+};
+
+static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *chain,
+                                                    unsigned int alloc)
+{
+       if (alloc > INT_MAX)
+               return NULL;
+
+       alloc += 1;     /* NULL, ends rules */
+       if (sizeof(struct nft_rule *) > INT_MAX / alloc)
+               return NULL;
+
+       alloc *= sizeof(struct nft_rule *);
+       alloc += sizeof(struct nft_rules_old);
+
+       return kvmalloc(alloc, GFP_KERNEL);
+}
+
 static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
                              u8 policy, bool create)
 {
@@ -1391,6 +1427,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
        struct nft_stats __percpu *stats;
        struct net *net = ctx->net;
        struct nft_chain *chain;
+       struct nft_rule **rules;
        int err;
 
        if (table->use == UINT_MAX)
@@ -1425,9 +1462,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
                }
 
                basechain->type = hook.type;
-               if (basechain->type->init)
-                       basechain->type->init(ctx);
-
                chain = &basechain->chain;
 
                ops             = &basechain->ops;
@@ -1438,9 +1472,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
                ops->hook       = hook.type->hooks[ops->hooknum];
                ops->dev        = hook.dev;
 
-               if (basechain->type->type == NFT_CHAIN_T_NAT)
-                       ops->nat_hook = true;
-
                chain->flags |= NFT_BASE_CHAIN;
                basechain->policy = policy;
        } else {
@@ -1459,6 +1490,16 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
                goto err1;
        }
 
+       rules = nf_tables_chain_alloc_rules(chain, 0);
+       if (!rules) {
+               err = -ENOMEM;
+               goto err1;
+       }
+
+       *rules = NULL;
+       rcu_assign_pointer(chain->rules_gen_0, rules);
+       rcu_assign_pointer(chain->rules_gen_1, rules);
+
        err = nf_tables_register_hook(net, table, chain);
        if (err < 0)
                goto err1;
@@ -1902,19 +1943,7 @@ static int nf_tables_newexpr(const struct nft_ctx *ctx,
                        goto err1;
        }
 
-       if (ops->validate) {
-               const struct nft_data *data = NULL;
-
-               err = ops->validate(ctx, expr, &data);
-               if (err < 0)
-                       goto err2;
-       }
-
        return 0;
-
-err2:
-       if (ops->destroy)
-               ops->destroy(ctx, expr);
 err1:
        expr->ops = NULL;
        return err;
@@ -1973,7 +2002,7 @@ static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain,
        struct nft_rule *rule;
 
        // FIXME: this sucks
-       list_for_each_entry(rule, &chain->rules, list) {
+       list_for_each_entry_rcu(rule, &chain->rules, list) {
                if (handle == rule->handle)
                        return rule;
        }
@@ -2169,6 +2198,7 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb)
        return 0;
 }
 
+/* called with rcu_read_lock held */
 static int nf_tables_getrule(struct net *net, struct sock *nlsk,
                             struct sk_buff *skb, const struct nlmsghdr *nlh,
                             const struct nlattr * const nla[],
@@ -2187,18 +2217,19 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
                struct netlink_dump_control c = {
                        .dump = nf_tables_dump_rules,
                        .done = nf_tables_dump_rules_done,
+                       .module = THIS_MODULE,
                };
 
                if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) {
                        struct nft_rule_dump_ctx *ctx;
 
-                       ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+                       ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
                        if (!ctx)
                                return -ENOMEM;
 
                        if (nla[NFTA_RULE_TABLE]) {
                                ctx->table = nla_strdup(nla[NFTA_RULE_TABLE],
-                                                       GFP_KERNEL);
+                                                       GFP_ATOMIC);
                                if (!ctx->table) {
                                        kfree(ctx);
                                        return -ENOMEM;
@@ -2206,7 +2237,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
                        }
                        if (nla[NFTA_RULE_CHAIN]) {
                                ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN],
-                                                       GFP_KERNEL);
+                                                       GFP_ATOMIC);
                                if (!ctx->chain) {
                                        kfree(ctx->table);
                                        kfree(ctx);
@@ -2216,7 +2247,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
                        c.data = ctx;
                }
 
-               return netlink_dump_start(nlsk, skb, nlh, &c);
+               return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
        }
 
        table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
@@ -2237,7 +2268,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
                return PTR_ERR(rule);
        }
 
-       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!skb2)
                return -ENOMEM;
 
@@ -2278,6 +2309,53 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx,
        nf_tables_rule_destroy(ctx, rule);
 }
 
+int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
+{
+       struct nft_expr *expr, *last;
+       const struct nft_data *data;
+       struct nft_rule *rule;
+       int err;
+
+       list_for_each_entry(rule, &chain->rules, list) {
+               if (!nft_is_active_next(ctx->net, rule))
+                       continue;
+
+               nft_rule_for_each_expr(expr, last, rule) {
+                       if (!expr->ops->validate)
+                               continue;
+
+                       err = expr->ops->validate(ctx, expr, &data);
+                       if (err < 0)
+                               return err;
+               }
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(nft_chain_validate);
+
+static int nft_table_validate(struct net *net, const struct nft_table *table)
+{
+       struct nft_chain *chain;
+       struct nft_ctx ctx = {
+               .net    = net,
+               .family = table->family,
+       };
+       int err;
+
+       list_for_each_entry(chain, &table->chains, list) {
+               if (!nft_is_base_chain(chain))
+                       continue;
+
+               ctx.chain = chain;
+               err = nft_chain_validate(&ctx, chain);
+               if (err < 0)
+                       return err;
+       }
+
+       return 0;
+}
+
 #define NFT_RULE_MAXEXPRS      128
 
 static struct nft_expr_info *info;
@@ -2405,6 +2483,10 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
                err = nf_tables_newexpr(&ctx, &info[i], expr);
                if (err < 0)
                        goto err2;
+
+               if (info[i].ops->validate)
+                       nft_validate_state_update(net, NFT_VALIDATE_NEED);
+
                info[i].ops = NULL;
                expr = nft_expr_next(expr);
        }
@@ -2448,8 +2530,11 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
                }
        }
        chain->use++;
-       return 0;
 
+       if (net->nft.validate_state == NFT_VALIDATE_DO)
+               return nft_table_validate(net, table);
+
+       return 0;
 err2:
        nf_tables_rule_release(&ctx, rule);
 err1:
@@ -2708,7 +2793,7 @@ static struct nft_set *nft_set_lookup(const struct nft_table *table,
        if (nla == NULL)
                return ERR_PTR(-EINVAL);
 
-       list_for_each_entry(set, &table->sets, list) {
+       list_for_each_entry_rcu(set, &table->sets, list) {
                if (!nla_strcmp(nla, set->name) &&
                    nft_active_genmask(set, genmask))
                        return set;
@@ -2834,7 +2919,7 @@ static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
        return 0;
 }
 
-static u64 nf_jiffies64_to_msecs(u64 input)
+static __be64 nf_jiffies64_to_msecs(u64 input)
 {
        u64 ms = jiffies64_to_nsecs(input);
 
@@ -3013,6 +3098,7 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb)
        return 0;
 }
 
+/* called with rcu_read_lock held */
 static int nf_tables_getset(struct net *net, struct sock *nlsk,
                            struct sk_buff *skb, const struct nlmsghdr *nlh,
                            const struct nlattr * const nla[],
@@ -3035,17 +3121,18 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
                struct netlink_dump_control c = {
                        .dump = nf_tables_dump_sets,
                        .done = nf_tables_dump_sets_done,
+                       .module = THIS_MODULE,
                };
                struct nft_ctx *ctx_dump;
 
-               ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_KERNEL);
+               ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_ATOMIC);
                if (ctx_dump == NULL)
                        return -ENOMEM;
 
                *ctx_dump = ctx;
                c.data = ctx_dump;
 
-               return netlink_dump_start(nlsk, skb, nlh, &c);
+               return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
        }
 
        /* Only accept unspec with dump */
@@ -3058,7 +3145,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
        if (IS_ERR(set))
                return PTR_ERR(set);
 
-       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (skb2 == NULL)
                return -ENOMEM;
 
@@ -3272,6 +3359,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
        }
 
        INIT_LIST_HEAD(&set->bindings);
+       set->table = table;
+       write_pnet(&set->net, net);
        set->ops   = ops;
        set->ktype = ktype;
        set->klen  = desc.klen;
@@ -3799,7 +3888,7 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
        ext = nft_set_elem_ext(set, &elem);
 
        err = -ENOMEM;
-       skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+       skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (skb == NULL)
                goto err1;
 
@@ -3821,6 +3910,7 @@ err1:
        return err == -EAGAIN ? -ENOBUFS : err;
 }
 
+/* called with rcu_read_lock held */
 static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
                                struct sk_buff *skb, const struct nlmsghdr *nlh,
                                const struct nlattr * const nla[],
@@ -3845,10 +3935,11 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
                struct netlink_dump_control c = {
                        .dump = nf_tables_dump_set,
                        .done = nf_tables_dump_set_done,
+                       .module = THIS_MODULE,
                };
                struct nft_set_dump_ctx *dump_ctx;
 
-               dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL);
+               dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_ATOMIC);
                if (!dump_ctx)
                        return -ENOMEM;
 
@@ -3856,7 +3947,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
                dump_ctx->ctx = ctx;
 
                c.data = dump_ctx;
-               return netlink_dump_start(nlsk, skb, nlh, &c);
+               return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
        }
 
        if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
@@ -3947,12 +4038,24 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
                          bool destroy_expr)
 {
        struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
+       struct nft_ctx ctx = {
+               .net    = read_pnet(&set->net),
+               .family = set->table->family,
+       };
 
        nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
        if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
                nft_data_release(nft_set_ext_data(ext), set->dtype);
-       if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
-               nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext));
+       if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) {
+               struct nft_expr *expr = nft_set_ext_expr(ext);
+
+               if (expr->ops->destroy_clone) {
+                       expr->ops->destroy_clone(&ctx, expr);
+                       module_put(expr->ops->type->owner);
+               } else {
+                       nf_tables_expr_destroy(&ctx, expr);
+               }
+       }
        if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
                (*nft_set_ext_obj(ext))->use--;
        kfree(elem);
@@ -3962,12 +4065,13 @@ EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
 /* Only called from commit path, nft_set_elem_deactivate() already deals with
  * the refcounting from the preparation phase.
  */
-static void nf_tables_set_elem_destroy(const struct nft_set *set, void *elem)
+static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
+                                      const struct nft_set *set, void *elem)
 {
        struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
 
        if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
-               nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext));
+               nf_tables_expr_destroy(ctx, nft_set_ext_expr(ext));
        kfree(elem);
 }
 
@@ -4087,6 +4191,12 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
                                                          d2.type, d2.len);
                        if (err < 0)
                                goto err3;
+
+                       if (d2.type == NFT_DATA_VERDICT &&
+                           (data.verdict.code == NFT_GOTO ||
+                            data.verdict.code == NFT_JUMP))
+                               nft_validate_state_update(ctx->net,
+                                                         NFT_VALIDATE_NEED);
                }
 
                nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len);
@@ -4186,7 +4296,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
        const struct nlattr *attr;
        struct nft_set *set;
        struct nft_ctx ctx;
-       int rem, err = 0;
+       int rem, err;
 
        if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
                return -EINVAL;
@@ -4207,9 +4317,13 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
        nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
                err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags);
                if (err < 0)
-                       break;
+                       return err;
        }
-       return err;
+
+       if (net->nft.validate_state == NFT_VALIDATE_DO)
+               return nft_table_validate(net, ctx.table);
+
+       return 0;
 }
 
 /**
@@ -4479,7 +4593,7 @@ struct nft_object *nft_obj_lookup(const struct nft_table *table,
 {
        struct nft_object *obj;
 
-       list_for_each_entry(obj, &table->objects, list) {
+       list_for_each_entry_rcu(obj, &table->objects, list) {
                if (!nla_strcmp(nla, obj->name) &&
                    objtype == obj->ops->type->type &&
                    nft_active_genmask(obj, genmask))
@@ -4688,7 +4802,7 @@ err3:
        kfree(obj->name);
 err2:
        if (obj->ops->destroy)
-               obj->ops->destroy(obj);
+               obj->ops->destroy(&ctx, obj);
        kfree(obj);
 err1:
        module_put(type->owner);
@@ -4809,12 +4923,12 @@ nft_obj_filter_alloc(const struct nlattr * const nla[])
 {
        struct nft_obj_filter *filter;
 
-       filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+       filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
        if (!filter)
                return ERR_PTR(-ENOMEM);
 
        if (nla[NFTA_OBJ_TABLE]) {
-               filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_KERNEL);
+               filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
                if (!filter->table) {
                        kfree(filter);
                        return ERR_PTR(-ENOMEM);
@@ -4826,6 +4940,7 @@ nft_obj_filter_alloc(const struct nlattr * const nla[])
        return filter;
 }
 
+/* called with rcu_read_lock held */
 static int nf_tables_getobj(struct net *net, struct sock *nlsk,
                            struct sk_buff *skb, const struct nlmsghdr *nlh,
                            const struct nlattr * const nla[],
@@ -4845,6 +4960,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
                struct netlink_dump_control c = {
                        .dump = nf_tables_dump_obj,
                        .done = nf_tables_dump_obj_done,
+                       .module = THIS_MODULE,
                };
 
                if (nla[NFTA_OBJ_TABLE] ||
@@ -4857,7 +4973,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
 
                        c.data = filter;
                }
-               return netlink_dump_start(nlsk, skb, nlh, &c);
+               return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
        }
 
        if (!nla[NFTA_OBJ_NAME] ||
@@ -4877,7 +4993,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
                return PTR_ERR(obj);
        }
 
-       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!skb2)
                return -ENOMEM;
 
@@ -4896,10 +5012,10 @@ err:
        return err;
 }
 
-static void nft_obj_destroy(struct nft_object *obj)
+static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
 {
        if (obj->ops->destroy)
-               obj->ops->destroy(obj);
+               obj->ops->destroy(ctx, obj);
 
        module_put(obj->ops->type->owner);
        kfree(obj->name);
@@ -5022,7 +5138,7 @@ struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table,
 {
        struct nft_flowtable *flowtable;
 
-       list_for_each_entry(flowtable, &table->flowtables, list) {
+       list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
                if (!nla_strcmp(nla, flowtable->name) &&
                    nft_active_genmask(flowtable, genmask))
                        return flowtable;
@@ -5483,13 +5599,13 @@ nft_flowtable_filter_alloc(const struct nlattr * const nla[])
 {
        struct nft_flowtable_filter *filter;
 
-       filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+       filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
        if (!filter)
                return ERR_PTR(-ENOMEM);
 
        if (nla[NFTA_FLOWTABLE_TABLE]) {
                filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE],
-                                          GFP_KERNEL);
+                                          GFP_ATOMIC);
                if (!filter->table) {
                        kfree(filter);
                        return ERR_PTR(-ENOMEM);
@@ -5498,6 +5614,7 @@ nft_flowtable_filter_alloc(const struct nlattr * const nla[])
        return filter;
 }
 
+/* called with rcu_read_lock held */
 static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
                                  struct sk_buff *skb,
                                  const struct nlmsghdr *nlh,
@@ -5516,6 +5633,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
                struct netlink_dump_control c = {
                        .dump = nf_tables_dump_flowtable,
                        .done = nf_tables_dump_flowtable_done,
+                       .module = THIS_MODULE,
                };
 
                if (nla[NFTA_FLOWTABLE_TABLE]) {
@@ -5527,7 +5645,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
 
                        c.data = filter;
                }
-               return netlink_dump_start(nlsk, skb, nlh, &c);
+               return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
        }
 
        if (!nla[NFTA_FLOWTABLE_NAME])
@@ -5543,7 +5661,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
        if (IS_ERR(flowtable))
                return PTR_ERR(flowtable);
 
-       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!skb2)
                return -ENOMEM;
 
@@ -5707,7 +5825,7 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk,
        struct sk_buff *skb2;
        int err;
 
-       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (skb2 == NULL)
                return -ENOMEM;
 
@@ -5729,7 +5847,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
                .policy         = nft_table_policy,
        },
        [NFT_MSG_GETTABLE] = {
-               .call           = nf_tables_gettable,
+               .call_rcu       = nf_tables_gettable,
                .attr_count     = NFTA_TABLE_MAX,
                .policy         = nft_table_policy,
        },
@@ -5744,7 +5862,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
                .policy         = nft_chain_policy,
        },
        [NFT_MSG_GETCHAIN] = {
-               .call           = nf_tables_getchain,
+               .call_rcu       = nf_tables_getchain,
                .attr_count     = NFTA_CHAIN_MAX,
                .policy         = nft_chain_policy,
        },
@@ -5759,7 +5877,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
                .policy         = nft_rule_policy,
        },
        [NFT_MSG_GETRULE] = {
-               .call           = nf_tables_getrule,
+               .call_rcu       = nf_tables_getrule,
                .attr_count     = NFTA_RULE_MAX,
                .policy         = nft_rule_policy,
        },
@@ -5774,7 +5892,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
                .policy         = nft_set_policy,
        },
        [NFT_MSG_GETSET] = {
-               .call           = nf_tables_getset,
+               .call_rcu       = nf_tables_getset,
                .attr_count     = NFTA_SET_MAX,
                .policy         = nft_set_policy,
        },
@@ -5789,7 +5907,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
                .policy         = nft_set_elem_list_policy,
        },
        [NFT_MSG_GETSETELEM] = {
-               .call           = nf_tables_getsetelem,
+               .call_rcu       = nf_tables_getsetelem,
                .attr_count     = NFTA_SET_ELEM_LIST_MAX,
                .policy         = nft_set_elem_list_policy,
        },
@@ -5799,7 +5917,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
                .policy         = nft_set_elem_list_policy,
        },
        [NFT_MSG_GETGEN] = {
-               .call           = nf_tables_getgen,
+               .call_rcu       = nf_tables_getgen,
        },
        [NFT_MSG_NEWOBJ] = {
                .call_batch     = nf_tables_newobj,
@@ -5807,7 +5925,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
                .policy         = nft_obj_policy,
        },
        [NFT_MSG_GETOBJ] = {
-               .call           = nf_tables_getobj,
+               .call_rcu       = nf_tables_getobj,
                .attr_count     = NFTA_OBJ_MAX,
                .policy         = nft_obj_policy,
        },
@@ -5817,7 +5935,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
                .policy         = nft_obj_policy,
        },
        [NFT_MSG_GETOBJ_RESET] = {
-               .call           = nf_tables_getobj,
+               .call_rcu       = nf_tables_getobj,
                .attr_count     = NFTA_OBJ_MAX,
                .policy         = nft_obj_policy,
        },
@@ -5827,7 +5945,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
                .policy         = nft_flowtable_policy,
        },
        [NFT_MSG_GETFLOWTABLE] = {
-               .call           = nf_tables_getflowtable,
+               .call_rcu       = nf_tables_getflowtable,
                .attr_count     = NFTA_FLOWTABLE_MAX,
                .policy         = nft_flowtable_policy,
        },
@@ -5838,6 +5956,27 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
        },
 };
 
+static int nf_tables_validate(struct net *net)
+{
+       struct nft_table *table;
+
+       switch (net->nft.validate_state) {
+       case NFT_VALIDATE_SKIP:
+               break;
+       case NFT_VALIDATE_NEED:
+               nft_validate_state_update(net, NFT_VALIDATE_DO);
+               /* fall through */
+       case NFT_VALIDATE_DO:
+               list_for_each_entry(table, &net->nft.tables, list) {
+                       if (nft_table_validate(net, table) < 0)
+                               return -EAGAIN;
+               }
+               break;
+       }
+
+       return 0;
+}
+
 static void nft_chain_commit_update(struct nft_trans *trans)
 {
        struct nft_base_chain *basechain;
@@ -5875,11 +6014,12 @@ static void nft_commit_release(struct nft_trans *trans)
                nft_set_destroy(nft_trans_set(trans));
                break;
        case NFT_MSG_DELSETELEM:
-               nf_tables_set_elem_destroy(nft_trans_elem_set(trans),
+               nf_tables_set_elem_destroy(&trans->ctx,
+                                          nft_trans_elem_set(trans),
                                           nft_trans_elem(trans).priv);
                break;
        case NFT_MSG_DELOBJ:
-               nft_obj_destroy(nft_trans_obj(trans));
+               nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
                break;
        case NFT_MSG_DELFLOWTABLE:
                nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
@@ -5903,21 +6043,166 @@ static void nf_tables_commit_release(struct net *net)
        }
 }
 
+static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain)
+{
+       struct nft_rule *rule;
+       unsigned int alloc = 0;
+       int i;
+
+       /* already handled or inactive chain? */
+       if (chain->rules_next || !nft_is_active_next(net, chain))
+               return 0;
+
+       rule = list_entry(&chain->rules, struct nft_rule, list);
+       i = 0;
+
+       list_for_each_entry_continue(rule, &chain->rules, list) {
+               if (nft_is_active_next(net, rule))
+                       alloc++;
+       }
+
+       chain->rules_next = nf_tables_chain_alloc_rules(chain, alloc);
+       if (!chain->rules_next)
+               return -ENOMEM;
+
+       list_for_each_entry_continue(rule, &chain->rules, list) {
+               if (nft_is_active_next(net, rule))
+                       chain->rules_next[i++] = rule;
+       }
+
+       chain->rules_next[i] = NULL;
+       return 0;
+}
+
+static void nf_tables_commit_chain_prepare_cancel(struct net *net)
+{
+       struct nft_trans *trans, *next;
+
+       list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+               struct nft_chain *chain = trans->ctx.chain;
+
+               if (trans->msg_type == NFT_MSG_NEWRULE ||
+                   trans->msg_type == NFT_MSG_DELRULE) {
+                       kvfree(chain->rules_next);
+                       chain->rules_next = NULL;
+               }
+       }
+}
+
+static void __nf_tables_commit_chain_free_rules_old(struct rcu_head *h)
+{
+       struct nft_rules_old *o = container_of(h, struct nft_rules_old, h);
+
+       kvfree(o->start);
+}
+
+static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules)
+{
+       struct nft_rule **r = rules;
+       struct nft_rules_old *old;
+
+       while (*r)
+               r++;
+
+       r++;    /* rcu_head is after end marker */
+       old = (void *) r;
+       old->start = rules;
+
+       call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old);
+}
+
+static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *chain)
+{
+       struct nft_rule **g0, **g1;
+       bool next_genbit;
+
+       next_genbit = nft_gencursor_next(net);
+
+       g0 = rcu_dereference_protected(chain->rules_gen_0,
+                                      lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+       g1 = rcu_dereference_protected(chain->rules_gen_1,
+                                      lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+
+       /* No changes to this chain? */
+       if (chain->rules_next == NULL) {
+               /* chain had no change in last or next generation */
+               if (g0 == g1)
+                       return;
+               /*
+                * chain had no change in this generation; make sure next
+                * one uses same rules as current generation.
+                */
+               if (next_genbit) {
+                       rcu_assign_pointer(chain->rules_gen_1, g0);
+                       nf_tables_commit_chain_free_rules_old(g1);
+               } else {
+                       rcu_assign_pointer(chain->rules_gen_0, g1);
+                       nf_tables_commit_chain_free_rules_old(g0);
+               }
+
+               return;
+       }
+
+       if (next_genbit)
+               rcu_assign_pointer(chain->rules_gen_1, chain->rules_next);
+       else
+               rcu_assign_pointer(chain->rules_gen_0, chain->rules_next);
+
+       chain->rules_next = NULL;
+
+       if (g0 == g1)
+               return;
+
+       if (next_genbit)
+               nf_tables_commit_chain_free_rules_old(g1);
+       else
+               nf_tables_commit_chain_free_rules_old(g0);
+}
+
 static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 {
        struct nft_trans *trans, *next;
        struct nft_trans_elem *te;
+       struct nft_chain *chain;
+       struct nft_table *table;
 
-       /* Bump generation counter, invalidate any dump in progress */
-       while (++net->nft.base_seq == 0);
+       /* 0. Validate ruleset, otherwise roll back for error reporting. */
+       if (nf_tables_validate(net) < 0)
+               return -EAGAIN;
 
-       /* A new generation has just started */
-       net->nft.gencursor = nft_gencursor_next(net);
+       /* 1.  Allocate space for next generation rules_gen_X[] */
+       list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+               int ret;
+
+               if (trans->msg_type == NFT_MSG_NEWRULE ||
+                   trans->msg_type == NFT_MSG_DELRULE) {
+                       chain = trans->ctx.chain;
+
+                       ret = nf_tables_commit_chain_prepare(net, chain);
+                       if (ret < 0) {
+                               nf_tables_commit_chain_prepare_cancel(net);
+                               return ret;
+                       }
+               }
+       }
+
+       /* step 2.  Make rules_gen_X visible to packet path */
+       list_for_each_entry(table, &net->nft.tables, list) {
+               list_for_each_entry(chain, &table->chains, list) {
+                       if (!nft_is_active_next(net, chain))
+                               continue;
+                       nf_tables_commit_chain_active(net, chain);
+               }
+       }
 
-       /* Make sure all packets have left the previous generation before
-        * purging old rules.
+       /*
+        * Bump generation counter, invalidate any dump in progress.
+        * Cannot fail after this point.
         */
-       synchronize_rcu();
+       while (++net->nft.base_seq == 0);
+
+       /* step 3. Start new generation, rules_gen_X now in use. */
+       net->nft.gencursor = nft_gencursor_next(net);
 
        list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
                switch (trans->msg_type) {
@@ -6059,7 +6344,7 @@ static void nf_tables_abort_release(struct nft_trans *trans)
                                     nft_trans_elem(trans).priv, true);
                break;
        case NFT_MSG_NEWOBJ:
-               nft_obj_destroy(nft_trans_obj(trans));
+               nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
                break;
        case NFT_MSG_NEWFLOWTABLE:
                nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
@@ -6179,6 +6464,11 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
        return 0;
 }
 
+static void nf_tables_cleanup(struct net *net)
+{
+       nft_validate_state_update(net, NFT_VALIDATE_SKIP);
+}
+
 static bool nf_tables_valid_genid(struct net *net, u32 genid)
 {
        return net->nft.base_seq == genid;
@@ -6191,6 +6481,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
        .cb             = nf_tables_cb,
        .commit         = nf_tables_commit,
        .abort          = nf_tables_abort,
+       .cleanup        = nf_tables_cleanup,
        .valid_genid    = nf_tables_valid_genid,
 };
 
@@ -6274,19 +6565,18 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
 
        list_for_each_entry(rule, &chain->rules, list) {
                nft_rule_for_each_expr(expr, last, rule) {
-                       const struct nft_data *data = NULL;
+                       struct nft_immediate_expr *priv;
+                       const struct nft_data *data;
                        int err;
 
-                       if (!expr->ops->validate)
+                       if (strcmp(expr->ops->type->name, "immediate"))
                                continue;
 
-                       err = expr->ops->validate(ctx, expr, &data);
-                       if (err < 0)
-                               return err;
-
-                       if (data == NULL)
+                       priv = nft_expr_priv(expr);
+                       if (priv->dreg != NFT_REG_VERDICT)
                                continue;
 
+                       data = &priv->data;
                        switch (data->verdict.code) {
                        case NFT_JUMP:
                        case NFT_GOTO:
@@ -6748,7 +7038,7 @@ static void __nft_release_tables(struct net *net)
                list_for_each_entry_safe(obj, ne, &table->objects, list) {
                        list_del(&obj->list);
                        table->use--;
-                       nft_obj_destroy(obj);
+                       nft_obj_destroy(&ctx, obj);
                }
                list_for_each_entry_safe(chain, nc, &table->chains, list) {
                        ctx.chain = chain;
@@ -6766,6 +7056,8 @@ static int __net_init nf_tables_init_net(struct net *net)
        INIT_LIST_HEAD(&net->nft.tables);
        INIT_LIST_HEAD(&net->nft.commit_list);
        net->nft.base_seq = 1;
+       net->nft.validate_state = NFT_VALIDATE_SKIP;
+
        return 0;
 }