1 // SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
 *
 * Development of this code funded by Astaro AG (http://www.astaro.com/)
 */
8 #include <linux/kernel.h>
9 #include <linux/init.h>
10 #include <linux/module.h>
11 #include <linux/list.h>
12 #include <linux/rbtree.h>
13 #include <linux/netlink.h>
14 #include <linux/netfilter.h>
15 #include <linux/netfilter/nf_tables.h>
16 #include <net/netfilter/nf_tables.h>
22 struct delayed_work gc_work;
25 struct nft_rbtree_elem {
27 struct nft_set_ext ext;
30 static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
32 return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
33 (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
36 static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
37 const struct nft_rbtree_elem *interval)
39 return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
42 static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
43 const u32 *key, const struct nft_set_ext **ext,
46 struct nft_rbtree *priv = nft_set_priv(set);
47 const struct nft_rbtree_elem *rbe, *interval = NULL;
48 u8 genmask = nft_genmask_cur(net);
49 const struct rb_node *parent;
53 parent = rcu_dereference_raw(priv->root.rb_node);
54 while (parent != NULL) {
55 if (read_seqcount_retry(&priv->count, seq))
58 rbe = rb_entry(parent, struct nft_rbtree_elem, node);
60 this = nft_set_ext_key(&rbe->ext);
61 d = memcmp(this, key, set->klen);
63 parent = rcu_dereference_raw(parent->rb_left);
65 nft_rbtree_equal(set, this, interval) &&
66 nft_rbtree_interval_end(rbe) &&
67 !nft_rbtree_interval_end(interval))
71 parent = rcu_dereference_raw(parent->rb_right);
73 if (!nft_set_elem_active(&rbe->ext, genmask)) {
74 parent = rcu_dereference_raw(parent->rb_left);
77 if (nft_rbtree_interval_end(rbe))
85 if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
86 nft_set_elem_active(&interval->ext, genmask) &&
87 !nft_rbtree_interval_end(interval)) {
88 *ext = &interval->ext;
95 static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
96 const u32 *key, const struct nft_set_ext **ext)
98 struct nft_rbtree *priv = nft_set_priv(set);
99 unsigned int seq = read_seqcount_begin(&priv->count);
102 ret = __nft_rbtree_lookup(net, set, key, ext, seq);
103 if (ret || !read_seqcount_retry(&priv->count, seq))
106 read_lock_bh(&priv->lock);
107 seq = read_seqcount_begin(&priv->count);
108 ret = __nft_rbtree_lookup(net, set, key, ext, seq);
109 read_unlock_bh(&priv->lock);
114 static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
115 const u32 *key, struct nft_rbtree_elem **elem,
116 unsigned int seq, unsigned int flags, u8 genmask)
118 struct nft_rbtree_elem *rbe, *interval = NULL;
119 struct nft_rbtree *priv = nft_set_priv(set);
120 const struct rb_node *parent;
124 parent = rcu_dereference_raw(priv->root.rb_node);
125 while (parent != NULL) {
126 if (read_seqcount_retry(&priv->count, seq))
129 rbe = rb_entry(parent, struct nft_rbtree_elem, node);
131 this = nft_set_ext_key(&rbe->ext);
132 d = memcmp(this, key, set->klen);
134 parent = rcu_dereference_raw(parent->rb_left);
135 if (!(flags & NFT_SET_ELEM_INTERVAL_END))
138 parent = rcu_dereference_raw(parent->rb_right);
139 if (flags & NFT_SET_ELEM_INTERVAL_END)
142 if (!nft_set_elem_active(&rbe->ext, genmask))
143 parent = rcu_dereference_raw(parent->rb_left);
145 if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
146 (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
147 (flags & NFT_SET_ELEM_INTERVAL_END)) {
155 if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
156 nft_set_elem_active(&interval->ext, genmask) &&
157 ((!nft_rbtree_interval_end(interval) &&
158 !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
159 (nft_rbtree_interval_end(interval) &&
160 (flags & NFT_SET_ELEM_INTERVAL_END)))) {
168 static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
169 const struct nft_set_elem *elem, unsigned int flags)
171 struct nft_rbtree *priv = nft_set_priv(set);
172 unsigned int seq = read_seqcount_begin(&priv->count);
173 struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT);
174 const u32 *key = (const u32 *)&elem->key.val;
175 u8 genmask = nft_genmask_cur(net);
178 ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
179 if (ret || !read_seqcount_retry(&priv->count, seq))
182 read_lock_bh(&priv->lock);
183 seq = read_seqcount_begin(&priv->count);
184 ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
186 rbe = ERR_PTR(-ENOENT);
187 read_unlock_bh(&priv->lock);
192 static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
193 struct nft_rbtree_elem *new,
194 struct nft_set_ext **ext)
196 struct nft_rbtree *priv = nft_set_priv(set);
197 u8 genmask = nft_genmask_next(net);
198 struct nft_rbtree_elem *rbe;
199 struct rb_node *parent, **p;
203 p = &priv->root.rb_node;
206 rbe = rb_entry(parent, struct nft_rbtree_elem, node);
207 d = memcmp(nft_set_ext_key(&rbe->ext),
208 nft_set_ext_key(&new->ext),
211 p = &parent->rb_left;
213 p = &parent->rb_right;
215 if (nft_rbtree_interval_end(rbe) &&
216 !nft_rbtree_interval_end(new)) {
217 p = &parent->rb_left;
218 } else if (!nft_rbtree_interval_end(rbe) &&
219 nft_rbtree_interval_end(new)) {
220 p = &parent->rb_right;
221 } else if (nft_set_elem_active(&rbe->ext, genmask)) {
225 p = &parent->rb_left;
229 rb_link_node_rcu(&new->node, parent, p);
230 rb_insert_color(&new->node, &priv->root);
234 static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
235 const struct nft_set_elem *elem,
236 struct nft_set_ext **ext)
238 struct nft_rbtree *priv = nft_set_priv(set);
239 struct nft_rbtree_elem *rbe = elem->priv;
242 write_lock_bh(&priv->lock);
243 write_seqcount_begin(&priv->count);
244 err = __nft_rbtree_insert(net, set, rbe, ext);
245 write_seqcount_end(&priv->count);
246 write_unlock_bh(&priv->lock);
251 static void nft_rbtree_remove(const struct net *net,
252 const struct nft_set *set,
253 const struct nft_set_elem *elem)
255 struct nft_rbtree *priv = nft_set_priv(set);
256 struct nft_rbtree_elem *rbe = elem->priv;
258 write_lock_bh(&priv->lock);
259 write_seqcount_begin(&priv->count);
260 rb_erase(&rbe->node, &priv->root);
261 write_seqcount_end(&priv->count);
262 write_unlock_bh(&priv->lock);
265 static void nft_rbtree_activate(const struct net *net,
266 const struct nft_set *set,
267 const struct nft_set_elem *elem)
269 struct nft_rbtree_elem *rbe = elem->priv;
271 nft_set_elem_change_active(net, set, &rbe->ext);
272 nft_set_elem_clear_busy(&rbe->ext);
275 static bool nft_rbtree_flush(const struct net *net,
276 const struct nft_set *set, void *priv)
278 struct nft_rbtree_elem *rbe = priv;
280 if (!nft_set_elem_mark_busy(&rbe->ext) ||
281 !nft_is_active(net, &rbe->ext)) {
282 nft_set_elem_change_active(net, set, &rbe->ext);
288 static void *nft_rbtree_deactivate(const struct net *net,
289 const struct nft_set *set,
290 const struct nft_set_elem *elem)
292 const struct nft_rbtree *priv = nft_set_priv(set);
293 const struct rb_node *parent = priv->root.rb_node;
294 struct nft_rbtree_elem *rbe, *this = elem->priv;
295 u8 genmask = nft_genmask_next(net);
298 while (parent != NULL) {
299 rbe = rb_entry(parent, struct nft_rbtree_elem, node);
301 d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
304 parent = parent->rb_left;
306 parent = parent->rb_right;
308 if (nft_rbtree_interval_end(rbe) &&
309 !nft_rbtree_interval_end(this)) {
310 parent = parent->rb_left;
312 } else if (!nft_rbtree_interval_end(rbe) &&
313 nft_rbtree_interval_end(this)) {
314 parent = parent->rb_right;
316 } else if (!nft_set_elem_active(&rbe->ext, genmask)) {
317 parent = parent->rb_left;
320 nft_rbtree_flush(net, set, rbe);
327 static void nft_rbtree_walk(const struct nft_ctx *ctx,
329 struct nft_set_iter *iter)
331 struct nft_rbtree *priv = nft_set_priv(set);
332 struct nft_rbtree_elem *rbe;
333 struct nft_set_elem elem;
334 struct rb_node *node;
336 read_lock_bh(&priv->lock);
337 for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
338 rbe = rb_entry(node, struct nft_rbtree_elem, node);
340 if (iter->count < iter->skip)
342 if (!nft_set_elem_active(&rbe->ext, iter->genmask))
347 iter->err = iter->fn(ctx, set, iter, &elem);
349 read_unlock_bh(&priv->lock);
355 read_unlock_bh(&priv->lock);
358 static void nft_rbtree_gc(struct work_struct *work)
360 struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
361 struct nft_set_gc_batch *gcb = NULL;
362 struct nft_rbtree *priv;
363 struct rb_node *node;
366 priv = container_of(work, struct nft_rbtree, gc_work.work);
367 set = nft_set_container_of(priv);
369 write_lock_bh(&priv->lock);
370 write_seqcount_begin(&priv->count);
371 for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
372 rbe = rb_entry(node, struct nft_rbtree_elem, node);
374 if (nft_rbtree_interval_end(rbe)) {
378 if (!nft_set_elem_expired(&rbe->ext))
380 if (nft_set_elem_mark_busy(&rbe->ext))
384 rb_erase(&rbe_prev->node, &priv->root);
387 gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
391 atomic_dec(&set->nelems);
392 nft_set_gc_batch_add(gcb, rbe);
396 atomic_dec(&set->nelems);
397 nft_set_gc_batch_add(gcb, rbe_end);
398 rb_erase(&rbe_end->node, &priv->root);
401 node = rb_next(node);
406 rb_erase(&rbe_prev->node, &priv->root);
407 write_seqcount_end(&priv->count);
408 write_unlock_bh(&priv->lock);
410 nft_set_gc_batch_complete(gcb);
412 queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
413 nft_set_gc_interval(set));
416 static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
417 const struct nft_set_desc *desc)
419 return sizeof(struct nft_rbtree);
422 static int nft_rbtree_init(const struct nft_set *set,
423 const struct nft_set_desc *desc,
424 const struct nlattr * const nla[])
426 struct nft_rbtree *priv = nft_set_priv(set);
428 rwlock_init(&priv->lock);
429 seqcount_init(&priv->count);
430 priv->root = RB_ROOT;
432 INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
433 if (set->flags & NFT_SET_TIMEOUT)
434 queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
435 nft_set_gc_interval(set));
440 static void nft_rbtree_destroy(const struct nft_set *set)
442 struct nft_rbtree *priv = nft_set_priv(set);
443 struct nft_rbtree_elem *rbe;
444 struct rb_node *node;
446 cancel_delayed_work_sync(&priv->gc_work);
448 while ((node = priv->root.rb_node) != NULL) {
449 rb_erase(node, &priv->root);
450 rbe = rb_entry(node, struct nft_rbtree_elem, node);
451 nft_set_elem_destroy(set, rbe, true);
455 static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
456 struct nft_set_estimate *est)
459 est->size = sizeof(struct nft_rbtree) +
460 desc->size * sizeof(struct nft_rbtree_elem);
464 est->lookup = NFT_SET_CLASS_O_LOG_N;
465 est->space = NFT_SET_CLASS_O_N;
470 struct nft_set_type nft_set_rbtree_type __read_mostly = {
471 .owner = THIS_MODULE,
472 .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
474 .privsize = nft_rbtree_privsize,
475 .elemsize = offsetof(struct nft_rbtree_elem, ext),
476 .estimate = nft_rbtree_estimate,
477 .init = nft_rbtree_init,
478 .destroy = nft_rbtree_destroy,
479 .insert = nft_rbtree_insert,
480 .remove = nft_rbtree_remove,
481 .deactivate = nft_rbtree_deactivate,
482 .flush = nft_rbtree_flush,
483 .activate = nft_rbtree_activate,
484 .lookup = nft_rbtree_lookup,
485 .walk = nft_rbtree_walk,
486 .get = nft_rbtree_get,