1 // SPDX-License-Identifier: GPL-2.0-only
3 * Vxlan vni filter for collect metadata mode
5 * Authors: Roopa Prabhu <roopa@nvidia.com>
9 #include <linux/kernel.h>
10 #include <linux/slab.h>
11 #include <linux/etherdevice.h>
12 #include <linux/rhashtable.h>
13 #include <net/rtnetlink.h>
14 #include <net/net_namespace.h>
16 #include <net/vxlan.h>
18 #include "vxlan_private.h"
/* rhashtable compare callback for the per-device VNI table: the lookup key
 * is a __be32 VNI.  Returns 0 on match, non-zero otherwise (rhashtable
 * convention: this is a "differs" predicate, not a boolean "equals").
 * NOTE(review): this copy of the file is missing interior lines (the second
 * parameter and braces); consult the complete source before editing logic.
 */
20 static inline int vxlan_vni_cmp(struct rhashtable_compare_arg *arg,
23 const struct vxlan_vni_node *vnode = ptr;
24 __be32 vni = *(__be32 *)arg->key;
26 return vnode->vni != vni;
/* Parameters for the per-device VNI hash table: entries are
 * struct vxlan_vni_node, keyed by the big-endian VNI stored in ->vni.
 */
29 const struct rhashtable_params vxlan_vni_rht_params = {
30 .head_offset = offsetof(struct vxlan_vni_node, vnode),
31 .key_offset = offsetof(struct vxlan_vni_node, vni),
32 .key_len = sizeof(__be32),
/* cap at the full 24-bit VNI space */
34 .max_size = VXLAN_N_VID,
35 .obj_cmpfn = vxlan_vni_cmp,
36 .automatic_shrinking = true,
/* Re-link (or unlink) one VNI node in the underlying vxlan sockets'
 * per-VNI hash chains, serialized by vn->sock_lock.  The node is first
 * unhashed from both the v4 and v6 chains, then re-added to the device's
 * current sockets.
 * NOTE(review): the branch deciding add vs. delete (presumably guarded by
 * a "del" argument whose declaration is in lines missing from this copy)
 * is not visible here — confirm against the complete source.
 */
39 static void vxlan_vs_add_del_vninode(struct vxlan_dev *vxlan,
40 struct vxlan_vni_node *v,
43 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
44 struct vxlan_dev_node *node;
45 struct vxlan_sock *vs;
47 spin_lock(&vn->sock_lock);
/* detach from any previous socket hash chains first */
49 if (!hlist_unhashed(&v->hlist4.hlist))
50 hlist_del_init_rcu(&v->hlist4.hlist);
51 #if IS_ENABLED(CONFIG_IPV6)
52 if (!hlist_unhashed(&v->hlist6.hlist))
53 hlist_del_init_rcu(&v->hlist6.hlist);
/* re-hash onto the device's current v6 and v4 sockets (RTNL held) */
58 #if IS_ENABLED(CONFIG_IPV6)
59 vs = rtnl_dereference(vxlan->vn6_sock);
62 hlist_add_head_rcu(&node->hlist, vni_head(vs, v->vni));
65 vs = rtnl_dereference(vxlan->vn4_sock);
68 hlist_add_head_rcu(&node->hlist, vni_head(vs, v->vni));
71 spin_unlock(&vn->sock_lock);
/* When a vxlan socket comes up: hash every VNI node of this device into
 * @vs's per-VNI lists so receive-path lookups by VNI find the device.
 * NOTE(review): the lines choosing hlist4 vs hlist6 for "node" are missing
 * from this copy of the file.
 */
74 void vxlan_vs_add_vnigrp(struct vxlan_dev *vxlan,
75 struct vxlan_sock *vs,
78 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
79 struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp);
80 struct vxlan_vni_node *v, *tmp;
81 struct vxlan_dev_node *node;
86 spin_lock(&vn->sock_lock);
87 list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) {
88 #if IS_ENABLED(CONFIG_IPV6)
95 hlist_add_head_rcu(&node->hlist, vni_head(vs, v->vni));
97 spin_unlock(&vn->sock_lock);
/* When the vxlan socket goes down: unhash every VNI node of this device
 * from the socket's per-VNI lists, under vn->sock_lock.
 */
100 void vxlan_vs_del_vnigrp(struct vxlan_dev *vxlan)
102 struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp);
103 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
104 struct vxlan_vni_node *v, *tmp;
109 spin_lock(&vn->sock_lock);
110 list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) {
111 hlist_del_init_rcu(&v->hlist4.hlist);
112 #if IS_ENABLED(CONFIG_IPV6)
113 hlist_del_init_rcu(&v->hlist6.hlist);
116 spin_unlock(&vn->sock_lock);
/* Distance between two VNIs in host byte order; callers appear to pass
 * vend >= vbegin (unsigned result would wrap otherwise).
 */
119 static u32 vnirange(struct vxlan_vni_node *vbegin,
120 struct vxlan_vni_node *vend)
122 return (be32_to_cpu(vend->vni) - be32_to_cpu(vbegin->vni));
/* Worst-case netlink message size for a single VNI-filter entry
 * notification: tunnel_msg header plus one nested entry with start/end
 * and the larger (IPv6) group address attribute.
 */
125 static size_t vxlan_vnifilter_entry_nlmsg_size(void)
127 return NLMSG_ALIGN(sizeof(struct tunnel_msg))
128 + nla_total_size(0) /* VXLAN_VNIFILTER_ENTRY */
129 + nla_total_size(sizeof(u32)) /* VXLAN_VNIFILTER_ENTRY_START */
130 + nla_total_size(sizeof(u32)) /* VXLAN_VNIFILTER_ENTRY_END */
131 + nla_total_size(sizeof(struct in6_addr));/* VXLAN_VNIFILTER_ENTRY_GROUP{6} */
/* Emit one nested VXLAN_VNIFILTER_ENTRY attribute for the range
 * [vbegin, vend], including the per-range remote/group address when set.
 * Returns true on success; any nla_put failure cancels the nest and
 * returns false (the goto labels/returns are in lines missing from this
 * copy of the file).
 */
134 static bool vxlan_fill_vni_filter_entry(struct sk_buff *skb,
135 struct vxlan_vni_node *vbegin,
136 struct vxlan_vni_node *vend)
138 struct nlattr *ventry;
139 u32 vs = be32_to_cpu(vbegin->vni);
143 ve = be32_to_cpu(vend->vni);
145 ventry = nla_nest_start(skb, VXLAN_VNIFILTER_ENTRY);
149 if (nla_put_u32(skb, VXLAN_VNIFILTER_ENTRY_START, vs))
/* END attribute only emitted when ve is non-zero (true range) */
152 if (ve && nla_put_u32(skb, VXLAN_VNIFILTER_ENTRY_END, ve))
155 if (!vxlan_addr_any(&vbegin->remote_ip)) {
156 if (vbegin->remote_ip.sa.sa_family == AF_INET) {
157 if (nla_put_in_addr(skb, VXLAN_VNIFILTER_ENTRY_GROUP,
158 vbegin->remote_ip.sin.sin_addr.s_addr))
160 #if IS_ENABLED(CONFIG_IPV6)
162 if (nla_put_in6_addr(skb, VXLAN_VNIFILTER_ENTRY_GROUP6,
163 &vbegin->remote_ip.sin6.sin6_addr))
169 nla_nest_end(skb, ventry);
/* error path: undo the partially built nest */
174 nla_nest_cancel(skb, ventry);
/* Broadcast an RTM_NEWTUNNEL/RTM_DELTUNNEL (@cmd) notification for one
 * VNI node to RTNLGRP_TUNNEL listeners; on any build failure the error
 * is reported via rtnl_set_sk_err (failure-path lines are missing from
 * this copy).
 */
179 static void vxlan_vnifilter_notify(const struct vxlan_dev *vxlan,
180 struct vxlan_vni_node *vninode, int cmd)
182 struct tunnel_msg *tmsg;
184 struct nlmsghdr *nlh;
185 struct net *net = dev_net(vxlan->dev);
188 skb = nlmsg_new(vxlan_vnifilter_entry_nlmsg_size(), GFP_KERNEL);
193 nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*tmsg), 0);
196 tmsg = nlmsg_data(nlh);
197 memset(tmsg, 0, sizeof(*tmsg));
198 tmsg->family = AF_BRIDGE;
199 tmsg->ifindex = vxlan->dev->ifindex;
/* single-VNI "range": begin == end == vninode */
201 if (!vxlan_fill_vni_filter_entry(skb, vninode, vninode))
205 rtnl_notify(skb, net, 0, RTNLGRP_TUNNEL, NULL, GFP_KERNEL);
210 rtnl_set_sk_err(net, RTNLGRP_TUNNEL, err);
/* Dump one device's VNI filter entries into @skb, compressing runs of
 * consecutive VNIs with identical remote IPs into [vbegin, vend] range
 * entries.  cb->args[1] holds the resume index across partial dumps;
 * -EMSGSIZE signals the caller to continue in the next dump call.
 * NOTE(review): several branches (skip-to-s_idx, range start/extend,
 * error labels) are in lines missing from this copy.
 */
215 static int vxlan_vnifilter_dump_dev(const struct net_device *dev,
217 struct netlink_callback *cb)
219 struct vxlan_vni_node *tmp, *v, *vbegin = NULL, *vend = NULL;
220 struct vxlan_dev *vxlan = netdev_priv(dev);
221 struct tunnel_msg *new_tmsg;
222 int idx = 0, s_idx = cb->args[1];
223 struct vxlan_vni_group *vg;
224 struct nlmsghdr *nlh;
/* only devices in vnifilter mode have anything to dump */
227 if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER))
230 /* RCU needed because of the vni locking rules (rcu || rtnl) */
231 vg = rcu_dereference(vxlan->vnigrp);
232 if (!vg || !vg->num_vnis)
235 nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
236 RTM_NEWTUNNEL, sizeof(*new_tmsg), NLM_F_MULTI);
239 new_tmsg = nlmsg_data(nlh);
240 memset(new_tmsg, 0, sizeof(*new_tmsg));
241 new_tmsg->family = PF_BRIDGE;
242 new_tmsg->ifindex = dev->ifindex;
244 list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) {
/* adjacent VNI with same remote ip: extend the current range */
254 if (vnirange(vend, v) == 1 &&
255 vxlan_addr_equal(&v->remote_ip, &vend->remote_ip)) {
258 if (!vxlan_fill_vni_filter_entry(skb, vbegin, vend)) {
262 idx += vnirange(vbegin, vend) + 1;
/* flush the trailing open range, if any */
269 if (!err && vbegin) {
270 if (!vxlan_fill_vni_filter_entry(skb, vbegin, vend))
/* remember where to resume only when the dump was cut short */
274 cb->args[1] = err ? idx : 0;
/* RTM_GETTUNNEL dump entry point: dump either a single device (when
 * tmsg->ifindex is set) or every vxlan device in the namespace, resuming
 * from cb->args[0].  -EMSGSIZE from the per-device dump means "continue
 * next call"; other errors terminate (surrounding rcu lock/unlock and
 * bookkeeping lines are missing from this copy).
 */
281 static int vxlan_vnifilter_dump(struct sk_buff *skb, struct netlink_callback *cb)
283 int idx = 0, err = 0, s_idx = cb->args[0];
284 struct net *net = sock_net(skb->sk);
285 struct tunnel_msg *tmsg;
286 struct net_device *dev;
288 tmsg = nlmsg_data(cb->nlh);
292 dev = dev_get_by_index_rcu(net, tmsg->ifindex);
297 err = vxlan_vnifilter_dump_dev(dev, skb, cb);
298 /* if the dump completed without an error we return 0 here */
299 if (err != -EMSGSIZE)
302 for_each_netdev_rcu(net, dev) {
303 if (!netif_is_vxlan(dev))
307 err = vxlan_vnifilter_dump_dev(dev, skb, cb);
308 if (err == -EMSGSIZE)
/* Validation policy for attributes nested inside VXLAN_VNIFILTER_ENTRY:
 * u32 start/end VNIs and a v4 or v6 group/remote address.
 */
325 static const struct nla_policy vni_filter_entry_policy[VXLAN_VNIFILTER_ENTRY_MAX + 1] = {
326 [VXLAN_VNIFILTER_ENTRY_START] = { .type = NLA_U32 },
327 [VXLAN_VNIFILTER_ENTRY_END] = { .type = NLA_U32 },
328 [VXLAN_VNIFILTER_ENTRY_GROUP] = { .type = NLA_BINARY,
329 .len = sizeof_field(struct iphdr, daddr) },
330 [VXLAN_VNIFILTER_ENTRY_GROUP6] = { .type = NLA_BINARY,
331 .len = sizeof(struct in6_addr) },
/* Top-level policy: the message carries nested VXLAN_VNIFILTER_ENTRY attrs. */
334 static const struct nla_policy vni_filter_policy[VXLAN_VNIFILTER_MAX + 1] = {
335 [VXLAN_VNIFILTER_ENTRY] = { .type = NLA_NESTED },
/* Maintain the all-zeros-MAC default FDB entry for @vni under the
 * per-bucket hash lock: when a (non-any) @remote_ip is given it is
 * installed/updated via vxlan_fdb_update(); otherwise a previously
 * installed @old_remote_ip entry is deleted.
 * NOTE(review): the mid-function control flow (returns/gotos between the
 * update and delete paths) is in lines missing from this copy.
 */
338 static int vxlan_update_default_fdb_entry(struct vxlan_dev *vxlan, __be32 vni,
339 union vxlan_addr *old_remote_ip,
340 union vxlan_addr *remote_ip,
341 struct netlink_ext_ack *extack)
343 struct vxlan_rdst *dst = &vxlan->default_dst;
347 hash_index = fdb_head_index(vxlan, all_zeros_mac, vni);
348 spin_lock_bh(&vxlan->hash_lock[hash_index]);
349 if (remote_ip && !vxlan_addr_any(remote_ip)) {
350 err = vxlan_fdb_update(vxlan, all_zeros_mac,
352 NUD_REACHABLE | NUD_PERMANENT,
353 NLM_F_APPEND | NLM_F_CREATE,
358 NTF_SELF, 0, true, extack);
360 spin_unlock_bh(&vxlan->hash_lock[hash_index]);
/* no new remote: purge the stale default entry, if one existed */
365 if (old_remote_ip && !vxlan_addr_any(old_remote_ip)) {
366 __vxlan_fdb_delete(vxlan, all_zeros_mac,
373 spin_unlock_bh(&vxlan->hash_lock[hash_index]);
/* Update the remote/group address of one VNI: refresh the default FDB
 * entry and, while the device is up, adjust multicast group membership
 * (leave the old group if no longer used, join the new one).
 * NOTE(review): several branches (newrip selection, error handling after
 * the fdb update, the *changed bookkeeping) are in lines missing from
 * this copy — confirm against the complete source before changing logic.
 */
378 static int vxlan_vni_update_group(struct vxlan_dev *vxlan,
379 struct vxlan_vni_node *vninode,
380 union vxlan_addr *group,
381 bool create, bool *changed,
382 struct netlink_ext_ack *extack)
384 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
385 struct vxlan_rdst *dst = &vxlan->default_dst;
386 union vxlan_addr *newrip = NULL, *oldrip = NULL;
387 union vxlan_addr old_remote_ip;
/* snapshot the current per-vni remote before it is overwritten below */
390 memcpy(&old_remote_ip, &vninode->remote_ip, sizeof(old_remote_ip));
392 /* if per vni remote ip is not present use vxlan dev
393 * default dst remote ip for fdb entry
395 if (group && !vxlan_addr_any(group)) {
398 if (!vxlan_addr_any(&dst->remote_ip))
399 newrip = &dst->remote_ip;
402 /* if old rip exists, and no newrip,
403 * explicitly delete old rip
405 if (!newrip && !vxlan_addr_any(&old_remote_ip))
406 oldrip = &old_remote_ip;
/* nothing to install or remove */
408 if (!newrip && !oldrip)
/* update with no effective change: skip the fdb churn */
411 if (!create && oldrip && newrip && vxlan_addr_equal(oldrip, newrip))
414 ret = vxlan_update_default_fdb_entry(vxlan, vninode->vni,
421 memcpy(&vninode->remote_ip, group, sizeof(vninode->remote_ip));
423 if (vxlan->dev->flags & IFF_UP) {
/* leave the old mcast group unless some other vni/dev still uses it */
424 if (vxlan_addr_multicast(&old_remote_ip) &&
425 !vxlan_group_used(vn, vxlan, vninode->vni,
427 vxlan->default_dst.remote_ifindex)) {
428 ret = vxlan_igmp_leave(vxlan, &old_remote_ip,
434 if (vxlan_addr_multicast(&vninode->remote_ip)) {
435 ret = vxlan_igmp_join(vxlan, &vninode->remote_ip, 0);
/* already joined is not an error for our purposes */
436 if (ret == -EADDRINUSE)
/* Device-level remote change: for every VNI that has no per-vni remote
 * of its own, refresh its default FDB entry from the device's old/new
 * remote ip.  Walks the sorted vni list in reverse.
 * NOTE(review): the error-handling lines after the fdb update are missing
 * from this copy.
 */
450 int vxlan_vnilist_update_group(struct vxlan_dev *vxlan,
451 union vxlan_addr *old_remote_ip,
452 union vxlan_addr *new_remote_ip,
453 struct netlink_ext_ack *extack)
455 struct list_head *headp, *hpos;
456 struct vxlan_vni_group *vg;
457 struct vxlan_vni_node *vent;
460 vg = rtnl_dereference(vxlan->vnigrp);
462 headp = &vg->vni_list;
463 list_for_each_prev(hpos, headp) {
464 vent = list_entry(hpos, struct vxlan_vni_node, vlist);
/* only vnis inheriting the device default remote are affected */
465 if (vxlan_addr_any(&vent->remote_ip)) {
466 ret = vxlan_update_default_fdb_entry(vxlan, vent->vni,
/* Tear down a VNI's forwarding state: remove the default FDB entry that
 * was installed for it (per-vni remote, or the inherited device default
 * remote), and leave its multicast group if no other user remains.
 */
478 static void vxlan_vni_delete_group(struct vxlan_dev *vxlan,
479 struct vxlan_vni_node *vninode)
481 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
482 struct vxlan_rdst *dst = &vxlan->default_dst;
484 /* if per vni remote_ip not present, delete the
485 * default dst remote_ip previously added for this vni
487 if (!vxlan_addr_any(&vninode->remote_ip) ||
488 !vxlan_addr_any(&dst->remote_ip))
489 __vxlan_fdb_delete(vxlan, all_zeros_mac,
490 (vxlan_addr_any(&vninode->remote_ip) ?
491 dst->remote_ip : vninode->remote_ip),
493 vninode->vni, vninode->vni,
497 if (vxlan->dev->flags & IFF_UP) {
/* leave the mcast group unless still referenced elsewhere */
498 if (vxlan_addr_multicast(&vninode->remote_ip) &&
499 !vxlan_group_used(vn, vxlan, vninode->vni,
501 dst->remote_ifindex)) {
502 vxlan_igmp_leave(vxlan, &vninode->remote_ip, 0);
/* Update an existing VNI entry's group address; sends an RTM_NEWTUNNEL
 * notification when the update actually changed something.
 * NOTE(review): the not-found and error/changed-check branches are in
 * lines missing from this copy.
 */
507 static int vxlan_vni_update(struct vxlan_dev *vxlan,
508 struct vxlan_vni_group *vg,
509 __be32 vni, union vxlan_addr *group,
511 struct netlink_ext_ack *extack)
513 struct vxlan_vni_node *vninode;
516 vninode = rhashtable_lookup_fast(&vg->vni_hash, &vni,
517 vxlan_vni_rht_params);
521 ret = vxlan_vni_update_group(vxlan, vninode, group, false, changed,
527 vxlan_vnifilter_notify(vxlan, vninode, RTM_NEWTUNNEL);
/* Insert @v into vg->vni_list keeping it sorted by ascending VNI:
 * scan from the tail toward the head and RCU-add after the insertion
 * point (the loop-exit lines between the compare and the add are
 * missing from this copy).
 */
532 static void __vxlan_vni_add_list(struct vxlan_vni_group *vg,
533 struct vxlan_vni_node *v)
535 struct list_head *headp, *hpos;
536 struct vxlan_vni_node *vent;
538 headp = &vg->vni_list;
539 list_for_each_prev(hpos, headp) {
540 vent = list_entry(hpos, struct vxlan_vni_node, vlist);
541 if (be32_to_cpu(v->vni) < be32_to_cpu(vent->vni))
546 list_add_rcu(&v->vlist, hpos);
/* Remove @v from the sorted vni list (RCU-safe; freeing happens later
 * via call_rcu in the callers).
 */
550 static void __vxlan_vni_del_list(struct vxlan_vni_group *vg,
551 struct vxlan_vni_node *v)
553 list_del_rcu(&v->vlist);
/* Allocate and initialize a VNI node; GFP_ATOMIC because callers may hold
 * spinlocks.  Both per-family dev_node back-pointers reference @vxlan.
 * NOTE(review): the vni assignment and return statements are in lines
 * missing from this copy.
 */
557 static struct vxlan_vni_node *vxlan_vni_alloc(struct vxlan_dev *vxlan,
560 struct vxlan_vni_node *vninode;
562 vninode = kzalloc(sizeof(*vninode), GFP_ATOMIC);
566 vninode->hlist4.vxlan = vxlan;
567 #if IS_ENABLED(CONFIG_IPV6)
568 vninode->hlist6.vxlan = vxlan;
/* Add (or, if already present, update) a single VNI on the device:
 * check the VNI is free, allocate a node, insert into the hash table and
 * sorted list, hook into the live socket chains when the device is up,
 * install group/fdb state, and notify listeners.
 * NOTE(review): error-unwind labels between these steps are in lines
 * missing from this copy.
 */
574 static int vxlan_vni_add(struct vxlan_dev *vxlan,
575 struct vxlan_vni_group *vg,
576 u32 vni, union vxlan_addr *group,
577 struct netlink_ext_ack *extack)
579 struct vxlan_vni_node *vninode;
580 __be32 v = cpu_to_be32(vni);
581 bool changed = false;
/* existing entry: treat the add as an update of its group */
584 if (vxlan_vnifilter_lookup(vxlan, v))
585 return vxlan_vni_update(vxlan, vg, v, group, &changed, extack);
587 err = vxlan_vni_in_use(vxlan->net, vxlan, &vxlan->cfg, v);
589 NL_SET_ERR_MSG(extack, "VNI in use");
593 vninode = vxlan_vni_alloc(vxlan, v);
597 err = rhashtable_lookup_insert_fast(&vg->vni_hash,
599 vxlan_vni_rht_params);
605 __vxlan_vni_add_list(vg, vninode);
/* device already up: make the vni immediately visible to rx lookups */
607 if (vxlan->dev->flags & IFF_UP)
608 vxlan_vs_add_del_vninode(vxlan, vninode, false);
610 err = vxlan_vni_update_group(vxlan, vninode, group, true, &changed,
614 vxlan_vnifilter_notify(vxlan, vninode, RTM_NEWTUNNEL);
/* RCU callback: free a VNI node after the grace period (the kfree line
 * itself is missing from this copy).
 */
619 static void vxlan_vni_node_rcu_free(struct rcu_head *rcu)
621 struct vxlan_vni_node *v;
623 v = container_of(rcu, struct vxlan_vni_node, rcu);
/* Delete a single VNI: tear down its group/fdb state, remove it from the
 * hash table and sorted list, notify listeners, detach from live socket
 * chains, and free via RCU.
 * NOTE(review): the not-found and remove-failure branches are in lines
 * missing from this copy; note @vg is re-derived from vxlan->vnigrp here.
 */
627 static int vxlan_vni_del(struct vxlan_dev *vxlan,
628 struct vxlan_vni_group *vg,
629 u32 vni, struct netlink_ext_ack *extack)
631 struct vxlan_vni_node *vninode;
632 __be32 v = cpu_to_be32(vni);
635 vg = rtnl_dereference(vxlan->vnigrp);
637 vninode = rhashtable_lookup_fast(&vg->vni_hash, &v,
638 vxlan_vni_rht_params);
644 vxlan_vni_delete_group(vxlan, vninode);
646 err = rhashtable_remove_fast(&vg->vni_hash,
648 vxlan_vni_rht_params);
652 __vxlan_vni_del_list(vg, vninode);
654 vxlan_vnifilter_notify(vxlan, vninode, RTM_DELTUNNEL);
656 if (vxlan->dev->flags & IFF_UP)
657 vxlan_vs_add_del_vninode(vxlan, vninode, true);
/* defer the kfree until concurrent RCU readers are done */
659 call_rcu(&vninode->rcu, vxlan_vni_node_rcu_free);
/* Apply an add or delete (@cmd) to every VNI in [start_vni, end_vni].
 * NOTE(review): the switch on @cmd and the per-iteration error handling
 * are in lines missing from this copy.
 */
666 static int vxlan_vni_add_del(struct vxlan_dev *vxlan, __u32 start_vni,
667 __u32 end_vni, union vxlan_addr *group,
668 int cmd, struct netlink_ext_ack *extack)
670 struct vxlan_vni_group *vg;
673 vg = rtnl_dereference(vxlan->vnigrp);
675 for (v = start_vni; v <= end_vni; v++) {
678 err = vxlan_vni_add(vxlan, vg, v, group, extack);
681 err = vxlan_vni_del(vxlan, vg, v, extack);
/* Parse one nested VXLAN_VNIFILTER_ENTRY attribute from userspace:
 * extract the start/end VNIs and optional v4/v6 group address, validate
 * them, and hand off to vxlan_vni_add_del().
 * NOTE(review): some validation/return lines (e.g. defaulting vni_end to
 * vni_start) are missing from this copy.
 */
696 static int vxlan_process_vni_filter(struct vxlan_dev *vxlan,
697 struct nlattr *nlvnifilter,
698 int cmd, struct netlink_ext_ack *extack)
700 struct nlattr *vattrs[VXLAN_VNIFILTER_ENTRY_MAX + 1];
701 u32 vni_start = 0, vni_end = 0;
702 union vxlan_addr group;
705 err = nla_parse_nested(vattrs,
706 VXLAN_VNIFILTER_ENTRY_MAX,
707 nlvnifilter, vni_filter_entry_policy,
712 if (vattrs[VXLAN_VNIFILTER_ENTRY_START]) {
713 vni_start = nla_get_u32(vattrs[VXLAN_VNIFILTER_ENTRY_START]);
717 if (vattrs[VXLAN_VNIFILTER_ENTRY_END])
718 vni_end = nla_get_u32(vattrs[VXLAN_VNIFILTER_ENTRY_END]);
720 if (!vni_start && !vni_end) {
721 NL_SET_ERR_MSG_ATTR(extack, nlvnifilter,
722 "vni start nor end found in vni entry")
726 if (vattrs[VXLAN_VNIFILTER_ENTRY_GROUP]) {
727 group.sin.sin_addr.s_addr =
728 nla_get_in_addr(vattrs[VXLAN_VNIFILTER_ENTRY_GROUP]);
729 group.sa.sa_family = AF_INET;
730 } else if (vattrs[VXLAN_VNIFILTER_ENTRY_GROUP6]) {
731 group.sin6.sin6_addr =
732 nla_get_in6_addr(vattrs[VXLAN_VNIFILTER_ENTRY_GROUP6]);
733 group.sa.sa_family = AF_INET6;
/* no group attribute: leave the address zeroed ("any") */
735 memset(&group, 0, sizeof(group));
/* multicast remotes need an egress interface to join on */
738 if (vxlan_addr_multicast(&group) && !vxlan->default_dst.remote_ifindex) {
739 NL_SET_ERR_MSG(extack,
740 "Local interface required for multicast remote group");
745 err = vxlan_vni_add_del(vxlan, vni_start, vni_end, &group, cmd,
/* Device teardown: remove every VNI node from hash, socket chains and the
 * sorted list, notify RTM_DELTUNNEL for each, free nodes via RCU, then
 * destroy the hash table (the final kfree of vg is in lines missing from
 * this copy).
 */
753 void vxlan_vnigroup_uninit(struct vxlan_dev *vxlan)
755 struct vxlan_vni_node *v, *tmp;
756 struct vxlan_vni_group *vg;
758 vg = rtnl_dereference(vxlan->vnigrp);
759 list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) {
760 rhashtable_remove_fast(&vg->vni_hash, &v->vnode,
761 vxlan_vni_rht_params);
762 hlist_del_init_rcu(&v->hlist4.hlist);
763 #if IS_ENABLED(CONFIG_IPV6)
764 hlist_del_init_rcu(&v->hlist6.hlist);
766 __vxlan_vni_del_list(vg, v);
767 vxlan_vnifilter_notify(vxlan, v, RTM_DELTUNNEL);
768 call_rcu(&v->rcu, vxlan_vni_node_rcu_free);
770 rhashtable_destroy(&vg->vni_hash);
/* Device setup: allocate the vni group, init its hash table and sorted
 * list, and publish it on vxlan->vnigrp (error-unwind lines are missing
 * from this copy).
 */
774 int vxlan_vnigroup_init(struct vxlan_dev *vxlan)
776 struct vxlan_vni_group *vg;
779 vg = kzalloc(sizeof(*vg), GFP_KERNEL);
782 ret = rhashtable_init(&vg->vni_hash, &vxlan_vni_rht_params);
787 INIT_LIST_HEAD(&vg->vni_list);
788 rcu_assign_pointer(vxlan->vnigrp, vg);
/* RTM_NEWTUNNEL/RTM_DELTUNNEL handler: validate the message, resolve the
 * target vxlan device (which must be in VNIFILTER mode), and process each
 * nested VXLAN_VNIFILTER_ENTRY attribute in turn.  A message containing
 * no entries is rejected with an extack error.
 */
793 static int vxlan_vnifilter_process(struct sk_buff *skb, struct nlmsghdr *nlh,
794 struct netlink_ext_ack *extack)
796 struct net *net = sock_net(skb->sk);
797 struct tunnel_msg *tmsg;
798 struct vxlan_dev *vxlan;
799 struct net_device *dev;
804 /* this should validate the header and check for remaining bytes */
805 err = nlmsg_parse(nlh, sizeof(*tmsg), NULL, VXLAN_VNIFILTER_MAX,
806 vni_filter_policy, extack);
810 tmsg = nlmsg_data(nlh);
811 dev = __dev_get_by_index(net, tmsg->ifindex);
815 if (!netif_is_vxlan(dev)) {
816 NL_SET_ERR_MSG_MOD(extack, "The device is not a vxlan device");
820 vxlan = netdev_priv(dev);
/* only meaningful for devices created with the vnifilter flag */
822 if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER))
825 nlmsg_for_each_attr(attr, nlh, sizeof(*tmsg), rem) {
826 switch (nla_type(attr)) {
827 case VXLAN_VNIFILTER_ENTRY:
828 err = vxlan_process_vni_filter(vxlan, attr,
829 nlh->nlmsg_type, extack);
840 NL_SET_ERR_MSG_MOD(extack, "No vnis found to process");
/* Register the PF_BRIDGE tunnel rtnetlink handlers: GET uses the dump
 * callback; NEW and DEL share the same doit processor (return values of
 * rtnl_register_module are not checked here — matches its void-like use
 * elsewhere, but worth confirming against the kernel version in use).
 */
847 void vxlan_vnifilter_init(void)
849 rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETTUNNEL, NULL,
850 vxlan_vnifilter_dump, 0);
851 rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWTUNNEL,
852 vxlan_vnifilter_process, NULL, 0);
853 rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELTUNNEL,
854 vxlan_vnifilter_process, NULL, 0);
/* Unregister the three tunnel rtnetlink handlers registered in
 * vxlan_vnifilter_init().
 */
857 void vxlan_vnifilter_uninit(void)
859 rtnl_unregister(PF_BRIDGE, RTM_GETTUNNEL);
860 rtnl_unregister(PF_BRIDGE, RTM_NEWTUNNEL);
861 rtnl_unregister(PF_BRIDGE, RTM_DELTUNNEL);