nexthop: Add support for IPv4 nexthops
authorDavid Ahern <dsahern@gmail.com>
Fri, 24 May 2019 21:43:05 +0000 (14:43 -0700)
committerDavid S. Miller <davem@davemloft.net>
Wed, 29 May 2019 04:37:30 +0000 (21:37 -0700)
Add support for IPv4 nexthops. If nh_family is set to AF_INET, then
NHA_GATEWAY is expected to be an IPv4 address.

Register for netdev events to be notified of admin up/down changes as
well as deletes. A hash table is used to track nexthop per devices to
quickly convert device events to the affected nexthops.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/net/nexthop.h
net/ipv4/nexthop.c

index 18e1f51..c0e4b0d 100644 (file)
@@ -29,6 +29,10 @@ struct nh_config {
        int             nh_ifindex;
        struct net_device *dev;
 
+       union {
+               __be32          ipv4;
+       } gw;
+
        u32             nlflags;
        struct nl_info  nlinfo;
 };
@@ -42,6 +46,7 @@ struct nh_info {
 
        union {
                struct fib_nh_common    fib_nhc;
+               struct fib_nh           fib_nh;
        };
 };
 
index ec0ccf2..79c7b34 100644 (file)
@@ -9,8 +9,12 @@
 #include <linux/rtnetlink.h>
 #include <linux/slab.h>
 #include <net/nexthop.h>
+#include <net/route.h>
 #include <net/sock.h>
 
+#define NH_DEV_HASHBITS  8
+#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
+
 static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
        [NHA_UNSPEC]            = { .strict_start_type = NHA_UNSPEC + 1 },
        [NHA_ID]                = { .type = NLA_U32 },
@@ -25,12 +29,39 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
        [NHA_MASTER]            = { .type = NLA_U32 },
 };
 
+static unsigned int nh_dev_hashfn(unsigned int val)
+{
+       unsigned int mask = NH_DEV_HASHSIZE - 1;
+
+       return (val ^
+               (val >> NH_DEV_HASHBITS) ^
+               (val >> (NH_DEV_HASHBITS * 2))) & mask;
+}
+
+static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
+{
+       struct net_device *dev = nhi->fib_nhc.nhc_dev;
+       struct hlist_head *head;
+       unsigned int hash;
+
+       WARN_ON(!dev);
+
+       hash = nh_dev_hashfn(dev->ifindex);
+       head = &net->nexthop.devhash[hash];
+       hlist_add_head(&nhi->dev_hash, head);
+}
+
 void nexthop_free_rcu(struct rcu_head *head)
 {
        struct nexthop *nh = container_of(head, struct nexthop, rcu);
        struct nh_info *nhi;
 
        nhi = rcu_dereference_raw(nh->nh_info);
+       switch (nhi->family) {
+       case AF_INET:
+               fib_nh_release(nh->net, &nhi->fib_nh);
+               break;
+       }
        kfree(nhi);
 
        kfree(nh);
@@ -96,6 +127,7 @@ static u32 nh_find_unused_id(struct net *net)
 static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
                        int event, u32 portid, u32 seq, unsigned int nlflags)
 {
+       struct fib_nh *fib_nh;
        struct nlmsghdr *nlh;
        struct nh_info *nhi;
        struct nhmsg *nhm;
@@ -120,6 +152,22 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
                if (nla_put_flag(skb, NHA_BLACKHOLE))
                        goto nla_put_failure;
                goto out;
+       } else {
+               const struct net_device *dev;
+
+               dev = nhi->fib_nhc.nhc_dev;
+               if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex))
+                       goto nla_put_failure;
+       }
+
+       nhm->nh_scope = nhi->fib_nhc.nhc_scope;
+       switch (nhi->family) {
+       case AF_INET:
+               fib_nh = &nhi->fib_nh;
+               if (fib_nh->fib_nh_gw_family &&
+                   nla_put_u32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
+                       goto nla_put_failure;
+               break;
        }
 
 out:
@@ -132,6 +180,7 @@ nla_put_failure:
 
 static size_t nh_nlmsg_size(struct nexthop *nh)
 {
+       struct nh_info *nhi = rtnl_dereference(nh->nh_info);
        size_t sz = nla_total_size(4);    /* NHA_ID */
 
        /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
@@ -139,6 +188,13 @@ static size_t nh_nlmsg_size(struct nexthop *nh)
         */
        sz += nla_total_size(4);  /* NHA_OIF */
 
+       switch (nhi->family) {
+       case AF_INET:
+               if (nhi->fib_nh.fib_nh_gw_family)
+                       sz += nla_total_size(4);  /* NHA_GATEWAY */
+               break;
+       }
+
        return sz;
 }
 
@@ -169,6 +225,15 @@ errout:
                rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
 }
 
+static void __remove_nexthop(struct net *net, struct nexthop *nh)
+{
+       struct nh_info *nhi;
+
+       nhi = rtnl_dereference(nh->nh_info);
+       if (nhi->fib_nhc.nhc_dev)
+               hlist_del(&nhi->dev_hash);
+}
+
 static void remove_nexthop(struct net *net, struct nexthop *nh,
                           bool skip_fib, struct nl_info *nlinfo)
 {
@@ -178,6 +243,7 @@ static void remove_nexthop(struct net *net, struct nexthop *nh,
        if (nlinfo)
                nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
 
+       __remove_nexthop(net, nh);
        nh_base_seq_inc(net);
 
        nexthop_put(nh);
@@ -244,6 +310,24 @@ out:
        return rc;
 }
 
+/* rtnl */
+/* remove all nexthops tied to a device being deleted */
+static void nexthop_flush_dev(struct net_device *dev)
+{
+       unsigned int hash = nh_dev_hashfn(dev->ifindex);
+       struct net *net = dev_net(dev);
+       struct hlist_head *head = &net->nexthop.devhash[hash];
+       struct hlist_node *n;
+       struct nh_info *nhi;
+
+       hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
+               if (nhi->fib_nhc.nhc_dev != dev)
+                       continue;
+
+               remove_nexthop(net, nhi->nh_parent, false, NULL);
+       }
+}
+
 /* rtnl; called when net namespace is deleted */
 static void flush_all_nexthops(struct net *net)
 {
@@ -258,6 +342,38 @@ static void flush_all_nexthops(struct net *net)
        }
 }
 
+static int nh_create_ipv4(struct net *net, struct nexthop *nh,
+                         struct nh_info *nhi, struct nh_config *cfg,
+                         struct netlink_ext_ack *extack)
+{
+       struct fib_nh *fib_nh = &nhi->fib_nh;
+       struct fib_config fib_cfg = {
+               .fc_oif   = cfg->nh_ifindex,
+               .fc_gw4   = cfg->gw.ipv4,
+               .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0,
+               .fc_flags = cfg->nh_flags,
+       };
+       u32 tb_id = l3mdev_fib_table(cfg->dev);
+       int err = -EINVAL;
+
+       err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
+       if (err) {
+               fib_nh_release(net, fib_nh);
+               goto out;
+       }
+
+       /* sets nh_dev if successful */
+       err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
+       if (!err) {
+               nh->nh_flags = fib_nh->fib_nh_flags;
+               fib_info_update_nh_saddr(net, fib_nh, fib_nh->fib_nh_scope);
+       } else {
+               fib_nh_release(net, fib_nh);
+       }
+out:
+       return err;
+}
+
 static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
                                      struct netlink_ext_ack *extack)
 {
@@ -287,12 +403,21 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
                cfg->nh_ifindex = net->loopback_dev->ifindex;
        }
 
+       switch (cfg->nh_family) {
+       case AF_INET:
+               err = nh_create_ipv4(net, nh, nhi, cfg, extack);
+               break;
+       }
+
        if (err) {
                kfree(nhi);
                kfree(nh);
                return ERR_PTR(err);
        }
 
+       /* add the entry to the device based hash */
+       nexthop_devhash_add(net, nhi);
+
        rcu_assign_pointer(nh->nh_info, nhi);
 
        return nh;
@@ -329,6 +454,7 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
 
        err = insert_nexthop(net, nh, cfg, extack);
        if (err) {
+               __remove_nexthop(net, nh);
                nexthop_put(nh);
                nh = ERR_PTR(err);
        }
@@ -360,6 +486,8 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
        }
 
        switch (nhm->nh_family) {
+       case AF_INET:
+               break;
        default:
                NL_SET_ERR_MSG(extack, "Invalid address family");
                goto out;
@@ -416,6 +544,32 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
                goto out;
        }
 
+       err = -EINVAL;
+       if (tb[NHA_GATEWAY]) {
+               struct nlattr *gwa = tb[NHA_GATEWAY];
+
+               switch (cfg->nh_family) {
+               case AF_INET:
+                       if (nla_len(gwa) != sizeof(u32)) {
+                               NL_SET_ERR_MSG(extack, "Invalid gateway");
+                               goto out;
+                       }
+                       cfg->gw.ipv4 = nla_get_be32(gwa);
+                       break;
+               default:
+                       NL_SET_ERR_MSG(extack,
+                                      "Unknown address family for gateway");
+                       goto out;
+               }
+       } else {
+               /* device only nexthop (no gateway) */
+               if (cfg->nh_flags & RTNH_F_ONLINK) {
+                       NL_SET_ERR_MSG(extack,
+                                      "ONLINK flag can not be set for nexthop without a gateway");
+                       goto out;
+               }
+       }
+
        err = 0;
 out:
        return err;
@@ -683,16 +837,68 @@ out_err:
        return err;
 }
 
+static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
+{
+       unsigned int hash = nh_dev_hashfn(dev->ifindex);
+       struct net *net = dev_net(dev);
+       struct hlist_head *head = &net->nexthop.devhash[hash];
+       struct hlist_node *n;
+       struct nh_info *nhi;
+
+       hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
+               if (nhi->fib_nhc.nhc_dev == dev) {
+                       if (nhi->family == AF_INET)
+                               fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu,
+                                                  orig_mtu);
+               }
+       }
+}
+
+/* rtnl */
+static int nh_netdev_event(struct notifier_block *this,
+                          unsigned long event, void *ptr)
+{
+       struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+       struct netdev_notifier_info_ext *info_ext;
+
+       switch (event) {
+       case NETDEV_DOWN:
+       case NETDEV_UNREGISTER:
+               nexthop_flush_dev(dev);
+               break;
+       case NETDEV_CHANGE:
+               if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
+                       nexthop_flush_dev(dev);
+               break;
+       case NETDEV_CHANGEMTU:
+               info_ext = ptr;
+               nexthop_sync_mtu(dev, info_ext->ext.mtu);
+               rt_cache_flush(dev_net(dev));
+               break;
+       }
+       return NOTIFY_DONE;
+}
+
+static struct notifier_block nh_netdev_notifier = {
+       .notifier_call = nh_netdev_event,
+};
+
 static void __net_exit nexthop_net_exit(struct net *net)
 {
        rtnl_lock();
        flush_all_nexthops(net);
        rtnl_unlock();
+       kfree(net->nexthop.devhash);
 }
 
 static int __net_init nexthop_net_init(struct net *net)
 {
+       size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE;
+
        net->nexthop.rb_root = RB_ROOT;
+       net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
+       if (!net->nexthop.devhash)
+               return -ENOMEM;
 
        return 0;
 }
@@ -706,6 +912,8 @@ static int __init nexthop_init(void)
 {
        register_pernet_subsys(&nexthop_net_ops);
 
+       register_netdevice_notifier(&nh_netdev_notifier);
+
        rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
        rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0);
        rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop,