RDMA/nldev: Add support for RDMA monitoring
author Chiara Meiohas <cmeiohas@nvidia.com>
Mon, 9 Sep 2024 17:30:24 +0000 (20:30 +0300)
committer Leon Romanovsky <leon@kernel.org>
Fri, 13 Sep 2024 05:29:14 +0000 (08:29 +0300)
Introduce a new netlink command to allow RDMA event monitoring.
The RDMA events currently supported are IB device
registration/unregistration and net device attachment/detachment.

Example output of rdma monitor and the commands which trigger
the events:

$ rdma monitor
$ rmmod mlx5_ib
[UNREGISTER] dev 1 rocep8s0f1
[UNREGISTER] dev 0 rocep8s0f0

$ modprobe mlx5_ib
[REGISTER] dev 2 mlx5_0
[NETDEV_ATTACH] dev 2 mlx5_0 port 1 netdev 4 eth2
[REGISTER] dev 3 mlx5_1
[NETDEV_ATTACH] dev 3 mlx5_1 port 1 netdev 5 eth3

$ devlink dev eswitch set pci/0000:08:00.0 mode switchdev
[UNREGISTER] dev 2 rocep8s0f0
[REGISTER] dev 4 mlx5_0
[NETDEV_ATTACH] dev 4 mlx5_0 port 30 netdev 4 eth2

$ echo 4 > /sys/class/net/eth2/device/sriov_numvfs
[NETDEV_ATTACH] dev 4 rdmap8s0f0 port 2 netdev 7 eth4
[NETDEV_ATTACH] dev 4 rdmap8s0f0 port 3 netdev 8 eth5
[NETDEV_ATTACH] dev 4 rdmap8s0f0 port 4 netdev 9 eth6
[NETDEV_ATTACH] dev 4 rdmap8s0f0 port 5 netdev 10 eth7
[REGISTER] dev 5 mlx5_0
[NETDEV_ATTACH] dev 5 mlx5_0 port 1 netdev 11 eth8
[REGISTER] dev 6 mlx5_0
[NETDEV_ATTACH] dev 6 mlx5_0 port 1 netdev 12 eth9
[REGISTER] dev 7 mlx5_0
[NETDEV_ATTACH] dev 7 mlx5_0 port 1 netdev 13 eth10
[REGISTER] dev 8 mlx5_0
[NETDEV_ATTACH] dev 8 mlx5_0 port 1 netdev 14 eth11

$ echo 0 > /sys/class/net/eth2/device/sriov_numvfs
[UNREGISTER] dev 5 rocep8s0f0v0
[UNREGISTER] dev 6 rocep8s0f0v1
[UNREGISTER] dev 7 rocep8s0f0v2
[UNREGISTER] dev 8 rocep8s0f0v3
[NETDEV_DETACH] dev 4 rdmap8s0f0 port 2
[NETDEV_DETACH] dev 4 rdmap8s0f0 port 3
[NETDEV_DETACH] dev 4 rdmap8s0f0 port 4
[NETDEV_DETACH] dev 4 rdmap8s0f0 port 5

Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com>
Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
Link: https://patch.msgid.link/20240909173025.30422-7-michaelgur@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
drivers/infiniband/core/device.c
drivers/infiniband/core/netlink.c
drivers/infiniband/core/nldev.c
include/rdma/rdma_netlink.h
include/uapi/rdma/rdma_netlink.h

index 9e765c7..e029401 100644 (file)
@@ -1351,6 +1351,29 @@ static void prevent_dealloc_device(struct ib_device *ib_dev)
 {
 }
 
+/*
+ * Send the monitor events that describe a freshly registered device:
+ * one RDMA_REGISTER_EVENT for the device itself, followed by one
+ * RDMA_NETDEV_ATTACH_EVENT for every port that has a netdev.
+ * Stops at the first event that fails to be sent.
+ */
+static void ib_device_notify_register(struct ib_device *device)
+{
+       struct net_device *ndev;
+       u32 port;
+       int err;
+
+       if (rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT))
+               return;
+
+       rdma_for_each_port(device, port) {
+               ndev = ib_device_get_netdev(device, port);
+               if (ndev) {
+                       err = rdma_nl_notify_event(device, port,
+                                                  RDMA_NETDEV_ATTACH_EVENT);
+                       /* Drop the reference taken by ib_device_get_netdev() */
+                       dev_put(ndev);
+                       if (err)
+                               return;
+               }
+       }
+}
+
 /**
  * ib_register_device - Register an IB device with IB core
  * @device: Device to register
@@ -1449,6 +1472,8 @@ int ib_register_device(struct ib_device *device, const char *name,
        dev_set_uevent_suppress(&device->dev, false);
        /* Mark for userspace that device is ready */
        kobject_uevent(&device->dev.kobj, KOBJ_ADD);
+
+       ib_device_notify_register(device);
        ib_device_put(device);
 
        return 0;
@@ -1491,6 +1516,7 @@ static void __ib_unregister_device(struct ib_device *ib_dev)
                goto out;
 
        disable_device(ib_dev);
+       rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT);
 
        /* Expedite removing unregistered pointers from the hash table */
        free_netdevs(ib_dev);
@@ -2159,6 +2185,7 @@ static void add_ndev_hash(struct ib_port_data *pdata)
 int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
                         u32 port)
 {
+       enum rdma_nl_notify_event_type etype;
        struct net_device *old_ndev;
        struct ib_port_data *pdata;
        unsigned long flags;
@@ -2190,6 +2217,14 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
        spin_unlock_irqrestore(&pdata->netdev_lock, flags);
 
        add_ndev_hash(pdata);
+
+       /* Make sure that the device is registered before we send events */
+       if (xa_load(&devices, ib_dev->index) != ib_dev)
+               return 0;
+
+       etype = ndev ? RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT;
+       rdma_nl_notify_event(ib_dev, port, etype);
+
        return 0;
 }
 EXPORT_SYMBOL(ib_device_set_netdev);
index ae2db0c..def14c5 100644 (file)
@@ -311,6 +311,7 @@ int rdma_nl_net_init(struct rdma_dev_net *rnet)
        struct net *net = read_pnet(&rnet->net);
        struct netlink_kernel_cfg cfg = {
                .input  = rdma_nl_rcv,
+               .flags = NL_CFG_F_NONROOT_RECV,
        };
        struct sock *nls;
 
index 4d4a1f9..70b3fa0 100644 (file)
@@ -170,6 +170,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
        [RDMA_NLDEV_ATTR_DEV_TYPE]              = { .type = NLA_U8 },
        [RDMA_NLDEV_ATTR_PARENT_NAME]           = { .type = NLA_NUL_STRING },
        [RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE]      = { .type = NLA_U8 },
+       [RDMA_NLDEV_ATTR_EVENT_TYPE]            = { .type = NLA_U8 },
 };
 
 static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -2722,6 +2723,129 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
        },
 };
 
+/*
+ * Fill @msg with the device index/name and port index of @device:@port and,
+ * when the port has a netdev, that netdev's ifindex and name.
+ *
+ * If the port's netdev is not in @net, nothing is added to @msg and 0 is
+ * returned. Otherwise returns 0 on success or the first nla_put_*() error.
+ */
+static int fill_mon_netdev_association(struct sk_buff *msg,
+                                      struct ib_device *device, u32 port,
+                                      const struct net *net)
+{
+       struct net_device *ndev = ib_device_get_netdev(device, port);
+       int err = 0;
+
+       if (ndev && !net_eq(dev_net(ndev), net))
+               goto put_ndev;
+
+       err = nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index);
+       if (!err)
+               err = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME,
+                                    dev_name(&device->dev));
+       if (!err)
+               err = nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port);
+
+       /* The netdev attributes are only present when a netdev exists */
+       if (err || !ndev)
+               goto put_ndev;
+
+       err = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, ndev->ifindex);
+       if (!err)
+               err = nla_put_string(msg, RDMA_NLDEV_ATTR_NDEV_NAME,
+                                    ndev->name);
+
+put_ndev:
+       dev_put(ndev);
+       return err;
+}
+
+/*
+ * Emit a rate-limited warning that a monitor event of @type for @device
+ * (and @port_num, for the netdev events) could not be sent.
+ */
+static void rdma_nl_notify_err_msg(struct ib_device *device, u32 port_num,
+                                   enum rdma_nl_notify_event_type type)
+{
+       struct net_device *netdev;
+
+       switch (type) {
+       case RDMA_REGISTER_EVENT:
+               dev_warn_ratelimited(&device->dev,
+                                    "Failed to send RDMA monitor register device event\n");
+               break;
+       case RDMA_UNREGISTER_EVENT:
+               dev_warn_ratelimited(&device->dev,
+                                    "Failed to send RDMA monitor unregister device event\n");
+               break;
+       case RDMA_NETDEV_ATTACH_EVENT:
+               /*
+                * ib_device_get_netdev() may return NULL, so only report
+                * the ifindex when a netdev is actually available.
+                */
+               netdev = ib_device_get_netdev(device, port_num);
+               if (netdev) {
+                       dev_warn_ratelimited(&device->dev,
+                                            "Failed to send RDMA monitor netdev attach event: port %d netdev %d\n",
+                                            port_num, netdev->ifindex);
+               } else {
+                       dev_warn_ratelimited(&device->dev,
+                                            "Failed to send RDMA monitor netdev attach event: port %d\n",
+                                            port_num);
+               }
+               dev_put(netdev);
+               break;
+       case RDMA_NETDEV_DETACH_EVENT:
+               dev_warn_ratelimited(&device->dev,
+                                    "Failed to send RDMA monitor netdev detach event: port %d\n",
+                                    port_num);
+               break;
+       default:
+               break;
+       }
+}
+
+/*
+ * Build an RDMA_NLDEV_CMD_MONITOR message describing event @type on
+ * @device (and @port_num for the netdev events) and multicast it to the
+ * RDMA_NL_GROUP_NOTIFY group of the device's net namespace.
+ *
+ * A multicast result of -ESRCH is not treated as an error.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int rdma_nl_notify_event(struct ib_device *device, u32 port_num,
+                         enum rdma_nl_notify_event_type type)
+{
+       struct nlmsghdr *nlh;
+       struct sk_buff *skb;
+       struct net *net;
+       int ret = 0;
+
+       net = read_pnet(&device->coredev.rdma_net);
+       if (!net)
+               return -EINVAL;
+
+       skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+       if (!skb)
+               return -ENOMEM;
+       nlh = nlmsg_put(skb, 0, 0,
+                       RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_MONITOR),
+                       0, 0);
+       /* nlmsg_put() returns NULL on failure; never pass that to nlmsg_end() */
+       if (!nlh) {
+               ret = -EMSGSIZE;
+               goto err_free;
+       }
+
+       switch (type) {
+       case RDMA_REGISTER_EVENT:
+       case RDMA_UNREGISTER_EVENT:
+               ret = fill_nldev_handle(skb, device);
+               if (ret)
+                       goto err_free;
+               break;
+       case RDMA_NETDEV_ATTACH_EVENT:
+       case RDMA_NETDEV_DETACH_EVENT:
+               ret = fill_mon_netdev_association(skb, device,
+                                                 port_num, net);
+               if (ret)
+                       goto err_free;
+               break;
+       default:
+               break;
+       }
+
+       ret = nla_put_u8(skb, RDMA_NLDEV_ATTR_EVENT_TYPE, type);
+       if (ret)
+               goto err_free;
+
+       nlmsg_end(skb, nlh);
+       ret = rdma_nl_multicast(net, skb, RDMA_NL_GROUP_NOTIFY, GFP_KERNEL);
+       if (ret && ret != -ESRCH) {
+               skb = NULL; /* skb is freed in the netlink send-op handling */
+               goto err_free;
+       }
+       return 0;
+
+err_free:
+       rdma_nl_notify_err_msg(device, port_num, type);
+       nlmsg_free(skb);
+       return ret;
+}
+
 void __init nldev_init(void)
 {
        rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table);
index c2a79ae..326deaf 100644 (file)
@@ -6,6 +6,8 @@
 #include <linux/netlink.h>
 #include <uapi/rdma/rdma_netlink.h>
 
+struct ib_device;
+
 enum {
        RDMA_NLDEV_ATTR_EMPTY_STRING = 1,
        RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16,
@@ -110,6 +112,16 @@ int rdma_nl_multicast(struct net *net, struct sk_buff *skb,
  */
 bool rdma_nl_chk_listeners(unsigned int group);
 
+/**
+ * rdma_nl_notify_event - Prepare and send an RDMA monitor event message
+ * @ib: the IB device which triggered the event
+ * @port_num: the port number which triggered the event - 0 if unused
+ * @type: the event type
+ *
+ * Return: 0 on success or a negative error code
+ */
+int rdma_nl_notify_event(struct ib_device *ib, u32 port_num,
+                        enum rdma_nl_notify_event_type type);
+
 struct rdma_link_ops {
        struct list_head list;
        const char *type;
index 2f37568..5f9636d 100644 (file)
@@ -15,6 +15,7 @@ enum {
 enum {
        RDMA_NL_GROUP_IWPM = 2,
        RDMA_NL_GROUP_LS,
+       RDMA_NL_GROUP_NOTIFY,
        RDMA_NL_NUM_GROUPS
 };
 
@@ -305,6 +306,8 @@ enum rdma_nldev_command {
 
        RDMA_NLDEV_CMD_DELDEV,
 
+       RDMA_NLDEV_CMD_MONITOR,
+
        RDMA_NLDEV_NUM_OPS
 };
 
@@ -574,6 +577,8 @@ enum rdma_nldev_attr {
 
        RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE,       /* u8 */
 
+       RDMA_NLDEV_ATTR_EVENT_TYPE,             /* u8 */
+
        /*
         * Always the end
         */
@@ -624,4 +629,14 @@ enum rdma_nl_name_assign_type {
        RDMA_NAME_ASSIGN_TYPE_USER = 1, /* Provided by user-space */
 };
 
+/*
+ * Supported rdma monitoring event types.
+ *
+ * RDMA_REGISTER_EVENT / RDMA_UNREGISTER_EVENT report IB device
+ * registration and unregistration.
+ * RDMA_NETDEV_ATTACH_EVENT / RDMA_NETDEV_DETACH_EVENT report a net device
+ * being attached to or detached from a port of an IB device.
+ */
+enum rdma_nl_notify_event_type {
+       RDMA_REGISTER_EVENT,
+       RDMA_UNREGISTER_EVENT,
+       RDMA_NETDEV_ATTACH_EVENT,
+       RDMA_NETDEV_DETACH_EVENT,
+};
+
 #endif /* _UAPI_RDMA_NETLINK_H */