2 * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following conditions are met:
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following disclaimer.
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
33 #include <linux/netdevice.h>
34 #include <net/bonding.h>
35 #include <linux/mlx5/driver.h>
36 #include <linux/mlx5/eswitch.h>
37 #include <linux/mlx5/vport.h>
38 #include "lib/devcom.h"
39 #include "mlx5_core.h"
41 #include "esw/acl/ofld.h"
47 MLX5_LAG_EGRESS_PORT_1 = 1,
48 MLX5_LAG_EGRESS_PORT_2,
51 /* General purpose, use for short periods of time.
52 * Beware of lock dependencies (preferably, no locks should be acquired under it).
55 static DEFINE_SPINLOCK(lag_lock);
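/* Resolve the firmware port selection mode from the LAG mode and flags:
 * hash-based LAG uses the port selection flow table, MPESW has a dedicated
 * mode, and everything else falls back to queue affinity.
 */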
57 static int get_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags)
59 if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
60 return MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT;
62 if (mode == MLX5_LAG_MODE_MPESW)
63 return MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW;
65 return MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY;
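/* Build and execute the CREATE_LAG command. The tx_remap_affinity fields
 * carry the initial port mapping and are only meaningful in queue-affinity
 * mode; the other modes are expressed through port_select_mode.
 */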
68 static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 *ports, int mode,
71 bool fdb_sel_mode = test_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE,
73 int port_sel_mode = get_port_sel_mode(mode, flags);
74 u32 in[MLX5_ST_SZ_DW(create_lag_in)] = {};
77 lag_ctx = MLX5_ADDR_OF(create_lag_in, in, ctx);
78 MLX5_SET(create_lag_in, in, opcode, MLX5_CMD_OP_CREATE_LAG);
79 MLX5_SET(lagc, lag_ctx, fdb_selection_mode, fdb_sel_mode);
80 if (port_sel_mode == MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY) {
81 MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[0]);
82 MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[1]);
84 MLX5_SET(lagc, lag_ctx, port_select_mode, port_sel_mode);
86 return mlx5_cmd_exec_in(dev, create_lag, in);
89 static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 num_ports,
92 u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {};
93 void *lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx);
95 MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG);
96 MLX5_SET(modify_lag_in, in, field_select, 0x1);
98 MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[0]);
99 MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[1]);
101 return mlx5_cmd_exec_in(dev, modify_lag, in);
104 int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev)
106 u32 in[MLX5_ST_SZ_DW(create_vport_lag_in)] = {};
108 MLX5_SET(create_vport_lag_in, in, opcode, MLX5_CMD_OP_CREATE_VPORT_LAG);
110 return mlx5_cmd_exec_in(dev, create_vport_lag, in);
112 EXPORT_SYMBOL(mlx5_cmd_create_vport_lag);
114 int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev)
116 u32 in[MLX5_ST_SZ_DW(destroy_vport_lag_in)] = {};
118 MLX5_SET(destroy_vport_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_VPORT_LAG);
120 return mlx5_cmd_exec_in(dev, destroy_vport_lag, in);
122 EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag);
124 static void mlx5_infer_tx_disabled(struct lag_tracker *tracker, u8 num_ports,
125 u8 *ports, int *num_disabled)
130 for (i = 0; i < num_ports; i++) {
131 if (!tracker->netdev_state[i].tx_enabled ||
132 !tracker->netdev_state[i].link_up)
133 ports[(*num_disabled)++] = i;
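/* Collect the ports that can currently carry traffic (tx enabled and link
 * up). If none qualify, fall back to the disabled set so callers always get
 * a non-empty list to map to.
 */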
137 void mlx5_infer_tx_enabled(struct lag_tracker *tracker, u8 num_ports,
138 u8 *ports, int *num_enabled)
143 for (i = 0; i < num_ports; i++) {
144 if (tracker->netdev_state[i].tx_enabled &&
145 tracker->netdev_state[i].link_up)
146 ports[(*num_enabled)++] = i;
149 if (*num_enabled == 0)
150 mlx5_infer_tx_disabled(tracker, num_ports, ports, num_enabled);
153 static void mlx5_lag_print_mapping(struct mlx5_core_dev *dev,
154 struct mlx5_lag *ldev,
155 struct lag_tracker *tracker,
158 char buf[MLX5_MAX_PORTS * 10 + 1] = {};
159 u8 enabled_ports[MLX5_MAX_PORTS] = {};
167 if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) {
168 mlx5_infer_tx_enabled(tracker, ldev->ports, enabled_ports,
170 for (i = 0; i < num_enabled; i++) {
171 err = scnprintf(buf + written, 4, "%d, ", enabled_ports[i] + 1);
176 buf[written - 2] = 0;
177 mlx5_core_info(dev, "lag map active ports: %s\n", buf);
179 for (i = 0; i < ldev->ports; i++) {
180 for (j = 0; j < ldev->buckets; j++) {
181 idx = i * ldev->buckets + j;
182 err = scnprintf(buf + written, 10,
183 " port %d:%d", i + 1, ldev->v2p_map[idx]);
189 mlx5_core_info(dev, "lag map:%s\n", buf);
193 static int mlx5_lag_netdev_event(struct notifier_block *this,
194 unsigned long event, void *ptr);
195 static void mlx5_do_bond_work(struct work_struct *work);
197 static void mlx5_ldev_free(struct kref *ref)
199 struct mlx5_lag *ldev = container_of(ref, struct mlx5_lag, ref);
201 if (ldev->nb.notifier_call)
202 unregister_netdevice_notifier_net(&init_net, &ldev->nb);
203 mlx5_lag_mp_cleanup(ldev);
204 mlx5_lag_mpesw_cleanup(ldev);
205 cancel_work_sync(&ldev->mpesw_work);
206 destroy_workqueue(ldev->wq);
207 mutex_destroy(&ldev->lock);
211 static void mlx5_ldev_put(struct mlx5_lag *ldev)
213 kref_put(&ldev->ref, mlx5_ldev_free);
216 static void mlx5_ldev_get(struct mlx5_lag *ldev)
218 kref_get(&ldev->ref);
221 static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev)
223 struct mlx5_lag *ldev;
226 ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
230 ldev->wq = create_singlethread_workqueue("mlx5_lag");
236 kref_init(&ldev->ref);
237 mutex_init(&ldev->lock);
238 INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work);
240 ldev->nb.notifier_call = mlx5_lag_netdev_event;
241 if (register_netdevice_notifier_net(&init_net, &ldev->nb)) {
242 ldev->nb.notifier_call = NULL;
243 mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
245 ldev->mode = MLX5_LAG_MODE_NONE;
247 err = mlx5_lag_mp_init(ldev);
249 mlx5_core_err(dev, "Failed to init multipath lag err=%d\n",
252 mlx5_lag_mpesw_init(ldev);
253 ldev->ports = MLX5_CAP_GEN(dev, num_lag_ports);
259 int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev,
260 struct net_device *ndev)
264 for (i = 0; i < ldev->ports; i++)
265 if (ldev->pf[i].netdev == ndev)
271 static bool __mlx5_lag_is_roce(struct mlx5_lag *ldev)
273 return ldev->mode == MLX5_LAG_MODE_ROCE;
276 static bool __mlx5_lag_is_sriov(struct mlx5_lag *ldev)
278 return ldev->mode == MLX5_LAG_MODE_SRIOV;
281 /* Create a mapping between steering slots and active ports.
282 * As we have ldev->buckets slots per port, first assume the native
283 * mapping should be used.
284 * If some ports are disabled, fill their slots with a mapping that
285 * points to active ports.
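 * For example (illustrative values): with 2 ports and 2 buckets the native
 * map is 1 1 2 2; if port 2 goes down, each of its buckets is remapped to a
 * randomly chosen active port, giving 1 1 1 1.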
287 static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker,
292 int disabled[MLX5_MAX_PORTS] = {};
293 int enabled[MLX5_MAX_PORTS] = {};
294 int disabled_ports_num = 0;
295 int enabled_ports_num = 0;
301 for (i = 0; i < num_ports; i++) {
302 if (tracker->netdev_state[i].tx_enabled &&
303 tracker->netdev_state[i].link_up)
304 enabled[enabled_ports_num++] = i;
306 disabled[disabled_ports_num++] = i;
309 /* Use native mapping by default where each port's buckets
310 * point to the native port: 1 1 1 ... 1 2 2 2 ... 2 3 3 3 ... 3 etc.
312 for (i = 0; i < num_ports; i++)
313 for (j = 0; j < buckets; j++) {
314 idx = i * buckets + j;
315 ports[idx] = MLX5_LAG_EGRESS_PORT_1 + i;
318 /* If all ports are enabled, or all are disabled, keep the native mapping */
319 if (enabled_ports_num == num_ports ||
320 disabled_ports_num == num_ports)
323 /* Go over the disabled ports and for each assign a random active port */
324 for (i = 0; i < disabled_ports_num; i++) {
325 for (j = 0; j < buckets; j++) {
326 get_random_bytes(&rand, 4);
327 ports[disabled[i] * buckets + j] = enabled[rand % enabled_ports_num] + 1;
332 static bool mlx5_lag_has_drop_rule(struct mlx5_lag *ldev)
336 for (i = 0; i < ldev->ports; i++)
337 if (ldev->pf[i].has_drop)
342 static void mlx5_lag_drop_rule_cleanup(struct mlx5_lag *ldev)
346 for (i = 0; i < ldev->ports; i++) {
347 if (!ldev->pf[i].has_drop)
350 mlx5_esw_acl_ingress_vport_drop_rule_destroy(ldev->pf[i].dev->priv.eswitch,
352 ldev->pf[i].has_drop = false;
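/* Re-derive the drop rules from the current tracker state: remove the old
 * rules, then install an ingress ACL drop rule for every port whose tx is
 * disabled or whose link is down.
 */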
356 static void mlx5_lag_drop_rule_setup(struct mlx5_lag *ldev,
357 struct lag_tracker *tracker)
359 u8 disabled_ports[MLX5_MAX_PORTS] = {};
360 struct mlx5_core_dev *dev;
366 /* First delete the current drop rule so there won't be any dropped packets.
369 mlx5_lag_drop_rule_cleanup(ldev);
371 if (!ldev->tracker.has_inactive)
374 mlx5_infer_tx_disabled(tracker, ldev->ports, disabled_ports, &num_disabled);
376 for (i = 0; i < num_disabled; i++) {
377 disabled_index = disabled_ports[i];
378 dev = ldev->pf[disabled_index].dev;
379 err = mlx5_esw_acl_ingress_vport_drop_rule_create(dev->priv.eswitch,
382 ldev->pf[disabled_index].has_drop = true;
385 "Failed to create lag drop rule, error: %d", err);
389 static int _mlx5_modify_lag(struct mlx5_lag *ldev, u8 *ports)
391 struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
393 if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &ldev->mode_flags))
394 return mlx5_lag_port_sel_modify(ldev, ports);
395 return mlx5_cmd_modify_lag(dev0, ldev->ports, ports);
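/* Recompute the slot-to-port mapping from the tracker and, if any slot
 * changed, push the new mapping to firmware: through the port selection
 * flow table in hash mode, or MODIFY_LAG otherwise.
 */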
398 void mlx5_modify_lag(struct mlx5_lag *ldev,
399 struct lag_tracker *tracker)
401 u8 ports[MLX5_MAX_PORTS * MLX5_LAG_MAX_HASH_BUCKETS] = {};
402 struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
408 mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ports);
410 for (i = 0; i < ldev->ports; i++) {
411 for (j = 0; j < ldev->buckets; j++) {
412 idx = i * ldev->buckets + j;
413 if (ports[idx] == ldev->v2p_map[idx])
415 err = _mlx5_modify_lag(ldev, ports);
418 "Failed to modify LAG (%d)\n",
422 memcpy(ldev->v2p_map, ports, sizeof(ports));
424 mlx5_lag_print_mapping(dev0, ldev, tracker,
430 if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
431 !(ldev->mode == MLX5_LAG_MODE_ROCE))
432 mlx5_lag_drop_rule_setup(ldev, tracker);
435 #define MLX5_LAG_ROCE_HASH_PORTS_SUPPORTED 4
436 static int mlx5_lag_set_port_sel_mode_roce(struct mlx5_lag *ldev,
437 unsigned long *flags)
439 struct lag_func *dev0 = &ldev->pf[MLX5_LAG_P1];
441 if (ldev->ports == MLX5_LAG_ROCE_HASH_PORTS_SUPPORTED) {
442 /* Four ports are supported only in hash mode */
443 if (!MLX5_CAP_PORT_SELECTION(dev0->dev, port_select_flow_table))
445 set_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, flags);
447 ldev->buckets = MLX5_LAG_MAX_HASH_BUCKETS;
453 static void mlx5_lag_set_port_sel_mode_offloads(struct mlx5_lag *ldev,
454 struct lag_tracker *tracker,
455 enum mlx5_lag_mode mode,
456 unsigned long *flags)
458 struct lag_func *dev0 = &ldev->pf[MLX5_LAG_P1];
460 if (mode == MLX5_LAG_MODE_MPESW)
463 if (MLX5_CAP_PORT_SELECTION(dev0->dev, port_select_flow_table) &&
464 tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH)
465 set_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, flags);
468 static int mlx5_lag_set_flags(struct mlx5_lag *ldev, enum mlx5_lag_mode mode,
469 struct lag_tracker *tracker, bool shared_fdb,
470 unsigned long *flags)
472 bool roce_lag = mode == MLX5_LAG_MODE_ROCE;
476 set_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, flags);
477 set_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE, flags);
480 if (mode == MLX5_LAG_MODE_MPESW)
481 set_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE, flags);
484 return mlx5_lag_set_port_sel_mode_roce(ldev, flags);
486 mlx5_lag_set_port_sel_mode_offloads(ldev, tracker, mode, flags);
490 char *mlx5_get_str_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags)
492 int port_sel_mode = get_port_sel_mode(mode, flags);
494 switch (port_sel_mode) {
495 case MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY: return "queue_affinity";
496 case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT: return "hash";
497 case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW: return "mpesw";
498 default: return "invalid";
502 static int mlx5_create_lag(struct mlx5_lag *ldev,
503 struct lag_tracker *tracker,
504 enum mlx5_lag_mode mode,
507 bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags);
508 struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
509 struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
510 u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {};
514 mlx5_lag_print_mapping(dev0, ldev, tracker, flags);
515 mlx5_core_info(dev0, "shared_fdb:%d mode:%s\n",
516 shared_fdb, mlx5_get_str_port_sel_mode(mode, flags));
518 err = mlx5_cmd_create_lag(dev0, ldev->v2p_map, mode, flags);
521 "Failed to create LAG (%d)\n",
527 err = mlx5_eswitch_offloads_config_single_fdb(dev0->priv.eswitch,
530 mlx5_core_err(dev0, "Can't enable single FDB mode\n");
532 mlx5_core_info(dev0, "Operation mode is single FDB\n");
536 MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG);
537 if (mlx5_cmd_exec_in(dev0, destroy_lag, in))
539 "Failed to deactivate RoCE LAG; driver restart required\n");
545 int mlx5_activate_lag(struct mlx5_lag *ldev,
546 struct lag_tracker *tracker,
547 enum mlx5_lag_mode mode,
550 bool roce_lag = mode == MLX5_LAG_MODE_ROCE;
551 struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
552 unsigned long flags = 0;
555 err = mlx5_lag_set_flags(ldev, mode, tracker, shared_fdb, &flags);
559 if (mode != MLX5_LAG_MODE_MPESW) {
560 mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ldev->v2p_map);
561 if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) {
562 err = mlx5_lag_port_sel_create(ldev, tracker->hash_type,
566 "Failed to create LAG port selection(%d)\n",
573 err = mlx5_create_lag(ldev, tracker, mode, flags);
575 if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
576 mlx5_lag_port_sel_destroy(ldev);
579 "Failed to activate RoCE LAG\n");
582 "Failed to activate VF LAG\n"
583 "Make sure all VFs are unbound prior to VF LAG activation or deactivation\n");
587 if (tracker && tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
589 mlx5_lag_drop_rule_setup(ldev, tracker);
592 ldev->mode_flags = flags;
596 static int mlx5_deactivate_lag(struct mlx5_lag *ldev)
598 struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
599 struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
600 u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {};
601 bool roce_lag = __mlx5_lag_is_roce(ldev);
602 unsigned long flags = ldev->mode_flags;
605 ldev->mode = MLX5_LAG_MODE_NONE;
606 ldev->mode_flags = 0;
607 mlx5_lag_mp_reset(ldev);
609 if (test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags)) {
610 mlx5_eswitch_offloads_destroy_single_fdb(dev0->priv.eswitch,
612 clear_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags);
615 MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG);
616 err = mlx5_cmd_exec_in(dev0, destroy_lag, in);
620 "Failed to deactivate RoCE LAG; driver restart required\n");
623 "Failed to deactivate VF LAG; driver restart required\n"
624 "Make sure all VFs are unbound prior to VF LAG activation or deactivation\n");
629 if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
630 mlx5_lag_port_sel_destroy(ldev);
631 if (mlx5_lag_has_drop_rule(ldev))
632 mlx5_lag_drop_rule_cleanup(ldev);
637 #define MLX5_LAG_OFFLOADS_SUPPORTED_PORTS 2
638 static bool mlx5_lag_check_prereq(struct mlx5_lag *ldev)
640 #ifdef CONFIG_MLX5_ESWITCH
645 for (i = 0; i < ldev->ports; i++)
646 if (!ldev->pf[i].dev)
649 #ifdef CONFIG_MLX5_ESWITCH
650 mode = mlx5_eswitch_mode(ldev->pf[MLX5_LAG_P1].dev);
652 if (mode != MLX5_ESWITCH_NONE && mode != MLX5_ESWITCH_OFFLOADS)
655 for (i = 0; i < ldev->ports; i++)
656 if (mlx5_eswitch_mode(ldev->pf[i].dev) != mode)
659 if (mode == MLX5_ESWITCH_OFFLOADS && ldev->ports != MLX5_LAG_OFFLOADS_SUPPORTED_PORTS)
662 for (i = 0; i < ldev->ports; i++)
663 if (mlx5_sriov_is_enabled(ldev->pf[i].dev))
669 static void mlx5_lag_add_devices(struct mlx5_lag *ldev)
673 for (i = 0; i < ldev->ports; i++) {
674 if (!ldev->pf[i].dev)
677 if (ldev->pf[i].dev->priv.flags &
678 MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
681 ldev->pf[i].dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
682 mlx5_rescan_drivers_locked(ldev->pf[i].dev);
686 static void mlx5_lag_remove_devices(struct mlx5_lag *ldev)
690 for (i = 0; i < ldev->ports; i++) {
691 if (!ldev->pf[i].dev)
694 if (ldev->pf[i].dev->priv.flags &
695 MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
698 ldev->pf[i].dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
699 mlx5_rescan_drivers_locked(ldev->pf[i].dev);
703 void mlx5_disable_lag(struct mlx5_lag *ldev)
705 bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
706 struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
707 struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
712 roce_lag = __mlx5_lag_is_roce(ldev);
715 mlx5_lag_remove_devices(ldev);
716 } else if (roce_lag) {
717 if (!(dev0->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)) {
718 dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
719 mlx5_rescan_drivers_locked(dev0);
721 for (i = 1; i < ldev->ports; i++)
722 mlx5_nic_vport_disable_roce(ldev->pf[i].dev);
725 err = mlx5_deactivate_lag(ldev);
729 if (shared_fdb || roce_lag)
730 mlx5_lag_add_devices(ldev);
733 if (!(dev0->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV))
734 mlx5_eswitch_reload_reps(dev0->priv.eswitch);
735 if (!(dev1->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV))
736 mlx5_eswitch_reload_reps(dev1->priv.eswitch);
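/* Shared FDB requires both devices in switchdev mode with vport metadata
 * matching enabled, a paired eswitch-offloads devcom component, and firmware
 * support for native FDB selection, a root flow table on the other eswitch
 * and a shared ingress ACL.
 */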
740 bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev)
742 struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
743 struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
745 if (is_mdev_switchdev_mode(dev0) &&
746 is_mdev_switchdev_mode(dev1) &&
747 mlx5_eswitch_vport_match_metadata_enabled(dev0->priv.eswitch) &&
748 mlx5_eswitch_vport_match_metadata_enabled(dev1->priv.eswitch) &&
749 mlx5_devcom_is_paired(dev0->priv.devcom,
750 MLX5_DEVCOM_ESW_OFFLOADS) &&
751 MLX5_CAP_GEN(dev1, lag_native_fdb_selection) &&
752 MLX5_CAP_ESW(dev1, root_ft_on_other_esw) &&
753 MLX5_CAP_ESW(dev0, esw_shared_ingress_acl))
759 static bool mlx5_lag_is_roce_lag(struct mlx5_lag *ldev)
761 bool roce_lag = true;
764 for (i = 0; i < ldev->ports; i++)
765 roce_lag = roce_lag && !mlx5_sriov_is_enabled(ldev->pf[i].dev);
767 #ifdef CONFIG_MLX5_ESWITCH
768 for (i = 0; i < ldev->ports; i++)
769 roce_lag = roce_lag &&
770 ldev->pf[i].dev->priv.eswitch->mode == MLX5_ESWITCH_NONE;
776 static bool mlx5_lag_should_modify_lag(struct mlx5_lag *ldev, bool do_bond)
778 return do_bond && __mlx5_lag_is_active(ldev) &&
779 ldev->mode != MLX5_LAG_MODE_MPESW;
782 static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond)
784 return !do_bond && __mlx5_lag_is_active(ldev) &&
785 ldev->mode != MLX5_LAG_MODE_MPESW;
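/* Central bonding state machine, run from the bond work: activate LAG
 * (RoCE or SR-IOV flavour, optionally with a shared FDB) when the tracker
 * reports a supported bond and the prerequisites hold, re-map tx affinity
 * while the bond is active, and disable LAG when the bond goes away.
 */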
788 static void mlx5_do_bond(struct mlx5_lag *ldev)
790 struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
791 struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
792 struct lag_tracker tracker = { };
793 bool do_bond, roce_lag;
797 if (!mlx5_lag_is_ready(ldev)) {
800 /* VF LAG is in multipath mode, ignore bond change requests */
801 if (mlx5_lag_is_multipath(dev0))
804 tracker = ldev->tracker;
806 do_bond = tracker.is_bonded && mlx5_lag_check_prereq(ldev);
809 if (do_bond && !__mlx5_lag_is_active(ldev)) {
810 bool shared_fdb = mlx5_shared_fdb_supported(ldev);
812 roce_lag = mlx5_lag_is_roce_lag(ldev);
814 if (shared_fdb || roce_lag)
815 mlx5_lag_remove_devices(ldev);
817 err = mlx5_activate_lag(ldev, &tracker,
818 roce_lag ? MLX5_LAG_MODE_ROCE :
822 if (shared_fdb || roce_lag)
823 mlx5_lag_add_devices(ldev);
826 } else if (roce_lag) {
827 dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
828 mlx5_rescan_drivers_locked(dev0);
829 for (i = 1; i < ldev->ports; i++)
830 mlx5_nic_vport_enable_roce(ldev->pf[i].dev);
831 } else if (shared_fdb) {
832 dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
833 mlx5_rescan_drivers_locked(dev0);
835 err = mlx5_eswitch_reload_reps(dev0->priv.eswitch);
837 err = mlx5_eswitch_reload_reps(dev1->priv.eswitch);
840 dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
841 mlx5_rescan_drivers_locked(dev0);
842 mlx5_deactivate_lag(ldev);
843 mlx5_lag_add_devices(ldev);
844 mlx5_eswitch_reload_reps(dev0->priv.eswitch);
845 mlx5_eswitch_reload_reps(dev1->priv.eswitch);
846 mlx5_core_err(dev0, "Failed to enable lag\n");
850 } else if (mlx5_lag_should_modify_lag(ldev, do_bond)) {
851 mlx5_modify_lag(ldev, &tracker);
852 } else if (mlx5_lag_should_disable_lag(ldev, do_bond)) {
853 mlx5_disable_lag(ldev);
857 static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay)
859 queue_delayed_work(ldev->wq, &ldev->bond_work, delay);
862 static void mlx5_do_bond_work(struct work_struct *work)
864 struct delayed_work *delayed_work = to_delayed_work(work);
865 struct mlx5_lag *ldev = container_of(delayed_work, struct mlx5_lag,
869 status = mlx5_dev_list_trylock();
871 mlx5_queue_bond_work(ldev, HZ);
875 mutex_lock(&ldev->lock);
876 if (ldev->mode_changes_in_progress) {
877 mutex_unlock(&ldev->lock);
878 mlx5_dev_list_unlock();
879 mlx5_queue_bond_work(ldev, HZ);
884 mutex_unlock(&ldev->lock);
885 mlx5_dev_list_unlock();
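/* Parse a CHANGEUPPER event: the bond is only considered ours when all of
 * this device's ports, and no others, are enslaved to the same LAG master,
 * and the bond uses an active-backup or hash tx type.
 */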
888 static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev,
889 struct lag_tracker *tracker,
890 struct netdev_notifier_changeupper_info *info)
892 struct net_device *upper = info->upper_dev, *ndev_tmp;
893 struct netdev_lag_upper_info *lag_upper_info = NULL;
894 bool is_bonded, is_in_lag, mode_supported;
895 bool has_inactive = 0;
902 if (!netif_is_lag_master(upper))
906 lag_upper_info = info->upper_info;
908 /* The event may still be of interest if the slave does not belong to
909 * us, but is enslaved to a master which has one or more of our netdevs
910 * as slaves (e.g., if a new slave is added to a master that bonds two
911 * of our netdevs, we should unbond).
914 for_each_netdev_in_bond_rcu(upper, ndev_tmp) {
915 idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp);
917 slave = bond_slave_get_rcu(ndev_tmp);
919 has_inactive |= bond_is_slave_inactive(slave);
920 bond_status |= (1 << idx);
927 /* None of this lagdev's netdevs are slaves of this master. */
928 if (!(bond_status & GENMASK(ldev->ports - 1, 0)))
931 if (lag_upper_info) {
932 tracker->tx_type = lag_upper_info->tx_type;
933 tracker->hash_type = lag_upper_info->hash_type;
936 tracker->has_inactive = has_inactive;
937 /* Determine bonding status:
938 * A device is considered bonded if all of its physical ports are slaves
939 * of the same LAG master, and that master has no other slaves.
941 is_in_lag = num_slaves == ldev->ports &&
942 bond_status == GENMASK(ldev->ports - 1, 0);
944 /* Lag mode must be activebackup or hash. */
945 mode_supported = tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP ||
946 tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH;
948 is_bonded = is_in_lag && mode_supported;
949 if (tracker->is_bonded != is_bonded) {
950 tracker->is_bonded = is_bonded;
957 if (!mlx5_lag_is_ready(ldev))
958 NL_SET_ERR_MSG_MOD(info->info.extack,
959 "Can't activate LAG offload, PF is configured with more than 64 VFs");
960 else if (!mode_supported)
961 NL_SET_ERR_MSG_MOD(info->info.extack,
962 "Can't activate LAG offload, TX type isn't supported");
967 static int mlx5_handle_changelowerstate_event(struct mlx5_lag *ldev,
968 struct lag_tracker *tracker,
969 struct net_device *ndev,
970 struct netdev_notifier_changelowerstate_info *info)
972 struct netdev_lag_lower_state_info *lag_lower_info;
975 if (!netif_is_lag_port(ndev))
978 idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev);
982 /* This information is used to determine virtual to physical port mapping.
985 lag_lower_info = info->lower_state_info;
989 tracker->netdev_state[idx] = *lag_lower_info;
994 static int mlx5_handle_changeinfodata_event(struct mlx5_lag *ldev,
995 struct lag_tracker *tracker,
996 struct net_device *ndev)
998 struct net_device *ndev_tmp;
1000 bool has_inactive = 0;
1003 if (!netif_is_lag_master(ndev))
1007 for_each_netdev_in_bond_rcu(ndev, ndev_tmp) {
1008 idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp);
1012 slave = bond_slave_get_rcu(ndev_tmp);
1014 has_inactive |= bond_is_slave_inactive(slave);
1018 if (tracker->has_inactive == has_inactive)
1021 tracker->has_inactive = has_inactive;
1026 /* this handler is always registered to netdev events */
1027 static int mlx5_lag_netdev_event(struct notifier_block *this,
1028 unsigned long event, void *ptr)
1030 struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
1031 struct lag_tracker tracker;
1032 struct mlx5_lag *ldev;
1035 if (event != NETDEV_CHANGEUPPER &&
1036 event != NETDEV_CHANGELOWERSTATE &&
1037 event != NETDEV_CHANGEINFODATA)
1040 ldev = container_of(this, struct mlx5_lag, nb);
1042 tracker = ldev->tracker;
1045 case NETDEV_CHANGEUPPER:
1046 changed = mlx5_handle_changeupper_event(ldev, &tracker, ptr);
1048 case NETDEV_CHANGELOWERSTATE:
1049 changed = mlx5_handle_changelowerstate_event(ldev, &tracker,
1052 case NETDEV_CHANGEINFODATA:
1053 changed = mlx5_handle_changeinfodata_event(ldev, &tracker, ndev);
1057 ldev->tracker = tracker;
1060 mlx5_queue_bond_work(ldev, 0);
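/* The pf[].netdev slots are updated under lag_lock; the LAG query helpers
 * further down take the same lock when reading them.
 */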
1065 static void mlx5_ldev_add_netdev(struct mlx5_lag *ldev,
1066 struct mlx5_core_dev *dev,
1067 struct net_device *netdev)
1069 unsigned int fn = mlx5_get_dev_index(dev);
1071 if (fn >= ldev->ports)
1074 spin_lock(&lag_lock);
1075 ldev->pf[fn].netdev = netdev;
1076 ldev->tracker.netdev_state[fn].link_up = 0;
1077 ldev->tracker.netdev_state[fn].tx_enabled = 0;
1078 spin_unlock(&lag_lock);
1081 static void mlx5_ldev_remove_netdev(struct mlx5_lag *ldev,
1082 struct net_device *netdev)
1086 spin_lock(&lag_lock);
1087 for (i = 0; i < ldev->ports; i++) {
1088 if (ldev->pf[i].netdev == netdev) {
1089 ldev->pf[i].netdev = NULL;
1093 spin_unlock(&lag_lock);
1096 static void mlx5_ldev_add_mdev(struct mlx5_lag *ldev,
1097 struct mlx5_core_dev *dev)
1099 unsigned int fn = mlx5_get_dev_index(dev);
1101 if (fn >= ldev->ports)
1104 ldev->pf[fn].dev = dev;
1105 dev->priv.lag = ldev;
1108 static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev,
1109 struct mlx5_core_dev *dev)
1113 for (i = 0; i < ldev->ports; i++)
1114 if (ldev->pf[i].dev == dev)
1117 if (i == ldev->ports)
1120 ldev->pf[i].dev = NULL;
1121 dev->priv.lag = NULL;
1124 /* Must be called with intf_mutex held */
1125 static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
1127 struct mlx5_lag *ldev = NULL;
1128 struct mlx5_core_dev *tmp_dev;
1130 tmp_dev = mlx5_get_next_phys_dev_lag(dev);
1132 ldev = tmp_dev->priv.lag;
1135 ldev = mlx5_lag_dev_alloc(dev);
1137 mlx5_core_err(dev, "Failed to alloc lag dev\n");
1140 mlx5_ldev_add_mdev(ldev, dev);
1144 mutex_lock(&ldev->lock);
1145 if (ldev->mode_changes_in_progress) {
1146 mutex_unlock(&ldev->lock);
1149 mlx5_ldev_get(ldev);
1150 mlx5_ldev_add_mdev(ldev, dev);
1151 mutex_unlock(&ldev->lock);
1156 void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev)
1158 struct mlx5_lag *ldev;
1160 ldev = mlx5_lag_dev(dev);
1164 /* mdev is being removed, might as well remove debugfs
1165 * as early as possible.
1167 mlx5_ldev_remove_debugfs(dev->priv.dbg.lag_debugfs);
1169 mutex_lock(&ldev->lock);
1170 if (ldev->mode_changes_in_progress) {
1171 mutex_unlock(&ldev->lock);
1175 mlx5_ldev_remove_mdev(ldev, dev);
1176 mutex_unlock(&ldev->lock);
1177 mlx5_ldev_put(ldev);
1180 void mlx5_lag_add_mdev(struct mlx5_core_dev *dev)
1184 if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
1185 !MLX5_CAP_GEN(dev, lag_master) ||
1186 (MLX5_CAP_GEN(dev, num_lag_ports) > MLX5_MAX_PORTS ||
1187 MLX5_CAP_GEN(dev, num_lag_ports) <= 1))
1191 mlx5_dev_list_lock();
1192 err = __mlx5_lag_dev_add_mdev(dev);
1193 mlx5_dev_list_unlock();
1199 mlx5_ldev_add_debugfs(dev);
1202 void mlx5_lag_remove_netdev(struct mlx5_core_dev *dev,
1203 struct net_device *netdev)
1205 struct mlx5_lag *ldev;
1208 ldev = mlx5_lag_dev(dev);
1212 mutex_lock(&ldev->lock);
1213 mlx5_ldev_remove_netdev(ldev, netdev);
1214 clear_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags);
1216 lag_is_active = __mlx5_lag_is_active(ldev);
1217 mutex_unlock(&ldev->lock);
1220 mlx5_queue_bond_work(ldev, 0);
1223 void mlx5_lag_add_netdev(struct mlx5_core_dev *dev,
1224 struct net_device *netdev)
1226 struct mlx5_lag *ldev;
1229 ldev = mlx5_lag_dev(dev);
1233 mutex_lock(&ldev->lock);
1234 mlx5_ldev_add_netdev(ldev, dev, netdev);
1236 for (i = 0; i < ldev->ports; i++)
1237 if (!ldev->pf[i].dev)
1240 if (i >= ldev->ports)
1241 set_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags);
1242 mutex_unlock(&ldev->lock);
1243 mlx5_queue_bond_work(ldev, 0);
1246 bool mlx5_lag_is_roce(struct mlx5_core_dev *dev)
1248 struct mlx5_lag *ldev;
1251 spin_lock(&lag_lock);
1252 ldev = mlx5_lag_dev(dev);
1253 res = ldev && __mlx5_lag_is_roce(ldev);
1254 spin_unlock(&lag_lock);
1258 EXPORT_SYMBOL(mlx5_lag_is_roce);
1260 bool mlx5_lag_is_active(struct mlx5_core_dev *dev)
1262 struct mlx5_lag *ldev;
1265 spin_lock(&lag_lock);
1266 ldev = mlx5_lag_dev(dev);
1267 res = ldev && __mlx5_lag_is_active(ldev);
1268 spin_unlock(&lag_lock);
1272 EXPORT_SYMBOL(mlx5_lag_is_active);
1274 bool mlx5_lag_is_master(struct mlx5_core_dev *dev)
1276 struct mlx5_lag *ldev;
1279 spin_lock(&lag_lock);
1280 ldev = mlx5_lag_dev(dev);
1281 res = ldev && __mlx5_lag_is_active(ldev) &&
1282 dev == ldev->pf[MLX5_LAG_P1].dev;
1283 spin_unlock(&lag_lock);
1287 EXPORT_SYMBOL(mlx5_lag_is_master);
1289 bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev)
1291 struct mlx5_lag *ldev;
1294 spin_lock(&lag_lock);
1295 ldev = mlx5_lag_dev(dev);
1296 res = ldev && __mlx5_lag_is_sriov(ldev);
1297 spin_unlock(&lag_lock);
1301 EXPORT_SYMBOL(mlx5_lag_is_sriov);
1303 bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev)
1305 struct mlx5_lag *ldev;
1308 spin_lock(&lag_lock);
1309 ldev = mlx5_lag_dev(dev);
1310 res = ldev && __mlx5_lag_is_sriov(ldev) &&
1311 test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
1312 spin_unlock(&lag_lock);
1316 EXPORT_SYMBOL(mlx5_lag_is_shared_fdb);
1318 void mlx5_lag_disable_change(struct mlx5_core_dev *dev)
1320 struct mlx5_lag *ldev;
1322 ldev = mlx5_lag_dev(dev);
1326 mlx5_dev_list_lock();
1327 mutex_lock(&ldev->lock);
1329 ldev->mode_changes_in_progress++;
1330 if (__mlx5_lag_is_active(ldev))
1331 mlx5_disable_lag(ldev);
1333 mutex_unlock(&ldev->lock);
1334 mlx5_dev_list_unlock();
1337 void mlx5_lag_enable_change(struct mlx5_core_dev *dev)
1339 struct mlx5_lag *ldev;
1341 ldev = mlx5_lag_dev(dev);
1345 mutex_lock(&ldev->lock);
1346 ldev->mode_changes_in_progress--;
1347 mutex_unlock(&ldev->lock);
1348 mlx5_queue_bond_work(ldev, 0);
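/* Pick the netdev that currently carries RoCE traffic: in active-backup
 * mode prefer a tx-enabled port, otherwise default to port 1.
 */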
1351 struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev)
1353 struct net_device *ndev = NULL;
1354 struct mlx5_lag *ldev;
1357 spin_lock(&lag_lock);
1358 ldev = mlx5_lag_dev(dev);
1360 if (!(ldev && __mlx5_lag_is_roce(ldev)))
1363 if (ldev->tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
1364 for (i = 0; i < ldev->ports; i++)
1365 if (ldev->tracker.netdev_state[i].tx_enabled)
1366 ndev = ldev->pf[i].netdev;
1368 ndev = ldev->pf[ldev->ports - 1].netdev;
1370 ndev = ldev->pf[MLX5_LAG_P1].netdev;
1376 spin_unlock(&lag_lock);
1380 EXPORT_SYMBOL(mlx5_lag_get_roce_netdev);
1382 u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev,
1383 struct net_device *slave)
1385 struct mlx5_lag *ldev;
1389 spin_lock(&lag_lock);
1390 ldev = mlx5_lag_dev(dev);
1391 if (!(ldev && __mlx5_lag_is_roce(ldev)))
1394 for (i = 0; i < ldev->ports; i++) {
1395 if (ldev->pf[i].netdev == slave) {
1401 port = ldev->v2p_map[port * ldev->buckets];
1404 spin_unlock(&lag_lock);
1407 EXPORT_SYMBOL(mlx5_lag_get_slave_port);
1409 u8 mlx5_lag_get_num_ports(struct mlx5_core_dev *dev)
1411 struct mlx5_lag *ldev;
1413 ldev = mlx5_lag_dev(dev);
1419 EXPORT_SYMBOL(mlx5_lag_get_num_ports);
1421 struct mlx5_core_dev *mlx5_lag_get_peer_mdev(struct mlx5_core_dev *dev)
1423 struct mlx5_core_dev *peer_dev = NULL;
1424 struct mlx5_lag *ldev;
1426 spin_lock(&lag_lock);
1427 ldev = mlx5_lag_dev(dev);
1431 peer_dev = ldev->pf[MLX5_LAG_P1].dev == dev ?
1432 ldev->pf[MLX5_LAG_P2].dev :
1433 ldev->pf[MLX5_LAG_P1].dev;
1436 spin_unlock(&lag_lock);
1439 EXPORT_SYMBOL(mlx5_lag_get_peer_mdev);
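/* Aggregate congestion counters across all LAG member devices; when LAG is
 * not active, only the given device is queried. The counter offsets into
 * the query output are supplied by the caller.
 */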
1441 int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
1446 int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out);
1447 struct mlx5_core_dev **mdev;
1448 struct mlx5_lag *ldev;
1453 out = kvzalloc(outlen, GFP_KERNEL);
1457 mdev = kvzalloc(sizeof(mdev[0]) * MLX5_MAX_PORTS, GFP_KERNEL);
1463 memset(values, 0, sizeof(*values) * num_counters);
1465 spin_lock(&lag_lock);
1466 ldev = mlx5_lag_dev(dev);
1467 if (ldev && __mlx5_lag_is_active(ldev)) {
1468 num_ports = ldev->ports;
1469 for (i = 0; i < ldev->ports; i++)
1470 mdev[i] = ldev->pf[i].dev;
1473 mdev[MLX5_LAG_P1] = dev;
1475 spin_unlock(&lag_lock);
1477 for (i = 0; i < num_ports; ++i) {
1478 u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = {};
1480 MLX5_SET(query_cong_statistics_in, in, opcode,
1481 MLX5_CMD_OP_QUERY_CONG_STATISTICS);
1482 ret = mlx5_cmd_exec_inout(mdev[i], query_cong_statistics, in,
1487 for (j = 0; j < num_counters; ++j)
1488 values[j] += be64_to_cpup((__be64 *)(out + offsets[j]));
1497 EXPORT_SYMBOL(mlx5_lag_query_cong_counters);