1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */
5 #include "diag/fw_tracer.h"
/*
 * Bit numbers used with set_bit()/clear_bit()/test_bit() on
 * mlx5_fw_reset.reset_flags. The opening line of the enclosing declaration
 * is not visible in this listing.
 *
 * RESET_REQUESTED:    set right before the sync-reset poll timer is started
 *                     and cleared right after that timer is stopped.
 * NACK_RESET_REQUEST: when set, the reset-request work handler sends a NACK.
 * PENDING_COMP:       set by mlx5_fw_reset_set_reset_sync() before issuing
 *                     the reset; tested before completing 'done'.
 */
9 MLX5_FW_RESET_FLAGS_RESET_REQUESTED,
10 MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST,
11 MLX5_FW_RESET_FLAGS_PENDING_COMP
/*
 * Per-device firmware reset context, reached through dev->priv.fw_reset.
 * Allocated in mlx5_fw_reset_init() and freed in mlx5_fw_reset_cleanup().
 */
14 struct mlx5_fw_reset {
/* Back-pointer to the owning core device; read by the work/timer handlers. */
15 struct mlx5_core_dev *dev;
/* Workqueue on which every work item below is queued. */
17 struct workqueue_struct *wq;
/* Work items; each is bound to its handler by INIT_WORK() in mlx5_fw_reset_init(). */
18 struct work_struct fw_live_patch_work;
19 struct work_struct reset_request_work;
20 struct work_struct reset_reload_work;
21 struct work_struct reset_now_work;
22 struct work_struct reset_abort_work;
/* Bit field indexed by the MLX5_FW_RESET_FLAGS_* values. */
23 unsigned long reset_flags;
/* Timer whose callback is poll_sync_reset(). */
24 struct timer_list timer;
/* Completed in mlx5_fw_reset_complete_reload(), waited on in mlx5_fw_reset_wait_reset_done(). */
25 struct completion done;
/*
 * Update the MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST bit in
 * fw_reset->reset_flags according to 'enable'.
 *
 * NOTE(review): the lines selecting between clear_bit() and set_bit() are
 * not visible in this listing; presumably enable == true clears the bit and
 * enable == false sets it - confirm against the full source.
 */
29 void mlx5_fw_reset_enable_remote_dev_reset_set(struct mlx5_core_dev *dev, bool enable)
31 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
34 clear_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags);
36 set_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags);
/*
 * Return true when MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST is NOT set in
 * fw_reset->reset_flags, false when it is set.
 */
39 bool mlx5_fw_reset_enable_remote_dev_reset_get(struct mlx5_core_dev *dev)
41 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
43 return !test_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags);
/*
 * Fill an MFRL register image from the arguments and hand it to
 * mlx5_core_access_reg().
 *
 * @reset_level:    stored in the reset_level field.
 * @reset_type_sel: stored in the rst_type_sel field.
 * @sync_resp:      stored in the pci_sync_for_fw_update_resp field.
 * @sync_start:     stored in the pci_sync_for_fw_update_start field.
 *
 * Returns whatever mlx5_core_access_reg() returns. The final argument of
 * that call is 1 here and 0 in mlx5_reg_mfrl_query(); presumably it selects
 * write vs. read - confirm against mlx5_core_access_reg().
 */
46 static int mlx5_reg_mfrl_set(struct mlx5_core_dev *dev, u8 reset_level,
47 u8 reset_type_sel, u8 sync_resp, bool sync_start)
49 u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {};
50 u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {};
52 MLX5_SET(mfrl_reg, in, reset_level, reset_level);
53 MLX5_SET(mfrl_reg, in, rst_type_sel, reset_type_sel);
54 MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_resp, sync_resp);
55 MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_start, sync_start);
57 return mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_MFRL, 0, 1);
/*
 * Access the MFRL register with an all-zero input image and copy the
 * reset_level, reset_type and reset_state fields of the result into the
 * caller's output pointers.
 *
 * NOTE(review): callers in this file pass NULL for outputs they do not want,
 * so the lines not visible here presumably guard each store with a NULL
 * check and return early when mlx5_core_access_reg() fails - confirm.
 */
60 static int mlx5_reg_mfrl_query(struct mlx5_core_dev *dev, u8 *reset_level,
61 u8 *reset_type, u8 *reset_state)
63 u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {};
64 u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {};
67 err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_MFRL, 0, 0);
72 *reset_level = MLX5_GET(mfrl_reg, out, reset_level);
74 *reset_type = MLX5_GET(mfrl_reg, out, reset_type);
76 *reset_state = MLX5_GET(mfrl_reg, out, reset_state);
/*
 * Query the MFRL reset level and reset type for @dev. The reset state is not
 * requested (NULL is passed for it). Returns mlx5_reg_mfrl_query()'s result.
 */
81 int mlx5_fw_reset_query(struct mlx5_core_dev *dev, u8 *reset_level, u8 *reset_type)
83 return mlx5_reg_mfrl_query(dev, reset_level, reset_type, NULL);
/*
 * Query only the MFRL reset_state and attach a matching human-readable
 * message to @extack:
 *   IN_NEGOTIATION / RESET_IN_PROGRESS -> "Sync reset was already triggered"
 *   TIMEOUT                            -> "Sync reset got timeout"
 *   NACK                               -> "One of the hosts disabled reset"
 * plus a generic "Sync reset failed" message.
 *
 * NOTE(review): the return statements, the handling of a failed query and
 * the path that reaches the generic message are not visible in this
 * listing - confirm the returned error codes against the full source.
 */
86 static int mlx5_fw_reset_get_reset_state_err(struct mlx5_core_dev *dev,
87 struct netlink_ext_ack *extack)
91 if (mlx5_reg_mfrl_query(dev, NULL, NULL, &reset_state))
94 switch (reset_state) {
95 case MLX5_MFRL_REG_RESET_STATE_IN_NEGOTIATION:
96 case MLX5_MFRL_REG_RESET_STATE_RESET_IN_PROGRESS:
97 NL_SET_ERR_MSG_MOD(extack, "Sync reset was already triggered");
99 case MLX5_MFRL_REG_RESET_STATE_TIMEOUT:
100 NL_SET_ERR_MSG_MOD(extack, "Sync reset got timeout");
102 case MLX5_MFRL_REG_RESET_STATE_NACK:
103 NL_SET_ERR_MSG_MOD(extack, "One of the hosts disabled reset");
108 NL_SET_ERR_MSG_MOD(extack, "Sync reset failed");
/*
 * Start a sync reset: set MLX5_FW_RESET_FLAGS_PENDING_COMP, then issue an
 * MFRL access with reset_level = MLX5_MFRL_REG_RESET_LEVEL3,
 * rst_type_sel = @reset_type_sel and pci_sync_for_fw_update_start = 1.
 *
 * On the failure path visible below, PENDING_COMP is cleared again. If the
 * error is -EREMOTEIO and the device reports the MCAM reset_state feature,
 * the detailed message from mlx5_fw_reset_get_reset_state_err() is used;
 * otherwise a generic extack message is set and mlx5_cmd_check() provides
 * the return value.
 *
 * NOTE(review): the condition guarding the failure path and the success
 * return are not visible in this listing.
 */
112 int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel,
113 struct netlink_ext_ack *extack)
115 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
116 u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {};
117 u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {};
/* Mark this driver instance as the one expecting the completion. */
120 set_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags);
122 MLX5_SET(mfrl_reg, in, reset_level, MLX5_MFRL_REG_RESET_LEVEL3);
123 MLX5_SET(mfrl_reg, in, rst_type_sel, reset_type_sel);
124 MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_start, 1);
125 err = mlx5_access_reg(dev, in, sizeof(in), out, sizeof(out),
126 MLX5_REG_MFRL, 0, 1, false);
130 clear_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags);
131 if (err == -EREMOTEIO && MLX5_CAP_MCAM_FEATURE(dev, reset_state))
132 return mlx5_fw_reset_get_reset_state_err(dev, extack);
134 NL_SET_ERR_MSG_MOD(extack, "Sync reset command failed");
135 return mlx5_cmd_check(dev, err, in, out);
/*
 * Issue an MFRL set with reset level MLX5_MFRL_REG_RESET_LEVEL0 and all
 * other fields zero/false. Returns mlx5_reg_mfrl_set()'s result.
 */
138 int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev)
140 return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL0, 0, 0, false);
/*
 * Final step of a reset flow. When PENDING_COMP is set, the waiter on
 * fw_reset->done is woken. The devlink call below reports that the
 * DRIVER_REINIT and FW_ACTIVATE reload actions were performed remotely.
 *
 * NOTE(review): the lines between the two calls are not visible here;
 * presumably the devlink notification belongs to the branch where
 * PENDING_COMP is not set - confirm against the full source.
 */
143 static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
145 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
147 /* if this is the driver that initiated the fw reset, devlink completed the reload */
148 if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
149 complete(&fw_reset->done);
152 devlink_remote_reload_actions_performed(priv_to_devlink(dev), 0,
153 BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
154 BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));
/*
 * Handler bound to fw_reset->reset_reload_work (queued from
 * poll_sync_reset()). Puts the device into error state, unloads it, waits
 * for PCI to come back via mlx5_health_wait_pci_up(), and finishes with
 * mlx5_fw_reset_complete_reload().
 *
 * NOTE(review): the check on 'err' that guards the error message is not
 * visible in this listing.
 */
158 static void mlx5_sync_reset_reload_work(struct work_struct *work)
160 struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
162 struct mlx5_core_dev *dev = fw_reset->dev;
165 mlx5_enter_error_state(dev, true);
166 mlx5_unload_one(dev);
167 err = mlx5_health_wait_pci_up(dev);
169 mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n");
171 mlx5_fw_reset_complete_reload(dev);
/* Stop the sync-reset poll timer with del_timer_sync(). */
174 static void mlx5_stop_sync_reset_poll(struct mlx5_core_dev *dev)
176 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
178 del_timer_sync(&fw_reset->timer);
/*
 * Leave the "reset requested" state: stop the sync-reset poll timer and
 * clear MLX5_FW_RESET_FLAGS_RESET_REQUESTED.
 *
 * NOTE(review): the line that uses @poll_health is not visible here;
 * presumably mlx5_start_health_poll() is only called when it is true -
 * confirm against the full source.
 */
181 static void mlx5_sync_reset_clear_reset_requested(struct mlx5_core_dev *dev, bool poll_health)
183 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
185 mlx5_stop_sync_reset_poll(dev);
186 clear_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags);
188 mlx5_start_health_poll(dev);
/* Poll period in jiffies: one tenth of HZ. */
191 #define MLX5_RESET_POLL_INTERVAL (HZ / 10)
/*
 * Timer callback for fw_reset->timer. Tests RESET_REQUESTED, samples the
 * fatal sensors with mlx5_health_check_fatal_sensors(), and on the
 * "Got Device Reset" path leaves the reset-requested state (without
 * restarting the health poll) and queues reset_reload_work. The last visible
 * statement re-arms the timer one poll interval ahead, rounded by
 * round_jiffies().
 *
 * NOTE(review): the early return after the flag test and the condition on
 * 'fatal_error' are not visible in this listing.
 */
192 static void poll_sync_reset(struct timer_list *t)
194 struct mlx5_fw_reset *fw_reset = from_timer(fw_reset, t, timer);
195 struct mlx5_core_dev *dev = fw_reset->dev;
198 if (!test_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags))
201 fatal_error = mlx5_health_check_fatal_sensors(dev);
204 mlx5_core_warn(dev, "Got Device Reset\n");
205 mlx5_sync_reset_clear_reset_requested(dev, false);
206 queue_work(fw_reset->wq, &fw_reset->reset_reload_work);
210 mod_timer(&fw_reset->timer, round_jiffies(jiffies + MLX5_RESET_POLL_INTERVAL));
/*
 * Set up fw_reset->timer with poll_sync_reset() as its callback and arm it
 * to expire one MLX5_RESET_POLL_INTERVAL from now (rounded by
 * round_jiffies()).
 */
213 static void mlx5_start_sync_reset_poll(struct mlx5_core_dev *dev)
215 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
217 timer_setup(&fw_reset->timer, poll_sync_reset, 0);
218 fw_reset->timer.expires = round_jiffies(jiffies + MLX5_RESET_POLL_INTERVAL);
219 add_timer(&fw_reset->timer);
/*
 * Send the sync-reset ACK: MFRL set with reset level 3 and
 * pci_sync_for_fw_update_resp = 1.
 */
222 static int mlx5_fw_reset_set_reset_sync_ack(struct mlx5_core_dev *dev)
224 return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL3, 0, 1, false);
/*
 * Send the sync-reset NACK: MFRL set with reset level 3 and
 * pci_sync_for_fw_update_resp = 2.
 */
227 static int mlx5_fw_reset_set_reset_sync_nack(struct mlx5_core_dev *dev)
229 return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL3, 0, 2, false);
/*
 * Enter the "reset requested" state: stop the regular health poll, set
 * MLX5_FW_RESET_FLAGS_RESET_REQUESTED, and start the sync-reset poll timer.
 */
232 static void mlx5_sync_reset_set_reset_requested(struct mlx5_core_dev *dev)
234 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
236 mlx5_stop_health_poll(dev, true);
237 set_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags);
238 mlx5_start_sync_reset_poll(dev);
/*
 * Handler bound to fw_reset->fw_live_patch_work. Logs the firmware version
 * (major.minor.sub) and reloads the FW tracer, logging an error when
 * mlx5_fw_tracer_reload() returns non-zero.
 */
241 static void mlx5_fw_live_patch_event(struct work_struct *work)
243 struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
245 struct mlx5_core_dev *dev = fw_reset->dev;
247 mlx5_core_info(dev, "Live patch updated firmware version: %d.%d.%d\n", fw_rev_maj(dev),
248 fw_rev_min(dev), fw_rev_sub(dev));
250 if (mlx5_fw_tracer_reload(dev->tracer))
251 mlx5_core_err(dev, "Failed to reload FW tracer\n");
/*
 * Handler bound to fw_reset->reset_request_work. When
 * MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST is set, a NACK is sent and the
 * outcome is logged. The ACK path enters the reset-requested state first,
 * then sends the ACK and logs the result.
 *
 * NOTE(review): the lines separating the NACK path from the ACK path and the
 * check on 'err' that picks between the two ACK messages are not visible in
 * this listing.
 * NOTE(review): the NACK format string has no trailing "\n", unlike the
 * other messages in this function.
 */
254 static void mlx5_sync_reset_request_event(struct work_struct *work)
256 struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
258 struct mlx5_core_dev *dev = fw_reset->dev;
261 if (test_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags)) {
262 err = mlx5_fw_reset_set_reset_sync_nack(dev);
263 mlx5_core_warn(dev, "PCI Sync FW Update Reset Nack %s",
264 err ? "Failed" : "Sent");
267 mlx5_sync_reset_set_reset_requested(dev);
268 err = mlx5_fw_reset_set_reset_sync_ack(dev);
270 mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack Failed. Error code: %d\n", err);
272 mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack. Device reset is expected.\n");
/*
 * Toggle the PCIe link on the bridge above @dev and wait for it to come
 * back.
 *
 * Visible steps:
 *  1. Read this device's PCI device ID and compare it with the ID of every
 *     device on the same bus.
 *  2. Locate the bridge's PCI Express capability.
 *  3. Save config state and lock config access for every device on the bus.
 *  4. Set PCI_EXP_LNKCTL_LD in the bridge's Link Control register, then
 *     clear it again.
 *  5. If the bridge advertises PCI_EXP_LNKCAP_DLLLARC, poll Link Status for
 *     PCI_EXP_LNKSTA_DLLLA until the PCI_TOGGLE timeout expires.
 *  6. Unlock config access and restore config state for every device.
 *
 * NOTE(review): the declarations of 'cap', 'err' and 'reg32', the error
 * returns, the delays and the loop opening are not among the lines visible
 * in this listing; the return value is therefore not documented here.
 */
275 static int mlx5_pci_link_toggle(struct mlx5_core_dev *dev)
277 struct pci_bus *bridge_bus = dev->pdev->bus;
278 struct pci_dev *bridge = bridge_bus->self;
279 u16 reg16, dev_id, sdev_id;
280 unsigned long timeout;
281 struct pci_dev *sdev;
285 /* Check that all functions under the pci bridge are PFs of
286 * this device otherwise fail this function.
287 */
288 err = pci_read_config_word(dev->pdev, PCI_DEVICE_ID, &dev_id);
291 list_for_each_entry(sdev, &bridge_bus->devices, bus_list) {
292 err = pci_read_config_word(sdev, PCI_DEVICE_ID, &sdev_id);
295 if (sdev_id != dev_id)
299 cap = pci_find_capability(bridge, PCI_CAP_ID_EXP);
/* Save state and block config access on every function before the toggle. */
303 list_for_each_entry(sdev, &bridge_bus->devices, bus_list) {
304 pci_save_state(sdev);
305 pci_cfg_access_lock(sdev);
307 /* PCI link toggle */
308 err = pci_read_config_word(bridge, cap + PCI_EXP_LNKCTL, &reg16);
/* Set the LD bit in Link Control and write it back ... */
311 reg16 |= PCI_EXP_LNKCTL_LD;
312 err = pci_write_config_word(bridge, cap + PCI_EXP_LNKCTL, reg16);
/* ... then clear it and write it back again. */
316 reg16 &= ~PCI_EXP_LNKCTL_LD;
317 err = pci_write_config_word(bridge, cap + PCI_EXP_LNKCTL, reg16);
/* Link-up polling below requires the DLLLARC bit in Link Capabilities. */
322 err = pci_read_config_dword(bridge, cap + PCI_EXP_LNKCAP, &reg32);
325 if (!(reg32 & PCI_EXP_LNKCAP_DLLLARC)) {
326 mlx5_core_warn(dev, "No PCI link reporting capability (0x%08x)\n", reg32);
/* Poll Link Status for DLLLA until the PCI_TOGGLE timeout elapses. */
331 timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, PCI_TOGGLE));
333 err = pci_read_config_word(bridge, cap + PCI_EXP_LNKSTA, &reg16);
336 if (reg16 & PCI_EXP_LNKSTA_DLLLA)
339 } while (!time_after(jiffies, timeout));
341 if (reg16 & PCI_EXP_LNKSTA_DLLLA) {
342 mlx5_core_info(dev, "PCI Link up\n");
344 mlx5_core_err(dev, "PCI link not ready (0x%04x) after %llu ms\n",
345 reg16, mlx5_tout_ms(dev, PCI_TOGGLE));
/* Undo the save/lock done before the toggle. */
350 list_for_each_entry(sdev, &bridge_bus->devices, bus_list) {
351 pci_cfg_access_unlock(sdev);
352 pci_restore_state(sdev);
/*
 * Handler bound to fw_reset->reset_now_work. Leaves the reset-requested
 * state (without restarting the health poll), performs a fast HCA teardown,
 * toggles the PCI link, puts the device into error state, unloads it and
 * finally calls mlx5_fw_reset_complete_reload().
 *
 * NOTE(review): the checks on 'err' after the teardown and the link toggle,
 * and whatever they skip on failure, are not visible in this listing; the
 * warning texts say "no reset done" for both failures.
 */
358 static void mlx5_sync_reset_now_event(struct work_struct *work)
360 struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
362 struct mlx5_core_dev *dev = fw_reset->dev;
365 mlx5_sync_reset_clear_reset_requested(dev, false);
367 mlx5_core_warn(dev, "Sync Reset now. Device is going to reset.\n");
369 err = mlx5_cmd_fast_teardown_hca(dev);
371 mlx5_core_warn(dev, "Fast teardown failed, no reset done, err %d\n", err);
375 err = mlx5_pci_link_toggle(dev);
377 mlx5_core_warn(dev, "mlx5_pci_link_toggle failed, no reset done, err %d\n", err);
381 mlx5_enter_error_state(dev, true);
382 mlx5_unload_one(dev);
385 mlx5_fw_reset_complete_reload(dev);
/*
 * Handler bound to fw_reset->reset_abort_work. Tests RESET_REQUESTED, then
 * leaves the reset-requested state with poll_health = true and logs the
 * abort.
 *
 * NOTE(review): the statement controlled by the flag test is not visible
 * here; presumably it returns early when no reset was requested - confirm.
 */
388 static void mlx5_sync_reset_abort_event(struct work_struct *work)
390 struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
392 struct mlx5_core_dev *dev = fw_reset->dev;
394 if (!test_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags))
397 mlx5_sync_reset_clear_reset_requested(dev, true);
398 mlx5_core_warn(dev, "PCI Sync FW Update Reset Aborted.\n");
/*
 * Dispatch a sync-FW-update event. The state is taken from
 * eqe->data.sync_fw_update.sync_rst_state masked with SYNC_RST_STATE_MASK:
 *   RESET_REQUEST -> queue reset_request_work
 *   RESET_NOW     -> queue reset_now_work
 *   RESET_ABORT   -> queue reset_abort_work
 * All work is queued on fw_reset->wq.
 */
401 static void mlx5_sync_reset_events_handle(struct mlx5_fw_reset *fw_reset, struct mlx5_eqe *eqe)
403 struct mlx5_eqe_sync_fw_update *sync_fw_update_eqe;
404 u8 sync_event_rst_type;
406 sync_fw_update_eqe = &eqe->data.sync_fw_update;
407 sync_event_rst_type = sync_fw_update_eqe->sync_rst_state & SYNC_RST_STATE_MASK;
408 switch (sync_event_rst_type) {
409 case MLX5_SYNC_RST_STATE_RESET_REQUEST:
410 queue_work(fw_reset->wq, &fw_reset->reset_request_work);
412 case MLX5_SYNC_RST_STATE_RESET_NOW:
413 queue_work(fw_reset->wq, &fw_reset->reset_now_work);
415 case MLX5_SYNC_RST_STATE_RESET_ABORT:
416 queue_work(fw_reset->wq, &fw_reset->reset_abort_work);
/*
 * Notifier callback registered in mlx5_fw_reset_events_start(). @data is
 * the event queue entry; its sub_type selects the action:
 *   FW_LIVE_PATCH_EVENT           -> queue fw_live_patch_work
 *   PCI_SYNC_FOR_FW_UPDATE_EVENT  -> mlx5_sync_reset_events_handle()
 *
 * NOTE(review): the default case and the return value are not visible in
 * this listing.
 */
421 static int fw_reset_event_notifier(struct notifier_block *nb, unsigned long action, void *data)
423 struct mlx5_fw_reset *fw_reset = mlx5_nb_cof(nb, struct mlx5_fw_reset, nb);
424 struct mlx5_eqe *eqe = data;
426 switch (eqe->sub_type) {
427 case MLX5_GENERAL_SUBTYPE_FW_LIVE_PATCH_EVENT:
428 queue_work(fw_reset->wq, &fw_reset->fw_live_patch_work);
430 case MLX5_GENERAL_SUBTYPE_PCI_SYNC_FOR_FW_UPDATE_EVENT:
431 mlx5_sync_reset_events_handle(fw_reset, eqe);
/*
 * Wait for fw_reset->done, for at most the PCI_SYNC_UPDATE timeout
 * (mlx5_tout_ms() value in milliseconds, converted to jiffies). A warning
 * reporting the timeout in seconds is logged when the wait times out.
 * MLX5_FW_RESET_FLAGS_PENDING_COMP is cleared before returning.
 *
 * NOTE(review): the return statements are not visible in this listing, so
 * the returned error codes are not documented here.
 */
440 int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev)
442 unsigned long pci_sync_update_timeout = mlx5_tout_ms(dev, PCI_SYNC_UPDATE);
443 unsigned long timeout = msecs_to_jiffies(pci_sync_update_timeout);
444 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
447 if (!wait_for_completion_timeout(&fw_reset->done, timeout)) {
448 mlx5_core_warn(dev, "FW sync reset timeout after %lu seconds\n",
449 pci_sync_update_timeout / 1000);
455 clear_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags);
/*
 * Initialise fw_reset->nb with fw_reset_event_notifier() for GENERAL_EVENT
 * and register it with the device's EQ notifier chain.
 */
459 void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev)
461 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
463 MLX5_NB_INIT(&fw_reset->nb, fw_reset_event_notifier, GENERAL_EVENT);
464 mlx5_eq_notifier_register(dev, &fw_reset->nb);
/* Unregister the notifier that mlx5_fw_reset_events_start() registered. */
467 void mlx5_fw_reset_events_stop(struct mlx5_core_dev *dev)
469 mlx5_eq_notifier_unregister(dev, &dev->priv.fw_reset->nb);
/*
 * Allocate the zero-initialised firmware reset context, create its
 * single-threaded workqueue "mlx5_fw_reset_events", publish it as
 * dev->priv.fw_reset, bind each work item to its handler and initialise the
 * 'done' completion.
 *
 * NOTE(review): the allocation/workqueue failure handling, the assignment of
 * fw_reset->dev and the return statements are not visible in this listing.
 */
472 int mlx5_fw_reset_init(struct mlx5_core_dev *dev)
474 struct mlx5_fw_reset *fw_reset = kzalloc(sizeof(*fw_reset), GFP_KERNEL);
478 fw_reset->wq = create_singlethread_workqueue("mlx5_fw_reset_events");
485 dev->priv.fw_reset = fw_reset;
487 INIT_WORK(&fw_reset->fw_live_patch_work, mlx5_fw_live_patch_event);
488 INIT_WORK(&fw_reset->reset_request_work, mlx5_sync_reset_request_event);
489 INIT_WORK(&fw_reset->reset_reload_work, mlx5_sync_reset_reload_work);
490 INIT_WORK(&fw_reset->reset_now_work, mlx5_sync_reset_now_event);
491 INIT_WORK(&fw_reset->reset_abort_work, mlx5_sync_reset_abort_event);
493 init_completion(&fw_reset->done);
/*
 * Tear down what mlx5_fw_reset_init() created: destroy the workqueue and
 * free the firmware reset context. dev->priv.fw_reset is not cleared by the
 * lines visible here.
 */
497 void mlx5_fw_reset_cleanup(struct mlx5_core_dev *dev)
499 struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
501 destroy_workqueue(fw_reset->wq);
502 kfree(dev->priv.fw_reset);