Merge branch 'mlx5-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mellanox...
[linux-2.6-microblaze.git] / drivers / net / ethernet / mellanox / mlx5 / core / fw_reset.c
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2020, Mellanox Technologies inc.  All rights reserved. */
3
4 #include "fw_reset.h"
5 #include "diag/fw_tracer.h"
6 #include "lib/tout.h"
7
/* Bit numbers used with set_bit()/clear_bit()/test_bit() on
 * mlx5_fw_reset::reset_flags.
 */
enum {
	/* A sync reset request was accepted and the reset poll timer is armed */
	MLX5_FW_RESET_FLAGS_RESET_REQUESTED = 0,
	/* Incoming sync reset requests are answered with a NACK */
	MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST = 1,
	/* This driver started the sync reset and waits on the completion */
	MLX5_FW_RESET_FLAGS_PENDING_COMP = 2,
};
13
14 struct mlx5_fw_reset {
15         struct mlx5_core_dev *dev;
16         struct mlx5_nb nb;
17         struct workqueue_struct *wq;
18         struct work_struct fw_live_patch_work;
19         struct work_struct reset_request_work;
20         struct work_struct reset_reload_work;
21         struct work_struct reset_now_work;
22         struct work_struct reset_abort_work;
23         unsigned long reset_flags;
24         struct timer_list timer;
25         struct completion done;
26         int ret;
27 };
28
29 void mlx5_fw_reset_enable_remote_dev_reset_set(struct mlx5_core_dev *dev, bool enable)
30 {
31         struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
32
33         if (enable)
34                 clear_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags);
35         else
36                 set_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags);
37 }
38
39 bool mlx5_fw_reset_enable_remote_dev_reset_get(struct mlx5_core_dev *dev)
40 {
41         struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
42
43         return !test_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags);
44 }
45
46 static int mlx5_reg_mfrl_set(struct mlx5_core_dev *dev, u8 reset_level,
47                              u8 reset_type_sel, u8 sync_resp, bool sync_start)
48 {
49         u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {};
50         u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {};
51
52         MLX5_SET(mfrl_reg, in, reset_level, reset_level);
53         MLX5_SET(mfrl_reg, in, rst_type_sel, reset_type_sel);
54         MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_resp, sync_resp);
55         MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_start, sync_start);
56
57         return mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_MFRL, 0, 1);
58 }
59
60 static int mlx5_reg_mfrl_query(struct mlx5_core_dev *dev, u8 *reset_level,
61                                u8 *reset_type, u8 *reset_state)
62 {
63         u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {};
64         u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {};
65         int err;
66
67         err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_MFRL, 0, 0);
68         if (err)
69                 return err;
70
71         if (reset_level)
72                 *reset_level = MLX5_GET(mfrl_reg, out, reset_level);
73         if (reset_type)
74                 *reset_type = MLX5_GET(mfrl_reg, out, reset_type);
75         if (reset_state)
76                 *reset_state = MLX5_GET(mfrl_reg, out, reset_state);
77
78         return 0;
79 }
80
81 int mlx5_fw_reset_query(struct mlx5_core_dev *dev, u8 *reset_level, u8 *reset_type)
82 {
83         return mlx5_reg_mfrl_query(dev, reset_level, reset_type, NULL);
84 }
85
86 static int mlx5_fw_reset_get_reset_state_err(struct mlx5_core_dev *dev,
87                                              struct netlink_ext_ack *extack)
88 {
89         u8 reset_state;
90
91         if (mlx5_reg_mfrl_query(dev, NULL, NULL, &reset_state))
92                 goto out;
93
94         switch (reset_state) {
95         case MLX5_MFRL_REG_RESET_STATE_IN_NEGOTIATION:
96         case MLX5_MFRL_REG_RESET_STATE_RESET_IN_PROGRESS:
97                 NL_SET_ERR_MSG_MOD(extack, "Sync reset was already triggered");
98                 return -EBUSY;
99         case MLX5_MFRL_REG_RESET_STATE_TIMEOUT:
100                 NL_SET_ERR_MSG_MOD(extack, "Sync reset got timeout");
101                 return -ETIMEDOUT;
102         case MLX5_MFRL_REG_RESET_STATE_NACK:
103                 NL_SET_ERR_MSG_MOD(extack, "One of the hosts disabled reset");
104                 return -EPERM;
105         }
106
107 out:
108         NL_SET_ERR_MSG_MOD(extack, "Sync reset failed");
109         return -EIO;
110 }
111
112 int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel,
113                                  struct netlink_ext_ack *extack)
114 {
115         struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
116         u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {};
117         u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {};
118         int err;
119
120         set_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags);
121
122         MLX5_SET(mfrl_reg, in, reset_level, MLX5_MFRL_REG_RESET_LEVEL3);
123         MLX5_SET(mfrl_reg, in, rst_type_sel, reset_type_sel);
124         MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_start, 1);
125         err = mlx5_access_reg(dev, in, sizeof(in), out, sizeof(out),
126                               MLX5_REG_MFRL, 0, 1, false);
127         if (!err)
128                 return 0;
129
130         clear_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags);
131         if (err == -EREMOTEIO && MLX5_CAP_MCAM_FEATURE(dev, reset_state))
132                 return mlx5_fw_reset_get_reset_state_err(dev, extack);
133
134         NL_SET_ERR_MSG_MOD(extack, "Sync reset command failed");
135         return mlx5_cmd_check(dev, err, in, out);
136 }
137
138 int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev)
139 {
140         return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL0, 0, 0, false);
141 }
142
143 static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
144 {
145         struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
146
147         /* if this is the driver that initiated the fw reset, devlink completed the reload */
148         if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
149                 complete(&fw_reset->done);
150         } else {
151                 mlx5_load_one(dev);
152                 devlink_remote_reload_actions_performed(priv_to_devlink(dev), 0,
153                                                         BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
154                                                         BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));
155         }
156 }
157
158 static void mlx5_sync_reset_reload_work(struct work_struct *work)
159 {
160         struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
161                                                       reset_reload_work);
162         struct mlx5_core_dev *dev = fw_reset->dev;
163         int err;
164
165         mlx5_enter_error_state(dev, true);
166         mlx5_unload_one(dev);
167         err = mlx5_health_wait_pci_up(dev);
168         if (err)
169                 mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n");
170         fw_reset->ret = err;
171         mlx5_fw_reset_complete_reload(dev);
172 }
173
174 static void mlx5_stop_sync_reset_poll(struct mlx5_core_dev *dev)
175 {
176         struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
177
178         del_timer_sync(&fw_reset->timer);
179 }
180
181 static void mlx5_sync_reset_clear_reset_requested(struct mlx5_core_dev *dev, bool poll_health)
182 {
183         struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
184
185         mlx5_stop_sync_reset_poll(dev);
186         clear_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags);
187         if (poll_health)
188                 mlx5_start_health_poll(dev);
189 }
190
191 #define MLX5_RESET_POLL_INTERVAL        (HZ / 10)
192 static void poll_sync_reset(struct timer_list *t)
193 {
194         struct mlx5_fw_reset *fw_reset = from_timer(fw_reset, t, timer);
195         struct mlx5_core_dev *dev = fw_reset->dev;
196         u32 fatal_error;
197
198         if (!test_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags))
199                 return;
200
201         fatal_error = mlx5_health_check_fatal_sensors(dev);
202
203         if (fatal_error) {
204                 mlx5_core_warn(dev, "Got Device Reset\n");
205                 mlx5_sync_reset_clear_reset_requested(dev, false);
206                 queue_work(fw_reset->wq, &fw_reset->reset_reload_work);
207                 return;
208         }
209
210         mod_timer(&fw_reset->timer, round_jiffies(jiffies + MLX5_RESET_POLL_INTERVAL));
211 }
212
213 static void mlx5_start_sync_reset_poll(struct mlx5_core_dev *dev)
214 {
215         struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
216
217         timer_setup(&fw_reset->timer, poll_sync_reset, 0);
218         fw_reset->timer.expires = round_jiffies(jiffies + MLX5_RESET_POLL_INTERVAL);
219         add_timer(&fw_reset->timer);
220 }
221
222 static int mlx5_fw_reset_set_reset_sync_ack(struct mlx5_core_dev *dev)
223 {
224         return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL3, 0, 1, false);
225 }
226
227 static int mlx5_fw_reset_set_reset_sync_nack(struct mlx5_core_dev *dev)
228 {
229         return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL3, 0, 2, false);
230 }
231
232 static void mlx5_sync_reset_set_reset_requested(struct mlx5_core_dev *dev)
233 {
234         struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
235
236         mlx5_stop_health_poll(dev, true);
237         set_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags);
238         mlx5_start_sync_reset_poll(dev);
239 }
240
241 static void mlx5_fw_live_patch_event(struct work_struct *work)
242 {
243         struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
244                                                       fw_live_patch_work);
245         struct mlx5_core_dev *dev = fw_reset->dev;
246
247         mlx5_core_info(dev, "Live patch updated firmware version: %d.%d.%d\n", fw_rev_maj(dev),
248                        fw_rev_min(dev), fw_rev_sub(dev));
249
250         if (mlx5_fw_tracer_reload(dev->tracer))
251                 mlx5_core_err(dev, "Failed to reload FW tracer\n");
252 }
253
254 static void mlx5_sync_reset_request_event(struct work_struct *work)
255 {
256         struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
257                                                       reset_request_work);
258         struct mlx5_core_dev *dev = fw_reset->dev;
259         int err;
260
261         if (test_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags)) {
262                 err = mlx5_fw_reset_set_reset_sync_nack(dev);
263                 mlx5_core_warn(dev, "PCI Sync FW Update Reset Nack %s",
264                                err ? "Failed" : "Sent");
265                 return;
266         }
267         mlx5_sync_reset_set_reset_requested(dev);
268         err = mlx5_fw_reset_set_reset_sync_ack(dev);
269         if (err)
270                 mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack Failed. Error code: %d\n", err);
271         else
272                 mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack. Device reset is expected.\n");
273 }
274
275 static int mlx5_pci_link_toggle(struct mlx5_core_dev *dev)
276 {
277         struct pci_bus *bridge_bus = dev->pdev->bus;
278         struct pci_dev *bridge = bridge_bus->self;
279         u16 reg16, dev_id, sdev_id;
280         unsigned long timeout;
281         struct pci_dev *sdev;
282         int cap, err;
283         u32 reg32;
284
285         /* Check that all functions under the pci bridge are PFs of
286          * this device otherwise fail this function.
287          */
288         err = pci_read_config_word(dev->pdev, PCI_DEVICE_ID, &dev_id);
289         if (err)
290                 return err;
291         list_for_each_entry(sdev, &bridge_bus->devices, bus_list) {
292                 err = pci_read_config_word(sdev, PCI_DEVICE_ID, &sdev_id);
293                 if (err)
294                         return err;
295                 if (sdev_id != dev_id)
296                         return -EPERM;
297         }
298
299         cap = pci_find_capability(bridge, PCI_CAP_ID_EXP);
300         if (!cap)
301                 return -EOPNOTSUPP;
302
303         list_for_each_entry(sdev, &bridge_bus->devices, bus_list) {
304                 pci_save_state(sdev);
305                 pci_cfg_access_lock(sdev);
306         }
307         /* PCI link toggle */
308         err = pci_read_config_word(bridge, cap + PCI_EXP_LNKCTL, &reg16);
309         if (err)
310                 return err;
311         reg16 |= PCI_EXP_LNKCTL_LD;
312         err = pci_write_config_word(bridge, cap + PCI_EXP_LNKCTL, reg16);
313         if (err)
314                 return err;
315         msleep(500);
316         reg16 &= ~PCI_EXP_LNKCTL_LD;
317         err = pci_write_config_word(bridge, cap + PCI_EXP_LNKCTL, reg16);
318         if (err)
319                 return err;
320
321         /* Check link */
322         err = pci_read_config_dword(bridge, cap + PCI_EXP_LNKCAP, &reg32);
323         if (err)
324                 return err;
325         if (!(reg32 & PCI_EXP_LNKCAP_DLLLARC)) {
326                 mlx5_core_warn(dev, "No PCI link reporting capability (0x%08x)\n", reg32);
327                 msleep(1000);
328                 goto restore;
329         }
330
331         timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, PCI_TOGGLE));
332         do {
333                 err = pci_read_config_word(bridge, cap + PCI_EXP_LNKSTA, &reg16);
334                 if (err)
335                         return err;
336                 if (reg16 & PCI_EXP_LNKSTA_DLLLA)
337                         break;
338                 msleep(20);
339         } while (!time_after(jiffies, timeout));
340
341         if (reg16 & PCI_EXP_LNKSTA_DLLLA) {
342                 mlx5_core_info(dev, "PCI Link up\n");
343         } else {
344                 mlx5_core_err(dev, "PCI link not ready (0x%04x) after %llu ms\n",
345                               reg16, mlx5_tout_ms(dev, PCI_TOGGLE));
346                 err = -ETIMEDOUT;
347         }
348
349 restore:
350         list_for_each_entry(sdev, &bridge_bus->devices, bus_list) {
351                 pci_cfg_access_unlock(sdev);
352                 pci_restore_state(sdev);
353         }
354
355         return err;
356 }
357
358 static void mlx5_sync_reset_now_event(struct work_struct *work)
359 {
360         struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
361                                                       reset_now_work);
362         struct mlx5_core_dev *dev = fw_reset->dev;
363         int err;
364
365         mlx5_sync_reset_clear_reset_requested(dev, false);
366
367         mlx5_core_warn(dev, "Sync Reset now. Device is going to reset.\n");
368
369         err = mlx5_cmd_fast_teardown_hca(dev);
370         if (err) {
371                 mlx5_core_warn(dev, "Fast teardown failed, no reset done, err %d\n", err);
372                 goto done;
373         }
374
375         err = mlx5_pci_link_toggle(dev);
376         if (err) {
377                 mlx5_core_warn(dev, "mlx5_pci_link_toggle failed, no reset done, err %d\n", err);
378                 goto done;
379         }
380
381         mlx5_enter_error_state(dev, true);
382         mlx5_unload_one(dev);
383 done:
384         fw_reset->ret = err;
385         mlx5_fw_reset_complete_reload(dev);
386 }
387
388 static void mlx5_sync_reset_abort_event(struct work_struct *work)
389 {
390         struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
391                                                       reset_abort_work);
392         struct mlx5_core_dev *dev = fw_reset->dev;
393
394         if (!test_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags))
395                 return;
396
397         mlx5_sync_reset_clear_reset_requested(dev, true);
398         mlx5_core_warn(dev, "PCI Sync FW Update Reset Aborted.\n");
399 }
400
401 static void mlx5_sync_reset_events_handle(struct mlx5_fw_reset *fw_reset, struct mlx5_eqe *eqe)
402 {
403         struct mlx5_eqe_sync_fw_update *sync_fw_update_eqe;
404         u8 sync_event_rst_type;
405
406         sync_fw_update_eqe = &eqe->data.sync_fw_update;
407         sync_event_rst_type = sync_fw_update_eqe->sync_rst_state & SYNC_RST_STATE_MASK;
408         switch (sync_event_rst_type) {
409         case MLX5_SYNC_RST_STATE_RESET_REQUEST:
410                 queue_work(fw_reset->wq, &fw_reset->reset_request_work);
411                 break;
412         case MLX5_SYNC_RST_STATE_RESET_NOW:
413                 queue_work(fw_reset->wq, &fw_reset->reset_now_work);
414                 break;
415         case MLX5_SYNC_RST_STATE_RESET_ABORT:
416                 queue_work(fw_reset->wq, &fw_reset->reset_abort_work);
417                 break;
418         }
419 }
420
421 static int fw_reset_event_notifier(struct notifier_block *nb, unsigned long action, void *data)
422 {
423         struct mlx5_fw_reset *fw_reset = mlx5_nb_cof(nb, struct mlx5_fw_reset, nb);
424         struct mlx5_eqe *eqe = data;
425
426         switch (eqe->sub_type) {
427         case MLX5_GENERAL_SUBTYPE_FW_LIVE_PATCH_EVENT:
428                         queue_work(fw_reset->wq, &fw_reset->fw_live_patch_work);
429                 break;
430         case MLX5_GENERAL_SUBTYPE_PCI_SYNC_FOR_FW_UPDATE_EVENT:
431                 mlx5_sync_reset_events_handle(fw_reset, eqe);
432                 break;
433         default:
434                 return NOTIFY_DONE;
435         }
436
437         return NOTIFY_OK;
438 }
439
440 int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev)
441 {
442         unsigned long pci_sync_update_timeout = mlx5_tout_ms(dev, PCI_SYNC_UPDATE);
443         unsigned long timeout = msecs_to_jiffies(pci_sync_update_timeout);
444         struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
445         int err;
446
447         if (!wait_for_completion_timeout(&fw_reset->done, timeout)) {
448                 mlx5_core_warn(dev, "FW sync reset timeout after %lu seconds\n",
449                                pci_sync_update_timeout / 1000);
450                 err = -ETIMEDOUT;
451                 goto out;
452         }
453         err = fw_reset->ret;
454 out:
455         clear_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags);
456         return err;
457 }
458
459 void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev)
460 {
461         struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
462
463         MLX5_NB_INIT(&fw_reset->nb, fw_reset_event_notifier, GENERAL_EVENT);
464         mlx5_eq_notifier_register(dev, &fw_reset->nb);
465 }
466
467 void mlx5_fw_reset_events_stop(struct mlx5_core_dev *dev)
468 {
469         mlx5_eq_notifier_unregister(dev, &dev->priv.fw_reset->nb);
470 }
471
472 int mlx5_fw_reset_init(struct mlx5_core_dev *dev)
473 {
474         struct mlx5_fw_reset *fw_reset = kzalloc(sizeof(*fw_reset), GFP_KERNEL);
475
476         if (!fw_reset)
477                 return -ENOMEM;
478         fw_reset->wq = create_singlethread_workqueue("mlx5_fw_reset_events");
479         if (!fw_reset->wq) {
480                 kfree(fw_reset);
481                 return -ENOMEM;
482         }
483
484         fw_reset->dev = dev;
485         dev->priv.fw_reset = fw_reset;
486
487         INIT_WORK(&fw_reset->fw_live_patch_work, mlx5_fw_live_patch_event);
488         INIT_WORK(&fw_reset->reset_request_work, mlx5_sync_reset_request_event);
489         INIT_WORK(&fw_reset->reset_reload_work, mlx5_sync_reset_reload_work);
490         INIT_WORK(&fw_reset->reset_now_work, mlx5_sync_reset_now_event);
491         INIT_WORK(&fw_reset->reset_abort_work, mlx5_sync_reset_abort_event);
492
493         init_completion(&fw_reset->done);
494         return 0;
495 }
496
497 void mlx5_fw_reset_cleanup(struct mlx5_core_dev *dev)
498 {
499         struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
500
501         destroy_workqueue(fw_reset->wq);
502         kfree(dev->priv.fw_reset);
503 }