habanalabs: add a lock to protect multiple reset variables
author: Ofir Bitton <obitton@habana.ai>
Tue, 23 Nov 2021 13:15:22 +0000 (15:15 +0200)
committer: Oded Gabbay <ogabbay@kernel.org>
Sun, 26 Dec 2021 12:42:11 +0000 (14:42 +0200)
Replace the atomic operations on the reset state with a spinlock, so that
more than a single reset variable can be protected by the same lock.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/common/command_buffer.c
drivers/misc/habanalabs/common/debugfs.c
drivers/misc/habanalabs/common/device.c
drivers/misc/habanalabs/common/habanalabs.h

index 649380b..3c0ae07 100644 (file)
@@ -250,8 +250,7 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
         * Can't use generic function to check this because of special case
         * where we create a CB as part of the reset process
         */
-       if ((hdev->disabled) || ((atomic_read(&hdev->reset_info.in_reset)) &&
-                                       (ctx_id != HL_KERNEL_ASID_ID))) {
+       if ((hdev->disabled) || (hdev->reset_info.in_reset && (ctx_id != HL_KERNEL_ASID_ID))) {
                dev_warn_ratelimited(hdev->dev,
                        "Device is disabled or in reset. Can't create new CBs\n");
                rc = -EBUSY;
index 746d1a1..fc084ee 100644 (file)
@@ -527,7 +527,7 @@ static int engines_show(struct seq_file *s, void *data)
        struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
        struct hl_device *hdev = dev_entry->hdev;
 
-       if (atomic_read(&hdev->reset_info.in_reset)) {
+       if (hdev->reset_info.in_reset) {
                dev_warn_ratelimited(hdev->dev,
                                "Can't check device idle during reset\n");
                return 0;
@@ -658,7 +658,7 @@ static ssize_t hl_data_read32(struct file *f, char __user *buf,
        ssize_t rc;
        u32 val;
 
-       if (atomic_read(&hdev->reset_info.in_reset)) {
+       if (hdev->reset_info.in_reset) {
                dev_warn_ratelimited(hdev->dev, "Can't read during reset\n");
                return 0;
        }
@@ -694,7 +694,7 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf,
        u32 value;
        ssize_t rc;
 
-       if (atomic_read(&hdev->reset_info.in_reset)) {
+       if (hdev->reset_info.in_reset) {
                dev_warn_ratelimited(hdev->dev, "Can't write during reset\n");
                return 0;
        }
@@ -731,7 +731,7 @@ static ssize_t hl_data_read64(struct file *f, char __user *buf,
        ssize_t rc;
        u64 val;
 
-       if (atomic_read(&hdev->reset_info.in_reset)) {
+       if (hdev->reset_info.in_reset) {
                dev_warn_ratelimited(hdev->dev, "Can't read during reset\n");
                return 0;
        }
@@ -767,7 +767,7 @@ static ssize_t hl_data_write64(struct file *f, const char __user *buf,
        u64 value;
        ssize_t rc;
 
-       if (atomic_read(&hdev->reset_info.in_reset)) {
+       if (hdev->reset_info.in_reset) {
                dev_warn_ratelimited(hdev->dev, "Can't write during reset\n");
                return 0;
        }
@@ -802,7 +802,7 @@ static ssize_t hl_dma_size_write(struct file *f, const char __user *buf,
        ssize_t rc;
        u32 size;
 
-       if (atomic_read(&hdev->reset_info.in_reset)) {
+       if (hdev->reset_info.in_reset) {
                dev_warn_ratelimited(hdev->dev, "Can't DMA during reset\n");
                return 0;
        }
@@ -1077,7 +1077,7 @@ static ssize_t hl_clk_gate_write(struct file *f, const char __user *buf,
        u64 value;
        ssize_t rc;
 
-       if (atomic_read(&hdev->reset_info.in_reset)) {
+       if (hdev->reset_info.in_reset) {
                dev_warn_ratelimited(hdev->dev,
                                "Can't change clock gating during reset\n");
                return 0;
@@ -1119,7 +1119,7 @@ static ssize_t hl_stop_on_err_write(struct file *f, const char __user *buf,
        u32 value;
        ssize_t rc;
 
-       if (atomic_read(&hdev->reset_info.in_reset)) {
+       if (hdev->reset_info.in_reset) {
                dev_warn_ratelimited(hdev->dev,
                                "Can't change stop on error during reset\n");
                return 0;
index f8f9eb7..84621ad 100644 (file)
@@ -17,7 +17,7 @@ enum hl_device_status hl_device_status(struct hl_device *hdev)
 {
        enum hl_device_status status;
 
-       if (atomic_read(&hdev->reset_info.in_reset))
+       if (hdev->reset_info.in_reset)
                status = HL_DEVICE_STATUS_IN_RESET;
        else if (hdev->reset_info.needs_reset)
                status = HL_DEVICE_STATUS_NEEDS_RESET;
@@ -448,11 +448,11 @@ static int device_early_init(struct hl_device *hdev)
        mutex_init(&hdev->debug_lock);
        INIT_LIST_HEAD(&hdev->cs_mirror_list);
        spin_lock_init(&hdev->cs_mirror_lock);
+       spin_lock_init(&hdev->reset_info.lock);
        INIT_LIST_HEAD(&hdev->fpriv_list);
        INIT_LIST_HEAD(&hdev->fpriv_ctrl_list);
        mutex_init(&hdev->fpriv_list_lock);
        mutex_init(&hdev->fpriv_ctrl_list_lock);
-       atomic_set(&hdev->reset_info.in_reset, 0);
        mutex_init(&hdev->clk_throttling.lock);
 
        return 0;
@@ -544,7 +544,7 @@ reschedule:
         * status for at least one heartbeat. From this point driver restarts
         * tracking future consecutive fatal errors.
         */
-       if (!(atomic_read(&hdev->reset_info.in_reset)))
+       if (!hdev->reset_info.in_reset)
                hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
 
        schedule_delayed_work(&hdev->work_heartbeat,
@@ -722,11 +722,14 @@ int hl_device_suspend(struct hl_device *hdev)
        pci_save_state(hdev->pdev);
 
        /* Block future CS/VM/JOB completion operations */
-       rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1);
-       if (rc) {
+       spin_lock(&hdev->reset_info.lock);
+       if (hdev->reset_info.in_reset) {
+               spin_unlock(&hdev->reset_info.lock);
                dev_err(hdev->dev, "Can't suspend while in reset\n");
                return -EIO;
        }
+       hdev->reset_info.in_reset = 1;
+       spin_unlock(&hdev->reset_info.lock);
 
        /* This blocks all other stuff that is not blocked by in_reset */
        hdev->disabled = true;
@@ -776,8 +779,10 @@ int hl_device_resume(struct hl_device *hdev)
        }
 
 
-       hdev->disabled = false;
-       atomic_set(&hdev->reset_info.in_reset, 0);
+       /* 'in_reset' was set to true during suspend, now we must clear it in order
+        * for hard reset to be performed
+        */
+       hdev->reset_info.in_reset = 0;
 
        rc = hl_device_reset(hdev, HL_DRV_RESET_HARD);
        if (rc) {
@@ -1024,9 +1029,13 @@ do_reset:
         */
        if (!from_hard_reset_thread) {
                /* Block future CS/VM/JOB completion operations */
-               rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1);
-               if (rc)
+               spin_lock(&hdev->reset_info.lock);
+               if (hdev->reset_info.in_reset) {
+                       spin_unlock(&hdev->reset_info.lock);
                        return 0;
+               }
+               hdev->reset_info.in_reset = 1;
+               spin_unlock(&hdev->reset_info.lock);
 
                handle_reset_trigger(hdev, flags);
 
@@ -1234,7 +1243,7 @@ kill_processes:
                }
        }
 
-       atomic_set(&hdev->reset_info.in_reset, 0);
+       hdev->reset_info.in_reset = 0;
        hdev->reset_info.needs_reset = false;
 
        dev_notice(hdev->dev, "Successfully finished resetting the device\n");
@@ -1272,7 +1281,7 @@ out_err:
                goto again;
        }
 
-       atomic_set(&hdev->reset_info.in_reset, 0);
+       hdev->reset_info.in_reset = 0;
 
        return rc;
 }
@@ -1583,6 +1592,7 @@ out_disabled:
  */
 void hl_device_fini(struct hl_device *hdev)
 {
+       bool device_in_reset;
        ktime_t timeout;
        u64 reset_sec;
        int i, rc;
@@ -1606,10 +1616,22 @@ void hl_device_fini(struct hl_device *hdev)
         */
 
        timeout = ktime_add_us(ktime_get(), reset_sec * 1000 * 1000);
-       rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1);
-       while (rc) {
+
+       spin_lock(&hdev->reset_info.lock);
+       device_in_reset = !!hdev->reset_info.in_reset;
+       if (!device_in_reset)
+               hdev->reset_info.in_reset = 1;
+       spin_unlock(&hdev->reset_info.lock);
+
+       while (device_in_reset) {
                usleep_range(50, 200);
-               rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1);
+
+               spin_lock(&hdev->reset_info.lock);
+               device_in_reset = !!hdev->reset_info.in_reset;
+               if (!device_in_reset)
+                       hdev->reset_info.in_reset = 1;
+               spin_unlock(&hdev->reset_info.lock);
+
                if (ktime_compare(ktime_get(), timeout) > 0) {
                        dev_crit(hdev->dev,
                                "Failed to remove device because reset function did not finish\n");
index 47eaeff..37a3a46 100644 (file)
@@ -2457,9 +2457,10 @@ struct last_error_session_info {
 
 /**
  * struct hl_reset_info - holds current device reset information.
- * @in_reset: is device in reset flow.
+ * @lock: lock to protect critical reset flows.
  * @soft_reset_cnt: number of soft reset since the driver was loaded.
  * @hard_reset_cnt: number of hard reset since the driver was loaded.
+ * @in_reset: is device in reset flow.
  * @is_in_soft_reset: Device is currently in soft reset process.
  * @needs_reset: true if reset_on_lockup is false and device should be reset
  *               due to lockup.
@@ -2474,9 +2475,10 @@ struct last_error_session_info {
  *                         complete instead.
  */
 struct hl_reset_info {
-       atomic_t        in_reset;
+       spinlock_t      lock;
        u32             soft_reset_cnt;
        u32             hard_reset_cnt;
+       u8              in_reset;
        u8              is_in_soft_reset;
        u8              needs_reset;
        u8              hard_reset_pending;