Merge tag 'char-misc-5.15-rc1-lkdtm' of git://git.kernel.org/pub/scm/linux/kernel...

[linux-2.6-microblaze.git] / drivers / misc / habanalabs / common / device.c
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c

index ff4cbde..97c7c86 100644 (file)
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -7,11 +7,11 @@
  
  #define pr_fmt(fmt)                    "habanalabs: " fmt
  
+#include <uapi/misc/habanalabs.h>
  #include "habanalabs.h"
  
  #include <linux/pci.h>
  #include <linux/hwmon.h>
-#include <uapi/misc/habanalabs.h>
  
  enum hl_device_status hl_device_status(struct hl_device *hdev)
  {
@@ -23,6 +23,8 @@ enum hl_device_status hl_device_status(struct hl_device *hdev)
                 status = HL_DEVICE_STATUS_NEEDS_RESET;
         else if (hdev->disabled)
                 status = HL_DEVICE_STATUS_MALFUNCTION;
+       else if (!hdev->init_done)
+               status = HL_DEVICE_STATUS_IN_DEVICE_CREATION;
         else
                 status = HL_DEVICE_STATUS_OPERATIONAL;
  
@@ -44,6 +46,7 @@ bool hl_device_operational(struct hl_device *hdev,
         case HL_DEVICE_STATUS_NEEDS_RESET:
                 return false;
         case HL_DEVICE_STATUS_OPERATIONAL:
+       case HL_DEVICE_STATUS_IN_DEVICE_CREATION:
         default:
                 return true;
         }
@@ -129,8 +132,8 @@ static int hl_device_release(struct inode *inode, struct file *filp)
         hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr);
  
         if (!hl_hpriv_put(hpriv))
-               dev_warn(hdev->dev,
-                       "Device is still in use because there are live CS and/or memory mappings\n");
+               dev_notice(hdev->dev,
+                       "User process closed FD but device still in use\n");
  
         hdev->last_open_session_duration_jif =
                 jiffies - hdev->last_successful_open_jif;
@@ -308,9 +311,15 @@ static void device_hard_reset_pending(struct work_struct *work)
                 container_of(work, struct hl_device_reset_work,
                                 reset_work.work);
         struct hl_device *hdev = device_reset_work->hdev;
+       u32 flags;
         int rc;
  
-       rc = hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FROM_RESET_THREAD);
+       flags = HL_RESET_HARD | HL_RESET_FROM_RESET_THREAD;
+
+       if (device_reset_work->fw_reset)
+               flags |= HL_RESET_FW;
+
+       rc = hl_device_reset(hdev, flags);
         if ((rc == -EBUSY) && !hdev->device_fini_pending) {
                 dev_info(hdev->dev,
                         "Could not reset device. will try again in %u seconds",
@@ -682,6 +691,44 @@ out:
         return rc;
  }
  
+static void take_release_locks(struct hl_device *hdev)
+{
+       /* Each lock below is taken and immediately released. First, flush
+        * anyone inside the critical section of enqueueing jobs to the H/W
+        */
+       hdev->asic_funcs->hw_queues_lock(hdev);
+       hdev->asic_funcs->hw_queues_unlock(hdev);
+
+       /* Flush processes that are sending messages to the CPU */
+       mutex_lock(&hdev->send_cpu_message_lock);
+       mutex_unlock(&hdev->send_cpu_message_lock);
+
+       /* Flush anyone that is inside device open */
+       mutex_lock(&hdev->fpriv_list_lock);
+       mutex_unlock(&hdev->fpriv_list_lock);
+}
+
+static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset)
+{
+       if (hard_reset)
+               device_late_fini(hdev);
+
+       /* The late-fini step above runs for hard reset only. Next, halt
+        * the engines and disable interrupts so we won't get any more
+        * completions from H/W and we won't have any accesses from the
+        * H/W to the host machine. hard_reset/fw_reset are passed as-is
+        */
+       hdev->asic_funcs->halt_engines(hdev, hard_reset, fw_reset);
+
+       /* Go over all the queues, release all CS and their jobs */
+       hl_cs_rollback_all(hdev);
+
+       /* Release all pending user interrupts; each pending user interrupt
+        * holds a reference to a user context
+        */
+       hl_release_pending_user_interrupts(hdev);
+}
+
  /*
   * hl_device_suspend - initiate device suspend
   *
@@ -707,16 +754,7 @@ int hl_device_suspend(struct hl_device *hdev)
         /* This blocks all other stuff that is not blocked by in_reset */
         hdev->disabled = true;
  
-       /*
-        * Flush anyone that is inside the critical section of enqueue
-        * jobs to the H/W
-        */
-       hdev->asic_funcs->hw_queues_lock(hdev);
-       hdev->asic_funcs->hw_queues_unlock(hdev);
-
-       /* Flush processes that are sending message to CPU */
-       mutex_lock(&hdev->send_cpu_message_lock);
-       mutex_unlock(&hdev->send_cpu_message_lock);
+       take_release_locks(hdev);
  
         rc = hdev->asic_funcs->suspend(hdev);
         if (rc)
@@ -819,6 +857,11 @@ static int device_kill_open_processes(struct hl_device *hdev, u32 timeout)
                         usleep_range(1000, 10000);
  
                         put_task_struct(task);
+               } else {
+                       dev_warn(hdev->dev,
+                               "Can't get task struct for PID so giving up on killing process\n");
+                       mutex_unlock(&hdev->fpriv_list_lock);
+                       return -ETIME;
                 }
         }
  
@@ -885,7 +928,7 @@ static void device_disable_open_processes(struct hl_device *hdev)
  int hl_device_reset(struct hl_device *hdev, u32 flags)
  {
         u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
-       bool hard_reset, from_hard_reset_thread, hard_instead_soft = false;
+       bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false;
         int i, rc;
  
         if (!hdev->init_done) {
@@ -894,8 +937,9 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
                 return 0;
         }
  
-       hard_reset = (flags & HL_RESET_HARD) != 0;
-       from_hard_reset_thread = (flags & HL_RESET_FROM_RESET_THREAD) != 0;
+       hard_reset = !!(flags & HL_RESET_HARD);
+       from_hard_reset_thread = !!(flags & HL_RESET_FROM_RESET_THREAD);
+       fw_reset = !!(flags & HL_RESET_FW);
  
         if (!hard_reset && !hdev->supports_soft_reset) {
                 hard_instead_soft = true;
@@ -947,11 +991,13 @@ do_reset:
                 else
                         hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
  
-               /*
-                * if reset is due to heartbeat, device CPU is no responsive in
-                * which case no point sending PCI disable message to it
+               /* If reset is due to heartbeat, device CPU is not responsive, in
+                * which case there is no point sending PCI disable message to it.
+                *
+                * If F/W is performing the reset, no need to send it a message to disable
+                * PCI access
                  */
-               if (hard_reset && !(flags & HL_RESET_HEARTBEAT)) {
+               if (hard_reset && !(flags & (HL_RESET_HEARTBEAT | HL_RESET_FW))) {
                         /* Disable PCI access from device F/W so he won't send
                          * us additional interrupts. We disable MSI/MSI-X at
                          * the halt_engines function and we can't have the F/W
@@ -970,15 +1016,7 @@ do_reset:
                 /* This also blocks future CS/VM/JOB completion operations */
                 hdev->disabled = true;
  
-               /* Flush anyone that is inside the critical section of enqueue
-                * jobs to the H/W
-                */
-               hdev->asic_funcs->hw_queues_lock(hdev);
-               hdev->asic_funcs->hw_queues_unlock(hdev);
-
-               /* Flush anyone that is inside device open */
-               mutex_lock(&hdev->fpriv_list_lock);
-               mutex_unlock(&hdev->fpriv_list_lock);
+               take_release_locks(hdev);
  
                 dev_err(hdev->dev, "Going to RESET device!\n");
         }
@@ -989,6 +1027,8 @@ again:
  
                 hdev->process_kill_trial_cnt = 0;
  
+               hdev->device_reset_work.fw_reset = fw_reset;
+
                 /*
                  * Because the reset function can't run from heartbeat work,
                  * we need to call the reset function from a dedicated work.
@@ -999,31 +1039,7 @@ again:
                 return 0;
         }
  
-       if (hard_reset) {
-               device_late_fini(hdev);
-
-               /*
-                * Now that the heartbeat thread is closed, flush processes
-                * which are sending messages to CPU
-                */
-               mutex_lock(&hdev->send_cpu_message_lock);
-               mutex_unlock(&hdev->send_cpu_message_lock);
-       }
-
-       /*
-        * Halt the engines and disable interrupts so we won't get any more
-        * completions from H/W and we won't have any accesses from the
-        * H/W to the host machine
-        */
-       hdev->asic_funcs->halt_engines(hdev, hard_reset);
-
-       /* Go over all the queues, release all CS and their jobs */
-       hl_cs_rollback_all(hdev);
-
-       /* Release all pending user interrupts, each pending user interrupt
-        * holds a reference to user context
-        */
-       hl_release_pending_user_interrupts(hdev);
+       cleanup_resources(hdev, hard_reset, fw_reset);
  
  kill_processes:
         if (hard_reset) {
@@ -1057,12 +1073,15 @@ kill_processes:
         }
  
         /* Reset the H/W. It will be in idle state after this returns */
-       hdev->asic_funcs->hw_fini(hdev, hard_reset);
+       hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
  
         if (hard_reset) {
+               hdev->fw_loader.linux_loaded = false;
+
                 /* Release kernel context */
                 if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1)
                         hdev->kernel_ctx = NULL;
+
                 hl_vm_fini(hdev);
                 hl_mmu_fini(hdev);
                 hl_eq_reset(hdev, &hdev->event_queue);
@@ -1292,6 +1311,10 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
         if (rc)
                 goto user_interrupts_fini;
  
+
+       /* initialize completion structure for multi CS wait */
+       hl_multi_cs_completion_init(hdev);
+
         /*
          * Initialize the H/W queues. Must be done before hw_init, because
          * there the addresses of the kernel queue are being written to the
@@ -1361,6 +1384,8 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
  
         hdev->compute_ctx = NULL;
  
+       hdev->asic_funcs->state_dump_init(hdev);
+
         hl_debugfs_add_device(hdev);
  
         /* debugfs nodes are created in hl_ctx_init so it must be called after
@@ -1567,31 +1592,13 @@ void hl_device_fini(struct hl_device *hdev)
         /* Mark device as disabled */
         hdev->disabled = true;
  
-       /* Flush anyone that is inside the critical section of enqueue
-        * jobs to the H/W
-        */
-       hdev->asic_funcs->hw_queues_lock(hdev);
-       hdev->asic_funcs->hw_queues_unlock(hdev);
-
-       /* Flush anyone that is inside device open */
-       mutex_lock(&hdev->fpriv_list_lock);
-       mutex_unlock(&hdev->fpriv_list_lock);
+       take_release_locks(hdev);
  
         hdev->hard_reset_pending = true;
  
         hl_hwmon_fini(hdev);
  
-       device_late_fini(hdev);
-
-       /*
-        * Halt the engines and disable interrupts so we won't get any more
-        * completions from H/W and we won't have any accesses from the
-        * H/W to the host machine
-        */
-       hdev->asic_funcs->halt_engines(hdev, true);
-
-       /* Go over all the queues, release all CS and their jobs */
-       hl_cs_rollback_all(hdev);
+       cleanup_resources(hdev, true, false);
  
         /* Kill processes here after CS rollback. This is because the process
          * can't really exit until all its CSs are done, which is what we
@@ -1610,7 +1617,9 @@ void hl_device_fini(struct hl_device *hdev)
         hl_cb_pool_fini(hdev);
  
         /* Reset the H/W. It will be in idle state after this returns */
-       hdev->asic_funcs->hw_fini(hdev, true);
+       hdev->asic_funcs->hw_fini(hdev, true, false);
+
+       hdev->fw_loader.linux_loaded = false;
  
         /* Release kernel context */
         if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))