habanalabs: staged submission support
author Ofir Bitton <obitton@habana.ai>
Tue, 8 Dec 2020 11:47:05 +0000 (13:47 +0200)
committer Oded Gabbay <ogabbay@kernel.org>
Wed, 27 Jan 2021 19:03:51 +0000 (21:03 +0200)
We introduce a new mechanism named Staged Submission.
This mechanism allows the user to send a whole CS in pieces.
Only the last CS in the group requires a completion.
The timeout timer is started upon reception of the first
CS in the group.
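
As an illustration only (not part of this patch), the per-piece
cs_flags selection looks roughly as follows; 'piece_index' and
'num_pieces' are hypothetical loop variables and the ioctl plumbing
is omitted:

	/* Illustrative sketch: choose cs_flags for one piece of a
	 * staged group. 'piece_index' and 'num_pieces' are hypothetical.
	 */
	__u32 flags = HL_CS_FLAGS_STAGED_SUBMISSION;

	if (piece_index == 0)
		flags |= HL_CS_FLAGS_STAGED_SUBMISSION_FIRST;
	if (piece_index == num_pieces - 1)
		flags |= HL_CS_FLAGS_STAGED_SUBMISSION_LAST;

Pieces after the first also pass the sequence of the first CS, which
the driver stores in 'staged_sequence' and validates in
hl_hw_queue_schedule_cs(); a completion is generated only for the
last piece.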

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/common/command_submission.c
drivers/misc/habanalabs/common/habanalabs.h
drivers/misc/habanalabs/common/hw_queue.c
drivers/misc/habanalabs/gaudi/gaudi.c

index 57daff0..7bd4a03 100644 (file)
@@ -334,6 +334,133 @@ static void complete_job(struct hl_device *hdev, struct hl_cs_job *job)
        cs_job_put(job);
 }
 
+/*
+ * hl_staged_cs_find_first - locate the first CS in this staged submission
+ *
+ * @hdev: pointer to device structure
+ * @cs_seq: staged submission sequence number
+ *
+ * @note: This function must be called under 'hdev->cs_mirror_lock'
+ *
+ * Find and return the first CS of the staged submission with sequence @cs_seq
+ */
+struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq)
+{
+       struct hl_cs *cs;
+
+       list_for_each_entry_reverse(cs, &hdev->cs_mirror_list, mirror_node)
+               if (cs->staged_cs && cs->staged_first &&
+                               cs->sequence == cs_seq)
+                       return cs;
+
+       return NULL;
+}
+
+/*
+ * is_staged_cs_last_exists - returns true if the last CS was already submitted
+ *
+ * @hdev: pointer to device structure
+ * @cs: staged submission member
+ *
+ */
+bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs)
+{
+       struct hl_cs *last_entry;
+
+       last_entry = list_last_entry(&cs->staged_cs_node, struct hl_cs,
+                                                               staged_cs_node);
+
+       if (last_entry->staged_last)
+               return true;
+
+       return false;
+}
+
+/*
+ * staged_cs_get - get CS reference if this CS is a part of a staged CS
+ *
+ * @hdev: pointer to device structure
+ * @cs: current CS
+ *
+ * Increment the CS reference for every CS in this staged submission except
+ * for the CS which gets the completion.
+ */
+static void staged_cs_get(struct hl_device *hdev, struct hl_cs *cs)
+{
+       /* Only the last CS in this staged submission will get a completion.
+        * We must increment the reference for all other CS's in this
+        * staged submission.
+        * Once we get a completion we will release the whole staged submission.
+        */
+       if (!cs->staged_last)
+               cs_get(cs);
+}
+
+/*
+ * staged_cs_put - put a CS in case it is part of staged submission
+ *
+ * @hdev: pointer to device structure
+ * @cs: CS to put
+ *
+ * This function decrements a CS reference (for a non completion CS)
+ */
+static void staged_cs_put(struct hl_device *hdev, struct hl_cs *cs)
+{
+       /* We release all CS's in a staged submission except the last
+        * CS, whose reference we never incremented.
+        */
+       if (!cs_needs_completion(cs))
+               cs_put(cs);
+}
+
+static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs)
+{
+       bool next_entry_found = false;
+       struct hl_cs *next;
+
+       if (!cs_needs_timeout(cs))
+               return;
+
+       spin_lock(&hdev->cs_mirror_lock);
+
+       /* We need to handle tdr only once for the complete staged submission.
+        * Hence, we choose the CS that reaches this function first which is
+        * the CS marked as 'staged_last'.
+        */
+       if (cs->staged_cs && cs->staged_last)
+               cs = hl_staged_cs_find_first(hdev, cs->staged_sequence);
+
+       spin_unlock(&hdev->cs_mirror_lock);
+
+       /* Don't cancel TDR in case this CS has timed out, because we might be
+        * running from the TDR context
+        */
+       if (cs && (cs->timedout ||
+                       hdev->timeout_jiffies == MAX_SCHEDULE_TIMEOUT))
+               return;
+
+       if (cs && cs->tdr_active)
+               cancel_delayed_work_sync(&cs->work_tdr);
+
+       spin_lock(&hdev->cs_mirror_lock);
+
+       /* queue TDR for next CS */
+       list_for_each_entry(next, &hdev->cs_mirror_list, mirror_node)
+               if (cs_needs_timeout(next)) {
+                       next_entry_found = true;
+                       break;
+               }
+
+       if (next_entry_found && !next->tdr_active) {
+               next->tdr_active = true;
+               schedule_delayed_work(&next->work_tdr,
+                                       hdev->timeout_jiffies);
+       }
+
+       spin_unlock(&hdev->cs_mirror_lock);
+}
+
 static void cs_do_release(struct kref *ref)
 {
        struct hl_cs *cs = container_of(ref, struct hl_cs, refcount);
@@ -391,32 +518,29 @@ static void cs_do_release(struct kref *ref)
        list_del_init(&cs->mirror_node);
        spin_unlock(&hdev->cs_mirror_lock);
 
-       /* Don't cancel TDR in case this CS was timedout because we might be
-        * running from the TDR context
-        */
-       if (!cs->timedout && hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT) {
-               bool next_entry_found = false;
-               struct hl_cs *next;
-
-               if (cs->tdr_active)
-                       cancel_delayed_work_sync(&cs->work_tdr);
+       cs_handle_tdr(hdev, cs);
 
-               spin_lock(&hdev->cs_mirror_lock);
-
-               /* queue TDR for next CS */
-               list_for_each_entry(next, &hdev->cs_mirror_list, mirror_node)
-                       if (cs_needs_timeout(next)) {
-                               next_entry_found = true;
-                               break;
-                       }
+       if (cs->staged_cs) {
+               /* the completion CS decrements reference for the entire
+                * staged submission
+                */
+               if (cs->staged_last) {
+                       struct hl_cs *staged_cs, *tmp;
 
-               if (next_entry_found && !next->tdr_active) {
-                       next->tdr_active = true;
-                       schedule_delayed_work(&next->work_tdr,
-                                               hdev->timeout_jiffies);
+                       list_for_each_entry_safe(staged_cs, tmp,
+                                       &cs->staged_cs_node, staged_cs_node)
+                               staged_cs_put(hdev, staged_cs);
                }
 
-               spin_unlock(&hdev->cs_mirror_lock);
+               /* A staged CS will be a member of the list only after it
+                * was submitted. We used 'cs_mirror_lock' when inserting
+                * it into the list, so we use it again when removing it
+                */
+               if (cs->submitted) {
+                       spin_lock(&hdev->cs_mirror_lock);
+                       list_del(&cs->staged_cs_node);
+                       spin_unlock(&hdev->cs_mirror_lock);
+               }
        }
 
 out:
@@ -614,6 +738,8 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
 {
        struct hl_cs_job *job, *tmp;
 
+       staged_cs_put(hdev, cs);
+
        list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
                complete_job(hdev, job);
 }
@@ -623,7 +749,9 @@ void hl_cs_rollback_all(struct hl_device *hdev)
        int i;
        struct hl_cs *cs, *tmp;
 
-       /* flush all completions */
+       /* flush all completions before iterating over the CS mirror list in
+        * order to avoid a race with the release functions
+        */
        for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
                flush_workqueue(hdev->cq_wq[i]);
 
@@ -632,7 +760,7 @@ void hl_cs_rollback_all(struct hl_device *hdev)
                cs_get(cs);
                cs->aborted = true;
                dev_warn_ratelimited(hdev->dev, "Killing CS %d.%llu\n",
-                                       cs->ctx->asid, cs->sequence);
+                               cs->ctx->asid, cs->sequence);
                cs_rollback(hdev, cs);
                cs_put(cs);
        }
@@ -804,6 +932,12 @@ static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args)
                return -EBUSY;
        }
 
+       if ((args->in.cs_flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
+                       !hdev->supports_staged_submission) {
+               dev_err(hdev->dev, "staged submission not supported");
+               return -EPERM;
+       }
+
        cs_type_flags = args->in.cs_flags & HL_CS_FLAGS_TYPE_MASK;
 
        if (unlikely(cs_type_flags && !is_power_of_2(cs_type_flags))) {
@@ -875,6 +1009,34 @@ static int hl_cs_copy_chunk_array(struct hl_device *hdev,
        return 0;
 }
 
+static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs,
+                               u64 sequence, u32 flags)
+{
+       if (!(flags & HL_CS_FLAGS_STAGED_SUBMISSION))
+               return 0;
+
+       cs->staged_last = !!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_LAST);
+       cs->staged_first = !!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST);
+
+       if (cs->staged_first) {
+               /* Staged CS sequence is the first CS sequence */
+               INIT_LIST_HEAD(&cs->staged_cs_node);
+               cs->staged_sequence = cs->sequence;
+       } else {
+               /* User sequence will be validated in 'hl_hw_queue_schedule_cs'
+                * under the cs_mirror_lock
+                */
+               cs->staged_sequence = sequence;
+       }
+
+       /* Increment CS reference if needed */
+       staged_cs_get(hdev, cs);
+
+       cs->staged_cs = true;
+
+       return 0;
+}
+
 static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
                                u32 num_chunks, u64 *cs_seq, u32 flags)
 {
@@ -914,6 +1076,10 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 
        hl_debugfs_add_cs(cs);
 
+       rc = cs_staged_submission(hdev, cs, user_sequence, flags);
+       if (rc)
+               goto free_cs_object;
+
        /* Validate ALL the CS chunks before submitting the CS */
        for (i = 0 ; i < num_chunks ; i++) {
                struct hl_cs_chunk *chunk = &cs_chunk_array[i];
index be7947d..30f32f2 100644 (file)
@@ -1169,8 +1169,11 @@ struct hl_userptr {
  * @finish_work: workqueue object to run when CS is completed by H/W.
  * @work_tdr: delayed work node for TDR.
  * @mirror_node : node in device mirror list of command submissions.
+ * @staged_cs_node: node in the staged cs list.
  * @debugfs_list: node in debugfs list of command submissions.
  * @sequence: the sequence number of this CS.
+ * @staged_sequence: the sequence of the staged submission this CS is part of,
+ *                   relevant only if staged_cs is set.
  * @type: CS_TYPE_*.
  * @submitted: true if CS was submitted to H/W.
  * @completed: true if CS was completed by device.
@@ -1195,8 +1198,10 @@ struct hl_cs {
        struct work_struct      finish_work;
        struct delayed_work     work_tdr;
        struct list_head        mirror_node;
+       struct list_head        staged_cs_node;
        struct list_head        debugfs_list;
        u64                     sequence;
+       u64                     staged_sequence;
        enum hl_cs_type         type;
        u8                      submitted;
        u8                      completed;
@@ -1905,6 +1910,7 @@ struct hl_mmu_funcs {
  *                          user processes
  * @device_fini_pending: true if device_fini was called and might be
  *                       waiting for the reset thread to finish
+ * @supports_staged_submission: true if staged submissions are supported
  */
 struct hl_device {
        struct pci_dev                  *pdev;
@@ -2010,6 +2016,7 @@ struct hl_device {
        u8                              needs_reset;
        u8                              process_kill_trial_cnt;
        u8                              device_fini_pending;
+       u8                              supports_staged_submission;
 
        /* Parameters for bring-up */
        u64                             nic_ports_mask;
@@ -2207,6 +2214,8 @@ void hl_fence_get(struct hl_fence *fence);
 void cs_get(struct hl_cs *cs);
 bool cs_needs_completion(struct hl_cs *cs);
 bool cs_needs_timeout(struct hl_cs *cs);
+bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs);
+struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq);
 
 void goya_set_asic_funcs(struct hl_device *hdev);
 void gaudi_set_asic_funcs(struct hl_device *hdev);
index ad440ae..0f33518 100644 (file)
@@ -596,6 +596,31 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
                hdev->asic_funcs->collective_wait_init_cs(cs);
 
        spin_lock(&hdev->cs_mirror_lock);
+
+       /* Verify staged CS exists and add to the staged list */
+       if (cs->staged_cs && !cs->staged_first) {
+               struct hl_cs *staged_cs;
+
+               staged_cs = hl_staged_cs_find_first(hdev, cs->staged_sequence);
+               if (!staged_cs) {
+                       dev_err(hdev->dev,
+                               "Cannot find staged submission sequence %llu",
+                               cs->staged_sequence);
+                       rc = -EINVAL;
+                       goto unlock_cs_mirror;
+               }
+
+               if (is_staged_cs_last_exists(hdev, staged_cs)) {
+                       dev_err(hdev->dev,
+                               "Staged submission sequence %llu already submitted",
+                               cs->staged_sequence);
+                       rc = -EINVAL;
+                       goto unlock_cs_mirror;
+               }
+
+               list_add_tail(&cs->staged_cs_node, &staged_cs->staged_cs_node);
+       }
+
        list_add_tail(&cs->mirror_node, &hdev->cs_mirror_list);
 
        /* Queue TDR if the CS is the first entry and if timeout is wanted */
@@ -637,6 +662,8 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 
        goto out;
 
+unlock_cs_mirror:
+       spin_unlock(&hdev->cs_mirror_lock);
 unroll_cq_resv:
        q = &hdev->kernel_queues[0];
        for (i = 0 ; (i < max_queues) && (cq_cnt > 0) ; i++, q++) {
index 9a3d2fb..1348016 100644 (file)
@@ -1627,6 +1627,7 @@ static int gaudi_sw_init(struct hl_device *hdev)
 
        hdev->supports_sync_stream = true;
        hdev->supports_coresight = true;
+       hdev->supports_staged_submission = true;
 
        return 0;