// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2016-2021 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#include <uapi/drm/habanalabs_accel.h>
#include "habanalabs.h"

#include <linux/uaccess.h>
#include <linux/slab.h>
14 #define HL_CS_FLAGS_TYPE_MASK (HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT | \
15 HL_CS_FLAGS_COLLECTIVE_WAIT | HL_CS_FLAGS_RESERVE_SIGNALS_ONLY | \
16 HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY | HL_CS_FLAGS_ENGINE_CORE_COMMAND)
19 #define MAX_TS_ITER_NUM 10
/**
 * enum hl_cs_wait_status - cs wait status
 * @CS_WAIT_STATUS_BUSY: cs was not completed yet
 * @CS_WAIT_STATUS_COMPLETED: cs completed
 * @CS_WAIT_STATUS_GONE: cs completed but fence is already gone
 */
enum hl_cs_wait_status {
	CS_WAIT_STATUS_BUSY,
	CS_WAIT_STATUS_COMPLETED,
	CS_WAIT_STATUS_GONE
};
33 static void job_wq_completion(struct work_struct *work);
34 static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, u64 timeout_us, u64 seq,
35 enum hl_cs_wait_status *status, s64 *timestamp);
36 static void cs_do_release(struct kref *ref);
static void hl_push_cs_outcome(struct hl_device *hdev,
		struct hl_cs_outcome_store *outcome_store,
		u64 seq, ktime_t ts, int error)
{
	struct hl_cs_outcome *node;
	unsigned long flags;
	/*
	 * CS outcome store supports the following operations:
	 * push outcome - store a recent CS outcome in the store
	 * pop outcome - retrieve a SPECIFIC (by seq) CS outcome from the store
	 * It uses 2 lists: used list and free list.
	 * It has a pre-allocated amount of nodes, each node stores
	 * a single CS outcome.
	 * Initially, all the nodes are in the free list.
	 * On push outcome, a node (any) is taken from the free list, its
	 * information is filled in, and the node is moved to the used list.
	 * It is possible that there are no nodes left in the free list.
	 * In this case, we will lose some information about old outcomes. We
	 * will pop the OLDEST node from the used list, and make it free.
	 * On pop, the node is searched for in the used list (using a search
	 * over the hash map, keyed by seq).
	 * If found, the node is then removed from the used list, and moved
	 * back to the free list. The outcome data that the node contained is
	 * returned back to the user.
	 */
65 spin_lock_irqsave(&outcome_store->db_lock, flags);
	if (list_empty(&outcome_store->free_list)) {
		node = list_last_entry(&outcome_store->used_list,
				struct hl_cs_outcome, list_link);
		hash_del(&node->map_link);
		dev_dbg(hdev->dev, "CS %llu outcome was lost\n", node->seq);
	} else {
		node = list_last_entry(&outcome_store->free_list,
				struct hl_cs_outcome, list_link);
	}

	list_del_init(&node->list_link);

	node->seq = seq;
	node->ts = ts;
	node->error = error;
83 list_add(&node->list_link, &outcome_store->used_list);
84 hash_add(outcome_store->outcome_map, &node->map_link, node->seq);
	spin_unlock_irqrestore(&outcome_store->db_lock, flags);
}
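/*
 * hl_pop_cs_outcome() - retrieve a stored CS outcome by sequence number.
 * Looks the seq up in the outcome hash map; on a hit the timestamp and error
 * are copied back to the caller and the node is recycled to the free list.
 * Returns true if the outcome was found, false otherwise.
 */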
static bool hl_pop_cs_outcome(struct hl_cs_outcome_store *outcome_store,
		u64 seq, ktime_t *ts, int *error)
{
	struct hl_cs_outcome *node;
	unsigned long flags;

	spin_lock_irqsave(&outcome_store->db_lock, flags);

	hash_for_each_possible(outcome_store->outcome_map, node, map_link, seq)
		if (node->seq == seq) {
			*ts = node->ts;
			*error = node->error;
			hash_del(&node->map_link);
			list_del_init(&node->list_link);
			list_add(&node->list_link, &outcome_store->free_list);
			spin_unlock_irqrestore(&outcome_store->db_lock, flags);
			return true;
		}

	spin_unlock_irqrestore(&outcome_store->db_lock, flags);

	return false;
}
116 static void hl_sob_reset(struct kref *ref)
118 struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob,
120 struct hl_device *hdev = hw_sob->hdev;
122 dev_dbg(hdev->dev, "reset sob id %u\n", hw_sob->sob_id);
124 hdev->asic_funcs->reset_sob(hdev, hw_sob);
126 hw_sob->need_reset = false;
129 void hl_sob_reset_error(struct kref *ref)
131 struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob,
133 struct hl_device *hdev = hw_sob->hdev;
136 "SOB release shouldn't be called here, q_idx: %d, sob_id: %d\n",
137 hw_sob->q_idx, hw_sob->sob_id);
140 void hw_sob_put(struct hl_hw_sob *hw_sob)
143 kref_put(&hw_sob->kref, hl_sob_reset);
146 static void hw_sob_put_err(struct hl_hw_sob *hw_sob)
149 kref_put(&hw_sob->kref, hl_sob_reset_error);
152 void hw_sob_get(struct hl_hw_sob *hw_sob)
155 kref_get(&hw_sob->kref);
159 * hl_gen_sob_mask() - Generates a sob mask to be used in a monitor arm packet
160 * @sob_base: sob base id
161 * @sob_mask: sob user mask, each bit represents a sob offset from sob base
162 * @mask: generated mask
164 * Return: 0 if given parameters are valid
166 int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask)
173 if (sob_mask == 0x1) {
174 *mask = ~(1 << (sob_base & 0x7));
176 /* find msb in order to verify sob range is valid */
177 for (i = BITS_PER_BYTE - 1 ; i >= 0 ; i--)
178 if (BIT(i) & sob_mask)
181 if (i > (HL_MAX_SOBS_PER_MONITOR - (sob_base & 0x7) - 1))
190 static void hl_fence_release(struct kref *kref)
192 struct hl_fence *fence =
193 container_of(kref, struct hl_fence, refcount);
194 struct hl_cs_compl *hl_cs_cmpl =
195 container_of(fence, struct hl_cs_compl, base_fence);
200 void hl_fence_put(struct hl_fence *fence)
202 if (IS_ERR_OR_NULL(fence))
204 kref_put(&fence->refcount, hl_fence_release);
207 void hl_fences_put(struct hl_fence **fence, int len)
211 for (i = 0; i < len; i++, fence++)
212 hl_fence_put(*fence);
215 void hl_fence_get(struct hl_fence *fence)
218 kref_get(&fence->refcount);
221 static void hl_fence_init(struct hl_fence *fence, u64 sequence)
223 kref_init(&fence->refcount);
224 fence->cs_sequence = sequence;
226 fence->timestamp = ktime_set(0, 0);
227 fence->mcs_handling_done = false;
228 init_completion(&fence->completion);
231 void cs_get(struct hl_cs *cs)
233 kref_get(&cs->refcount);
236 static int cs_get_unless_zero(struct hl_cs *cs)
238 return kref_get_unless_zero(&cs->refcount);
241 static void cs_put(struct hl_cs *cs)
243 kref_put(&cs->refcount, cs_do_release);
246 static void cs_job_do_release(struct kref *ref)
248 struct hl_cs_job *job = container_of(ref, struct hl_cs_job, refcount);
253 static void hl_cs_job_put(struct hl_cs_job *job)
255 kref_put(&job->refcount, cs_job_do_release);
258 bool cs_needs_completion(struct hl_cs *cs)
	/* In case this is a staged CS, only the last CS in sequence should
	 * get a completion; any non-staged CS will always get a completion.
	 */
	if (cs->staged_cs && !cs->staged_last)
269 bool cs_needs_timeout(struct hl_cs *cs)
	/* In case this is a staged CS, only the first CS in sequence should
	 * get a timeout; any non-staged CS will always get a timeout.
	 */
	if (cs->staged_cs && !cs->staged_first)
280 static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
283 * Patched CB is created for external queues jobs, and for H/W queues
284 * jobs if the user CB was allocated by driver and MMU is disabled.
286 return (job->queue_type == QUEUE_TYPE_EXT ||
287 (job->queue_type == QUEUE_TYPE_HW &&
288 job->is_kernel_allocated_cb &&
293 * cs_parser - parse the user command submission
295 * @hpriv : pointer to the private data of the fd
296 * @job : pointer to the job that holds the command submission info
298 * The function parses the command submission of the user. It calls the
299 * ASIC specific parser, which returns a list of memory blocks to send
300 * to the device as different command buffers
303 static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
305 struct hl_device *hdev = hpriv->hdev;
306 struct hl_cs_parser parser;
309 parser.ctx_id = job->cs->ctx->asid;
310 parser.cs_sequence = job->cs->sequence;
311 parser.job_id = job->id;
313 parser.hw_queue_id = job->hw_queue_id;
314 parser.job_userptr_list = &job->userptr_list;
315 parser.patched_cb = NULL;
316 parser.user_cb = job->user_cb;
317 parser.user_cb_size = job->user_cb_size;
318 parser.queue_type = job->queue_type;
319 parser.is_kernel_allocated_cb = job->is_kernel_allocated_cb;
320 job->patched_cb = NULL;
321 parser.completion = cs_needs_completion(job->cs);
323 rc = hdev->asic_funcs->cs_parser(hdev, &parser);
325 if (is_cb_patched(hdev, job)) {
327 job->patched_cb = parser.patched_cb;
328 job->job_cb_size = parser.patched_cb_size;
329 job->contains_dma_pkt = parser.contains_dma_pkt;
330 atomic_inc(&job->patched_cb->cs_cnt);
334 * Whether the parsing worked or not, we don't need the
335 * original CB anymore because it was already parsed and
336 * won't be accessed again for this CS
338 atomic_dec(&job->user_cb->cs_cnt);
339 hl_cb_put(job->user_cb);
342 job->job_cb_size = job->user_cb_size;
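/*
 * hl_complete_job() - release all resources held by a job: its user/patched
 * CBs and userptr list, unlink it from the CS job list, and drop the CS
 * reference that was taken for jobs which get a completion.
 */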
348 static void hl_complete_job(struct hl_device *hdev, struct hl_cs_job *job)
350 struct hl_cs *cs = job->cs;
352 if (is_cb_patched(hdev, job)) {
353 hl_userptr_delete_list(hdev, &job->userptr_list);
		/* We might arrive here from rollback and the patched CB wasn't
		 * created, so we need to check that it's not NULL.
		 */
359 if (job->patched_cb) {
360 atomic_dec(&job->patched_cb->cs_cnt);
361 hl_cb_put(job->patched_cb);
365 /* For H/W queue jobs, if a user CB was allocated by driver and MMU is
366 * enabled, the user CB isn't released in cs_parser() and thus should be
367 * released here. This is also true for INT queues jobs which were
368 * allocated by driver.
370 if ((job->is_kernel_allocated_cb &&
371 ((job->queue_type == QUEUE_TYPE_HW && hdev->mmu_enable) ||
372 job->queue_type == QUEUE_TYPE_INT))) {
373 atomic_dec(&job->user_cb->cs_cnt);
374 hl_cb_put(job->user_cb);
378 * This is the only place where there can be multiple threads
379 * modifying the list at the same time
381 spin_lock(&cs->job_lock);
382 list_del(&job->cs_node);
383 spin_unlock(&cs->job_lock);
385 hl_debugfs_remove_job(hdev, job);
387 /* We decrement reference only for a CS that gets completion
388 * because the reference was incremented only for this kind of CS
389 * right before it was scheduled.
391 * In staged submission, only the last CS marked as 'staged_last'
392 * gets completion, hence its release function will be called from here.
	 * As for the rest of the CS's in the staged submission, which do not
	 * get a completion, their CS reference will be decremented by the
395 * 'staged_last' CS during the CS release flow.
396 * All relevant PQ CI counters will be incremented during the CS release
397 * flow by calling 'hl_hw_queue_update_ci'.
399 if (cs_needs_completion(cs) &&
400 (job->queue_type == QUEUE_TYPE_EXT || job->queue_type == QUEUE_TYPE_HW))
407 * hl_staged_cs_find_first - locate the first CS in this staged submission
409 * @hdev: pointer to device structure
410 * @cs_seq: staged submission sequence number
412 * @note: This function must be called under 'hdev->cs_mirror_lock'
414 * Find and return a CS pointer with the given sequence
416 struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq)
420 list_for_each_entry_reverse(cs, &hdev->cs_mirror_list, mirror_node)
421 if (cs->staged_cs && cs->staged_first &&
422 cs->sequence == cs_seq)
429 * is_staged_cs_last_exists - returns true if the last CS in sequence exists
431 * @hdev: pointer to device structure
432 * @cs: staged submission member
435 bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs)
437 struct hl_cs *last_entry;
439 last_entry = list_last_entry(&cs->staged_cs_node, struct hl_cs,
442 if (last_entry->staged_last)
449 * staged_cs_get - get CS reference if this CS is a part of a staged CS
451 * @hdev: pointer to device structure
453 * @cs_seq: staged submission sequence number
455 * Increment CS reference for every CS in this staged submission except for
456 * the CS which get completion.
458 static void staged_cs_get(struct hl_device *hdev, struct hl_cs *cs)
460 /* Only the last CS in this staged submission will get a completion.
461 * We must increment the reference for all other CS's in this
463 * Once we get a completion we will release the whole staged submission.
465 if (!cs->staged_last)
470 * staged_cs_put - put a CS in case it is part of staged submission
472 * @hdev: pointer to device structure
475 * This function decrements a CS reference (for a non completion CS)
477 static void staged_cs_put(struct hl_device *hdev, struct hl_cs *cs)
479 /* We release all CS's in a staged submission except the last
480 * CS which we have never incremented its reference.
482 if (!cs_needs_completion(cs))
486 static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs)
488 struct hl_cs *next = NULL, *iter, *first_cs;
490 if (!cs_needs_timeout(cs))
493 spin_lock(&hdev->cs_mirror_lock);
495 /* We need to handle tdr only once for the complete staged submission.
496 * Hence, we choose the CS that reaches this function first which is
497 * the CS marked as 'staged_last'.
498 * In case single staged cs was submitted which has both first and last
499 * indications, then "cs_find_first" below will return NULL, since we
500 * removed the cs node from the list before getting here,
	 * in such cases just continue with the cs to cancel its TDR work.
503 if (cs->staged_cs && cs->staged_last) {
504 first_cs = hl_staged_cs_find_first(hdev, cs->staged_sequence);
509 spin_unlock(&hdev->cs_mirror_lock);
511 /* Don't cancel TDR in case this CS was timedout because we might be
512 * running from the TDR context
514 if (cs->timedout || hdev->timeout_jiffies == MAX_SCHEDULE_TIMEOUT)
518 cancel_delayed_work_sync(&cs->work_tdr);
520 spin_lock(&hdev->cs_mirror_lock);
522 /* queue TDR for next CS */
523 list_for_each_entry(iter, &hdev->cs_mirror_list, mirror_node)
524 if (cs_needs_timeout(iter)) {
529 if (next && !next->tdr_active) {
530 next->tdr_active = true;
531 schedule_delayed_work(&next->work_tdr, next->timeout_jiffies);
534 spin_unlock(&hdev->cs_mirror_lock);
538 * force_complete_multi_cs - complete all contexts that wait on multi-CS
540 * @hdev: pointer to habanalabs device structure
542 static void force_complete_multi_cs(struct hl_device *hdev)
546 for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
547 struct multi_cs_completion *mcs_compl;
549 mcs_compl = &hdev->multi_cs_completion[i];
551 spin_lock(&mcs_compl->lock);
553 if (!mcs_compl->used) {
554 spin_unlock(&mcs_compl->lock);
		/* when calling force complete no context should be waiting on
		 * multi-CS.
		 * We are calling the function as a protection for such case
		 * to free any pending context and print error message
		 */
		dev_err(hdev->dev,
			"multi-CS completion context %d still waiting when calling force completion\n",
			i);
566 complete_all(&mcs_compl->completion);
567 spin_unlock(&mcs_compl->lock);
572 * complete_multi_cs - complete all waiting entities on multi-CS
574 * @hdev: pointer to habanalabs device structure
576 * The function signals a waiting entity that has an overlapping stream masters
577 * with the completed CS.
579 * - a completed CS worked on stream master QID 4, multi CS completion
580 * is actively waiting on stream master QIDs 3, 5. don't send signal as no
581 * common stream master QID
582 * - a completed CS worked on stream master QID 4, multi CS completion
583 * is actively waiting on stream master QIDs 3, 4. send signal as stream
584 * master QID 4 is common
586 static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
588 struct hl_fence *fence = cs->fence;
591 /* in case of multi CS check for completion only for the first CS */
592 if (cs->staged_cs && !cs->staged_first)
595 for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
596 struct multi_cs_completion *mcs_compl;
598 mcs_compl = &hdev->multi_cs_completion[i];
599 if (!mcs_compl->used)
602 spin_lock(&mcs_compl->lock);
606 * 1. still waiting for completion
607 * 2. the completed CS has at least one overlapping stream
608 * master with the stream masters in the completion
610 if (mcs_compl->used &&
611 (fence->stream_master_qid_map &
612 mcs_compl->stream_master_qid_map)) {
613 /* extract the timestamp only of first completed CS */
614 if (!mcs_compl->timestamp)
615 mcs_compl->timestamp = ktime_to_ns(fence->timestamp);
617 complete_all(&mcs_compl->completion);
			/*
			 * Setting mcs_handling_done inside the lock ensures
			 * that at least one fence has mcs_handling_done set to
			 * true before the wait for mcs finish. This ensures at
			 * least one CS will be set as completed when polling
			 * mcs fences.
			 */
626 fence->mcs_handling_done = true;
629 spin_unlock(&mcs_compl->lock);
631 /* In case CS completed without mcs completion initialized */
632 fence->mcs_handling_done = true;
635 static inline void cs_release_sob_reset_handler(struct hl_device *hdev,
637 struct hl_cs_compl *hl_cs_cmpl)
	/* Skip this handler if the cs wasn't submitted, to avoid putting
	 * the hw_sob twice, since this case is already handled at this point.
	 * Also skip if the hw_sob pointer wasn't set.
	 */
643 if (!hl_cs_cmpl->hw_sob || !cs->submitted)
646 spin_lock(&hl_cs_cmpl->lock);
	/*
	 * we get refcount upon reservation of signals or signal/wait cs for the
	 * hw_sob object, and need to put it when the first staged cs
	 * (which contains the encaps signals) or cs signal/wait is completed.
	 */
653 if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
654 (hl_cs_cmpl->type == CS_TYPE_WAIT) ||
655 (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT) ||
656 (!!hl_cs_cmpl->encaps_signals)) {
658 "CS 0x%llx type %d finished, sob_id: %d, sob_val: %u\n",
661 hl_cs_cmpl->hw_sob->sob_id,
662 hl_cs_cmpl->sob_val);
664 hw_sob_put(hl_cs_cmpl->hw_sob);
666 if (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)
667 hdev->asic_funcs->reset_sob_group(hdev,
668 hl_cs_cmpl->sob_group);
671 spin_unlock(&hl_cs_cmpl->lock);
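/*
 * cs_do_release() - final CS teardown, called when the CS refcount drops to
 * zero. It completes any remaining jobs, updates the queue CIs, removes the
 * CS from the mirror list, hands TDR over to the next CS, handles staged /
 * encapsulated-signals bookkeeping, records the outcome and signals the CS
 * fence and any multi-CS waiters before freeing the CS resources.
 */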
674 static void cs_do_release(struct kref *ref)
676 struct hl_cs *cs = container_of(ref, struct hl_cs, refcount);
677 struct hl_device *hdev = cs->ctx->hdev;
678 struct hl_cs_job *job, *tmp;
679 struct hl_cs_compl *hl_cs_cmpl =
680 container_of(cs->fence, struct hl_cs_compl, base_fence);
682 cs->completed = true;
	/*
	 * Even though reaching here means that all external jobs have
	 * finished (each one of them took a refcnt on the CS), we still
	 * need to go over the internal jobs and complete them. Otherwise, we
	 * will have leaked memory and, what's worse, the CS object (and
	 * potentially the CTX object) could be released, while the JOB
	 * still holds a pointer to them (but no reference).
	 */
692 list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
693 hl_complete_job(hdev, job);
695 if (!cs->submitted) {
697 * In case the wait for signal CS was submitted, the fence put
698 * occurs in init_signal_wait_cs() or collective_wait_init_cs()
699 * right before hanging on the PQ.
701 if (cs->type == CS_TYPE_WAIT ||
702 cs->type == CS_TYPE_COLLECTIVE_WAIT)
703 hl_fence_put(cs->signal_fence);
	/* Need to update CI for all queue jobs that do not get completion */
709 hl_hw_queue_update_ci(cs);
711 /* remove CS from CS mirror list */
712 spin_lock(&hdev->cs_mirror_lock);
713 list_del_init(&cs->mirror_node);
714 spin_unlock(&hdev->cs_mirror_lock);
716 cs_handle_tdr(hdev, cs);
		/* the completion CS decrements reference for the entire
		 * staged submission
		 */
722 if (cs->staged_last) {
723 struct hl_cs *staged_cs, *tmp_cs;
725 list_for_each_entry_safe(staged_cs, tmp_cs,
726 &cs->staged_cs_node, staged_cs_node)
727 staged_cs_put(hdev, staged_cs);
730 /* A staged CS will be a member in the list only after it
731 * was submitted. We used 'cs_mirror_lock' when inserting
			 * it to the list so we will use it again when removing it
			 * from the list.
			 */
735 spin_lock(&hdev->cs_mirror_lock);
736 list_del(&cs->staged_cs_node);
737 spin_unlock(&hdev->cs_mirror_lock);
740 /* decrement refcount to handle when first staged cs
741 * with encaps signals is completed.
743 if (hl_cs_cmpl->encaps_signals)
744 kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount,
745 hl_encaps_release_handle_and_put_ctx);
748 if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) && cs->encaps_signals)
749 kref_put(&cs->encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);
752 /* Must be called before hl_ctx_put because inside we use ctx to get
755 hl_debugfs_remove_cs(cs);
757 hdev->shadow_cs_queue[cs->sequence & (hdev->asic_prop.max_pending_cs - 1)] = NULL;
	/* We need to mark an error for a CS that was not submitted, because in
	 * that case the hl fence release flow is different. Mainly, we don't
	 * need to handle hw_sob for signal/wait.
	 */
	if (cs->timedout)
		cs->fence->error = -ETIMEDOUT;
765 else if (cs->aborted)
766 cs->fence->error = -EIO;
767 else if (!cs->submitted)
768 cs->fence->error = -EBUSY;
770 if (unlikely(cs->skip_reset_on_timeout)) {
772 "Command submission %llu completed after %llu (s)\n",
774 div_u64(jiffies - cs->submission_time_jiffies, HZ));
778 cs->fence->timestamp = ktime_get();
779 hl_push_cs_outcome(hdev, &cs->ctx->outcome_store, cs->sequence,
780 cs->fence->timestamp, cs->fence->error);
785 complete_all(&cs->fence->completion);
786 complete_multi_cs(hdev, cs);
788 cs_release_sob_reset_handler(hdev, cs, hl_cs_cmpl);
790 hl_fence_put(cs->fence);
792 kfree(cs->jobs_in_queue_cnt);
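/*
 * cs_timedout() - TDR work handler that runs when a CS does not complete
 * within its timeout. It marks the CS as timed out, records the first
 * timeout's parameters, notifies user space and may escalate to a device
 * reset, depending on the reset-on-lockup / skip-reset-on-timeout policy.
 */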
796 static void cs_timedout(struct work_struct *work)
798 struct hl_device *hdev;
799 u64 event_mask = 0x0;
801 struct hl_cs *cs = container_of(work, struct hl_cs,
803 bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false;
805 rc = cs_get_unless_zero(cs);
809 if ((!cs->submitted) || (cs->completed)) {
814 hdev = cs->ctx->hdev;
816 if (likely(!skip_reset_on_timeout)) {
817 if (hdev->reset_on_lockup)
820 hdev->reset_info.needs_reset = true;
		/* Mark that the CS timed out so we won't try to cancel its TDR */
826 /* Save only the first CS timeout parameters */
827 rc = atomic_cmpxchg(&hdev->captured_err_info.cs_timeout.write_enable, 1, 0);
829 hdev->captured_err_info.cs_timeout.timestamp = ktime_get();
830 hdev->captured_err_info.cs_timeout.seq = cs->sequence;
831 event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT;
837 "Signal command submission %llu has not finished in time!\n",
843 "Wait command submission %llu has not finished in time!\n",
847 case CS_TYPE_COLLECTIVE_WAIT:
849 "Collective Wait command submission %llu has not finished in time!\n",
855 "Command submission %llu has not finished in time!\n",
860 rc = hl_state_dump(hdev);
862 dev_err(hdev->dev, "Error during system state dump %d\n", rc);
867 event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
868 hl_device_cond_reset(hdev, HL_DRV_RESET_TDR, event_mask);
869 } else if (event_mask) {
870 hl_notifier_event_send_all(hdev, event_mask);
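/*
 * allocate_cs() - allocate and initialize a new CS object together with its
 * completion fence, assign the next CS sequence number and reserve a slot in
 * the context's cs_pending array. Fails when the oldest CS in the pending
 * window has not completed yet.
 */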
874 static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
875 enum hl_cs_type cs_type, u64 user_sequence,
876 struct hl_cs **cs_new, u32 flags, u32 timeout)
878 struct hl_cs_counters_atomic *cntr;
879 struct hl_fence *other = NULL;
880 struct hl_cs_compl *cs_cmpl;
884 cntr = &hdev->aggregated_cs_counters;
886 cs = kzalloc(sizeof(*cs), GFP_ATOMIC);
888 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
891 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
892 atomic64_inc(&cntr->out_of_mem_drop_cnt);
896 /* increment refcnt for context */
900 cs->submitted = false;
901 cs->completed = false;
903 cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP);
904 cs->encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);
905 cs->timeout_jiffies = timeout;
906 cs->skip_reset_on_timeout =
907 hdev->reset_info.skip_reset_on_timeout ||
908 !!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT);
909 cs->submission_time_jiffies = jiffies;
910 INIT_LIST_HEAD(&cs->job_list);
911 INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout);
912 kref_init(&cs->refcount);
913 spin_lock_init(&cs->job_lock);
915 cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_ATOMIC);
917 cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_KERNEL);
920 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
921 atomic64_inc(&cntr->out_of_mem_drop_cnt);
926 cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
927 sizeof(*cs->jobs_in_queue_cnt), GFP_ATOMIC);
928 if (!cs->jobs_in_queue_cnt)
929 cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
930 sizeof(*cs->jobs_in_queue_cnt), GFP_KERNEL);
932 if (!cs->jobs_in_queue_cnt) {
933 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
934 atomic64_inc(&cntr->out_of_mem_drop_cnt);
939 cs_cmpl->hdev = hdev;
940 cs_cmpl->type = cs->type;
941 spin_lock_init(&cs_cmpl->lock);
942 cs->fence = &cs_cmpl->base_fence;
944 spin_lock(&ctx->cs_lock);
946 cs_cmpl->cs_seq = ctx->cs_sequence;
947 other = ctx->cs_pending[cs_cmpl->cs_seq &
948 (hdev->asic_prop.max_pending_cs - 1)];
950 if (other && !completion_done(&other->completion)) {
951 /* If the following statement is true, it means we have reached
952 * a point in which only part of the staged submission was
953 * submitted and we don't have enough room in the 'cs_pending'
954 * array for the rest of the submission.
955 * This causes a deadlock because this CS will never be
956 * completed as it depends on future CS's for completion.
958 if (other->cs_sequence == user_sequence)
959 dev_crit_ratelimited(hdev->dev,
960 "Staged CS %llu deadlock due to lack of resources",
963 dev_dbg_ratelimited(hdev->dev,
964 "Rejecting CS because of too many in-flights CS\n");
965 atomic64_inc(&ctx->cs_counters.max_cs_in_flight_drop_cnt);
966 atomic64_inc(&cntr->max_cs_in_flight_drop_cnt);
972 hl_fence_init(&cs_cmpl->base_fence, cs_cmpl->cs_seq);
974 cs->sequence = cs_cmpl->cs_seq;
976 ctx->cs_pending[cs_cmpl->cs_seq &
977 (hdev->asic_prop.max_pending_cs - 1)] =
978 &cs_cmpl->base_fence;
981 hl_fence_get(&cs_cmpl->base_fence);
985 spin_unlock(&ctx->cs_lock);
992 spin_unlock(&ctx->cs_lock);
993 kfree(cs->jobs_in_queue_cnt);
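/*
 * cs_rollback() - undo a CS that failed before being submitted: drop the
 * staged-submission reference (if any) and complete all jobs that were
 * already attached to it.
 */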
1002 static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
1004 struct hl_cs_job *job, *tmp;
1006 staged_cs_put(hdev, cs);
1008 list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
1009 hl_complete_job(hdev, job);
1013 * release_reserved_encaps_signals() - release reserved encapsulated signals.
1014 * @hdev: pointer to habanalabs device structure
1016 * Release reserved encapsulated signals which weren't un-reserved, or for which a CS with
1017 * encapsulated signals wasn't submitted and thus weren't released as part of CS roll-back.
1018 * For these signals need also to put the refcount of the H/W SOB which was taken at the
1021 static void release_reserved_encaps_signals(struct hl_device *hdev)
1023 struct hl_ctx *ctx = hl_get_compute_ctx(hdev);
1024 struct hl_cs_encaps_sig_handle *handle;
1025 struct hl_encaps_signals_mgr *mgr;
1031 mgr = &ctx->sig_mgr;
1033 idr_for_each_entry(&mgr->handles, handle, id)
1034 if (handle->cs_seq == ULLONG_MAX)
1035 kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob_ctx);
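/*
 * hl_cs_rollback_all() - abort every CS that is still on the CS mirror list,
 * e.g. during device reset. Optionally flushes the completion work queues
 * first, then rolls back and releases each in-flight CS, force-completes any
 * multi-CS waiters and releases reserved encapsulated signals.
 */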
1040 void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
1043 struct hl_cs *cs, *tmp;
1045 if (!skip_wq_flush) {
1046 flush_workqueue(hdev->ts_free_obj_wq);
1048 /* flush all completions before iterating over the CS mirror list in
1049 * order to avoid a race with the release functions
1051 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
1052 flush_workqueue(hdev->cq_wq[i]);
1054 flush_workqueue(hdev->cs_cmplt_wq);
1057 /* Make sure we don't have leftovers in the CS mirror list */
1058 list_for_each_entry_safe(cs, tmp, &hdev->cs_mirror_list, mirror_node) {
1061 dev_warn_ratelimited(hdev->dev, "Killing CS %d.%llu\n",
1062 cs->ctx->asid, cs->sequence);
1063 cs_rollback(hdev, cs);
1067 force_complete_multi_cs(hdev);
1069 release_reserved_encaps_signals(hdev);
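/*
 * wake_pending_user_interrupt_threads() - wake every thread that is pending
 * on the given user interrupt: timestamp registration nodes are unlinked and
 * their buffers released, while fence waiters are completed with -EIO.
 */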
1073 wake_pending_user_interrupt_threads(struct hl_user_interrupt *interrupt)
1075 struct hl_user_pending_interrupt *pend, *temp;
1076 unsigned long flags;
1078 spin_lock_irqsave(&interrupt->wait_list_lock, flags);
1079 list_for_each_entry_safe(pend, temp, &interrupt->wait_list_head, wait_list_node) {
1080 if (pend->ts_reg_info.buf) {
1081 list_del(&pend->wait_list_node);
1082 hl_mmap_mem_buf_put(pend->ts_reg_info.buf);
1083 hl_cb_put(pend->ts_reg_info.cq_cb);
1085 pend->fence.error = -EIO;
1086 complete_all(&pend->fence.completion);
1089 spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
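/*
 * hl_release_pending_user_interrupts() - wake all threads that wait on any of
 * the user interrupts (per-interrupt, common user CQ and common decoder), so
 * no thread remains blocked on an interrupt completion.
 */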
1092 void hl_release_pending_user_interrupts(struct hl_device *hdev)
1094 struct asic_fixed_properties *prop = &hdev->asic_prop;
1095 struct hl_user_interrupt *interrupt;
1098 if (!prop->user_interrupt_count)
	/* We iterate through the user interrupt requests and wake up all
	 * user threads waiting for interrupt completion. We iterate the
	 * list under a lock, and this is why all user threads, once awake,
	 * will wait on the same lock and will release the waiting object upon
	 * unlock.
	 */
1108 for (i = 0 ; i < prop->user_interrupt_count ; i++) {
1109 interrupt = &hdev->user_interrupt[i];
1110 wake_pending_user_interrupt_threads(interrupt);
1113 interrupt = &hdev->common_user_cq_interrupt;
1114 wake_pending_user_interrupt_threads(interrupt);
1116 interrupt = &hdev->common_decoder_interrupt;
1117 wake_pending_user_interrupt_threads(interrupt);
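/*
 * force_complete_cs() - mark every CS on the CS mirror list as failed with
 * -EIO and wake anyone waiting on its completion fence.
 */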
1120 static void force_complete_cs(struct hl_device *hdev)
1124 spin_lock(&hdev->cs_mirror_lock);
1126 list_for_each_entry(cs, &hdev->cs_mirror_list, mirror_node) {
1127 cs->fence->error = -EIO;
1128 complete_all(&cs->fence->completion);
1131 spin_unlock(&hdev->cs_mirror_lock);
1134 void hl_abort_waitings_for_completion(struct hl_device *hdev)
1136 force_complete_cs(hdev);
1137 force_complete_multi_cs(hdev);
1138 hl_release_pending_user_interrupts(hdev);
1141 static void job_wq_completion(struct work_struct *work)
1143 struct hl_cs_job *job = container_of(work, struct hl_cs_job,
1145 struct hl_cs *cs = job->cs;
1146 struct hl_device *hdev = cs->ctx->hdev;
1148 /* job is no longer needed */
1149 hl_complete_job(hdev, job);
1152 static void cs_completion(struct work_struct *work)
1154 struct hl_cs *cs = container_of(work, struct hl_cs, finish_work);
1155 struct hl_device *hdev = cs->ctx->hdev;
1156 struct hl_cs_job *job, *tmp;
1158 list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
1159 hl_complete_job(hdev, job);
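/*
 * validate_queue_index() - sanity-check the queue index of a CS chunk and
 * derive the queue type and whether the job's CB is kernel-allocated, based
 * on the queue properties and the chunk flags.
 */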
1162 static int validate_queue_index(struct hl_device *hdev,
1163 struct hl_cs_chunk *chunk,
1164 enum hl_queue_type *queue_type,
1165 bool *is_kernel_allocated_cb)
1167 struct asic_fixed_properties *asic = &hdev->asic_prop;
1168 struct hw_queue_properties *hw_queue_prop;
1170 /* This must be checked here to prevent out-of-bounds access to
1171 * hw_queues_props array
1173 if (chunk->queue_index >= asic->max_queues) {
1174 dev_err(hdev->dev, "Queue index %d is invalid\n",
1175 chunk->queue_index);
1179 hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];
1181 if (hw_queue_prop->type == QUEUE_TYPE_NA) {
1182 dev_err(hdev->dev, "Queue index %d is not applicable\n",
1183 chunk->queue_index);
1187 if (hw_queue_prop->binned) {
1188 dev_err(hdev->dev, "Queue index %d is binned out\n",
1189 chunk->queue_index);
1193 if (hw_queue_prop->driver_only) {
1195 "Queue index %d is restricted for the kernel driver\n",
1196 chunk->queue_index);
1200 /* When hw queue type isn't QUEUE_TYPE_HW,
	 * USER_ALLOC_CB flag shall be treated as "don't care".
	 */
1203 if (hw_queue_prop->type == QUEUE_TYPE_HW) {
1204 if (chunk->cs_chunk_flags & HL_CS_CHUNK_FLAGS_USER_ALLOC_CB) {
1205 if (!(hw_queue_prop->cb_alloc_flags & CB_ALLOC_USER)) {
1207 "Queue index %d doesn't support user CB\n",
1208 chunk->queue_index);
1212 *is_kernel_allocated_cb = false;
1214 if (!(hw_queue_prop->cb_alloc_flags &
1217 "Queue index %d doesn't support kernel CB\n",
1218 chunk->queue_index);
1222 *is_kernel_allocated_cb = true;
1225 *is_kernel_allocated_cb = !!(hw_queue_prop->cb_alloc_flags
1229 *queue_type = hw_queue_prop->type;
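/*
 * get_cb_from_cs_chunk() - look up the command buffer referenced by a chunk,
 * validate its size and take a CS reference on it.
 */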
1233 static struct hl_cb *get_cb_from_cs_chunk(struct hl_device *hdev,
1234 struct hl_mem_mgr *mmg,
1235 struct hl_cs_chunk *chunk)
1239 cb = hl_cb_get(mmg, chunk->cb_handle);
1241 dev_err(hdev->dev, "CB handle 0x%llx invalid\n", chunk->cb_handle);
1245 if ((chunk->cb_size < 8) || (chunk->cb_size > cb->size)) {
1246 dev_err(hdev->dev, "CB size %u invalid\n", chunk->cb_size);
1250 atomic_inc(&cb->cs_cnt);
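/*
 * hl_cs_allocate_job() - allocate and initialize a job object for the given
 * queue type. Jobs whose CB gets patched get a userptr list, and jobs on
 * external queues get a completion work item.
 */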
1259 struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
1260 enum hl_queue_type queue_type, bool is_kernel_allocated_cb)
1262 struct hl_cs_job *job;
1264 job = kzalloc(sizeof(*job), GFP_ATOMIC);
1266 job = kzalloc(sizeof(*job), GFP_KERNEL);
1271 kref_init(&job->refcount);
1272 job->queue_type = queue_type;
1273 job->is_kernel_allocated_cb = is_kernel_allocated_cb;
1275 if (is_cb_patched(hdev, job))
1276 INIT_LIST_HEAD(&job->userptr_list);
1278 if (job->queue_type == QUEUE_TYPE_EXT)
1279 INIT_WORK(&job->finish_work, job_wq_completion);
1284 static enum hl_cs_type hl_cs_get_cs_type(u32 cs_type_flags)
1286 if (cs_type_flags & HL_CS_FLAGS_SIGNAL)
1287 return CS_TYPE_SIGNAL;
1288 else if (cs_type_flags & HL_CS_FLAGS_WAIT)
1289 return CS_TYPE_WAIT;
1290 else if (cs_type_flags & HL_CS_FLAGS_COLLECTIVE_WAIT)
1291 return CS_TYPE_COLLECTIVE_WAIT;
1292 else if (cs_type_flags & HL_CS_FLAGS_RESERVE_SIGNALS_ONLY)
1293 return CS_RESERVE_SIGNALS;
1294 else if (cs_type_flags & HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY)
1295 return CS_UNRESERVE_SIGNALS;
1296 else if (cs_type_flags & HL_CS_FLAGS_ENGINE_CORE_COMMAND)
1297 return CS_TYPE_ENGINE_CORE;
1299 return CS_TYPE_DEFAULT;
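/*
 * hl_cs_sanity_checks() - validate the CS ioctl arguments before any
 * allocation: the device must be operational, the CS type flags must be
 * mutually exclusive and supported, and the number of chunks must match the
 * CS type.
 */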
1302 static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args)
1304 struct hl_device *hdev = hpriv->hdev;
1305 struct hl_ctx *ctx = hpriv->ctx;
1306 u32 cs_type_flags, num_chunks;
1307 enum hl_device_status status;
1308 enum hl_cs_type cs_type;
1309 bool is_sync_stream;
1311 if (!hl_device_operational(hdev, &status)) {
1315 if ((args->in.cs_flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
1316 !hdev->supports_staged_submission) {
1317 dev_err(hdev->dev, "staged submission not supported");
1321 cs_type_flags = args->in.cs_flags & HL_CS_FLAGS_TYPE_MASK;
1323 if (unlikely(cs_type_flags && !is_power_of_2(cs_type_flags))) {
1325 "CS type flags are mutually exclusive, context %d\n",
1330 cs_type = hl_cs_get_cs_type(cs_type_flags);
1331 num_chunks = args->in.num_chunks_execute;
1333 is_sync_stream = (cs_type == CS_TYPE_SIGNAL || cs_type == CS_TYPE_WAIT ||
1334 cs_type == CS_TYPE_COLLECTIVE_WAIT);
1336 if (unlikely(is_sync_stream && !hdev->supports_sync_stream)) {
1337 dev_err(hdev->dev, "Sync stream CS is not supported\n");
1341 if (cs_type == CS_TYPE_DEFAULT) {
1343 dev_err(hdev->dev, "Got execute CS with 0 chunks, context %d\n", ctx->asid);
1346 } else if (is_sync_stream && num_chunks != 1) {
1348 "Sync stream CS mandates one chunk only, context %d\n",
1356 static int hl_cs_copy_chunk_array(struct hl_device *hdev,
1357 struct hl_cs_chunk **cs_chunk_array,
1358 void __user *chunks, u32 num_chunks,
1363 if (num_chunks > HL_MAX_JOBS_PER_CS) {
1364 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1365 atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
1367 "Number of chunks can NOT be larger than %d\n",
1368 HL_MAX_JOBS_PER_CS);
1372 *cs_chunk_array = kmalloc_array(num_chunks, sizeof(**cs_chunk_array),
1374 if (!*cs_chunk_array)
1375 *cs_chunk_array = kmalloc_array(num_chunks,
1376 sizeof(**cs_chunk_array), GFP_KERNEL);
1377 if (!*cs_chunk_array) {
1378 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1379 atomic64_inc(&hdev->aggregated_cs_counters.out_of_mem_drop_cnt);
1383 size_to_copy = num_chunks * sizeof(struct hl_cs_chunk);
1384 if (copy_from_user(*cs_chunk_array, chunks, size_to_copy)) {
1385 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1386 atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
1387 dev_err(hdev->dev, "Failed to copy cs chunk array from user\n");
1388 kfree(*cs_chunk_array);
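/*
 * cs_staged_submission() - initialize the staged-submission fields of a CS:
 * mark first/last members, record the staged sequence and take an extra CS
 * reference for every member that does not get a completion.
 */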
1395 static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs,
1396 u64 sequence, u32 flags,
1397 u32 encaps_signal_handle)
1399 if (!(flags & HL_CS_FLAGS_STAGED_SUBMISSION))
1402 cs->staged_last = !!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_LAST);
1403 cs->staged_first = !!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST);
1405 if (cs->staged_first) {
1406 /* Staged CS sequence is the first CS sequence */
1407 INIT_LIST_HEAD(&cs->staged_cs_node);
1408 cs->staged_sequence = cs->sequence;
1410 if (cs->encaps_signals)
1411 cs->encaps_sig_hdl_id = encaps_signal_handle;
1413 /* User sequence will be validated in 'hl_hw_queue_schedule_cs'
1414 * under the cs_mirror_lock
1416 cs->staged_sequence = sequence;
1419 /* Increment CS reference if needed */
1420 staged_cs_get(hdev, cs);
1422 cs->staged_cs = true;
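/*
 * get_stream_master_qid_mask() - return the bit of the given queue in the
 * stream-master bitmap used for multi-CS wait, or 0 if the queue is not one
 * of the stream master queues.
 */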
1427 static u32 get_stream_master_qid_mask(struct hl_device *hdev, u32 qid)
1431 for (i = 0; i < hdev->stream_master_qid_arr_size; i++)
1432 if (qid == hdev->stream_master_qid_arr[i])
1438 static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
1439 u32 num_chunks, u64 *cs_seq, u32 flags,
1440 u32 encaps_signals_handle, u32 timeout,
1441 u16 *signal_initial_sob_count)
1443 bool staged_mid, int_queues_only = true, using_hw_queues = false;
1444 struct hl_device *hdev = hpriv->hdev;
1445 struct hl_cs_chunk *cs_chunk_array;
1446 struct hl_cs_counters_atomic *cntr;
1447 struct hl_ctx *ctx = hpriv->ctx;
1448 struct hl_cs_job *job;
1452 u8 stream_master_qid_map = 0;
1455 cntr = &hdev->aggregated_cs_counters;
1456 user_sequence = *cs_seq;
1457 *cs_seq = ULLONG_MAX;
1459 rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks,
1464 if ((flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
1465 !(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST))
1470 rc = allocate_cs(hdev, hpriv->ctx, CS_TYPE_DEFAULT,
1471 staged_mid ? user_sequence : ULLONG_MAX, &cs, flags,
1474 goto free_cs_chunk_array;
1476 *cs_seq = cs->sequence;
1478 hl_debugfs_add_cs(cs);
1480 rc = cs_staged_submission(hdev, cs, user_sequence, flags,
1481 encaps_signals_handle);
1483 goto free_cs_object;
	/* If this is a staged submission we must return the staged sequence
	 * rather than the internal CS sequence
	 */
	if (cs->staged_cs)
		*cs_seq = cs->staged_sequence;
1491 /* Validate ALL the CS chunks before submitting the CS */
1492 for (i = 0 ; i < num_chunks ; i++) {
1493 struct hl_cs_chunk *chunk = &cs_chunk_array[i];
1494 enum hl_queue_type queue_type;
1495 bool is_kernel_allocated_cb;
1497 rc = validate_queue_index(hdev, chunk, &queue_type,
1498 &is_kernel_allocated_cb);
1500 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1501 atomic64_inc(&cntr->validation_drop_cnt);
1502 goto free_cs_object;
1505 if (is_kernel_allocated_cb) {
1506 cb = get_cb_from_cs_chunk(hdev, &hpriv->mem_mgr, chunk);
1509 &ctx->cs_counters.validation_drop_cnt);
1510 atomic64_inc(&cntr->validation_drop_cnt);
1512 goto free_cs_object;
1515 cb = (struct hl_cb *) (uintptr_t) chunk->cb_handle;
1518 if (queue_type == QUEUE_TYPE_EXT ||
1519 queue_type == QUEUE_TYPE_HW) {
1520 int_queues_only = false;
			/*
			 * store which streams are being used for external/HW
			 * queues of this CS
			 */
1526 if (hdev->supports_wait_for_multi_cs)
1527 stream_master_qid_map |=
1528 get_stream_master_qid_mask(hdev,
1529 chunk->queue_index);
1532 if (queue_type == QUEUE_TYPE_HW)
1533 using_hw_queues = true;
1535 job = hl_cs_allocate_job(hdev, queue_type,
1536 is_kernel_allocated_cb);
1538 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1539 atomic64_inc(&cntr->out_of_mem_drop_cnt);
1540 dev_err(hdev->dev, "Failed to allocate a new job\n");
1542 if (is_kernel_allocated_cb)
1545 goto free_cs_object;
1551 job->user_cb_size = chunk->cb_size;
1552 job->hw_queue_id = chunk->queue_index;
1554 cs->jobs_in_queue_cnt[job->hw_queue_id]++;
1557 list_add_tail(&job->cs_node, &cs->job_list);
		/*
		 * Increment CS reference. When CS reference is 0, CS is
		 * done and can be signaled to the user and all its resources
		 * freed. Only increment for a JOB on external or H/W queues,
		 * because only for those JOBs do we get a completion.
		 */
1565 if (cs_needs_completion(cs) &&
1566 (job->queue_type == QUEUE_TYPE_EXT ||
1567 job->queue_type == QUEUE_TYPE_HW))
1570 hl_debugfs_add_job(hdev, job);
1572 rc = cs_parser(hpriv, job);
1574 atomic64_inc(&ctx->cs_counters.parsing_drop_cnt);
1575 atomic64_inc(&cntr->parsing_drop_cnt);
1577 "Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n",
1578 cs->ctx->asid, cs->sequence, job->id, rc);
1579 goto free_cs_object;
1583 /* We allow a CS with any queue type combination as long as it does
1584 * not get a completion
1586 if (int_queues_only && cs_needs_completion(cs)) {
1587 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1588 atomic64_inc(&cntr->validation_drop_cnt);
1590 "Reject CS %d.%llu since it contains only internal queues jobs and needs completion\n",
1591 cs->ctx->asid, cs->sequence);
1593 goto free_cs_object;
1596 if (using_hw_queues)
1597 INIT_WORK(&cs->finish_work, cs_completion);
1600 * store the (external/HW queues) streams used by the CS in the
1601 * fence object for multi-CS completion
1603 if (hdev->supports_wait_for_multi_cs)
1604 cs->fence->stream_master_qid_map = stream_master_qid_map;
1606 rc = hl_hw_queue_schedule_cs(cs);
1610 "Failed to submit CS %d.%llu to H/W queues, error %d\n",
1611 cs->ctx->asid, cs->sequence, rc);
1612 goto free_cs_object;
1615 *signal_initial_sob_count = cs->initial_sob_count;
1617 rc = HL_CS_STATUS_SUCCESS;
1621 atomic_dec(&cb->cs_cnt);
1624 cs_rollback(hdev, cs);
1625 *cs_seq = ULLONG_MAX;
1626 /* The path below is both for good and erroneous exits */
1628 /* We finished with the CS in this function, so put the ref */
1630 free_cs_chunk_array:
1631 kfree(cs_chunk_array);
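/*
 * hl_cs_ctx_switch() - run the context-switch / restore phase the first time
 * a context submits work (or when FORCE_RESTORE is set): switch the device to
 * the context, submit the user's restore CS and wait for it to complete
 * before allowing the regular execution phase.
 */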
1636 static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
1639 struct hl_device *hdev = hpriv->hdev;
1640 struct hl_ctx *ctx = hpriv->ctx;
1641 bool need_soft_reset = false;
1642 int rc = 0, do_ctx_switch = 0;
1643 void __user *chunks;
1644 u32 num_chunks, tmp;
1648 if (hdev->supports_ctx_switch)
1649 do_ctx_switch = atomic_cmpxchg(&ctx->thread_ctx_switch_token, 1, 0);
1651 if (do_ctx_switch || (args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) {
1652 mutex_lock(&hpriv->restore_phase_mutex);
1654 if (do_ctx_switch) {
1655 rc = hdev->asic_funcs->context_switch(hdev, ctx->asid);
1657 dev_err_ratelimited(hdev->dev,
1658 "Failed to switch to context %d, rejecting CS! %d\n",
				/*
				 * If we timed out, or if the device is not IDLE
				 * while we want to do context-switch (-EBUSY),
				 * we need to soft-reset because QMAN is
				 * probably stuck. However, we can't call the
				 * reset here directly because of a deadlock, so
				 * we need to do it at the very end of this
				 * function.
				 */
1669 if ((rc == -ETIMEDOUT) || (rc == -EBUSY))
1670 need_soft_reset = true;
1671 mutex_unlock(&hpriv->restore_phase_mutex);
1676 hdev->asic_funcs->restore_phase_topology(hdev);
1678 chunks = (void __user *) (uintptr_t) args->in.chunks_restore;
1679 num_chunks = args->in.num_chunks_restore;
1683 "Need to run restore phase but restore CS is empty\n");
1686 rc = cs_ioctl_default(hpriv, chunks, num_chunks,
1687 cs_seq, 0, 0, hdev->timeout_jiffies, &sob_count);
1690 mutex_unlock(&hpriv->restore_phase_mutex);
1694 "Failed to submit restore CS for context %d (%d)\n",
1699 /* Need to wait for restore completion before execution phase */
1701 enum hl_cs_wait_status status;
1703 ret = _hl_cs_wait_ioctl(hdev, ctx,
1704 jiffies_to_usecs(hdev->timeout_jiffies),
1705 *cs_seq, &status, NULL);
1707 if (ret == -ERESTARTSYS) {
1708 usleep_range(100, 200);
1713 "Restore CS for context %d failed to complete %d\n",
1720 if (hdev->supports_ctx_switch)
1721 ctx->thread_ctx_switch_wait_token = 1;
1723 } else if (hdev->supports_ctx_switch && !ctx->thread_ctx_switch_wait_token) {
1724 rc = hl_poll_timeout_memory(hdev,
1725 &ctx->thread_ctx_switch_wait_token, tmp, (tmp == 1),
1726 100, jiffies_to_usecs(hdev->timeout_jiffies), false);
1728 if (rc == -ETIMEDOUT) {
1730 "context switch phase timeout (%d)\n", tmp);
1736 if ((rc == -ETIMEDOUT || rc == -EBUSY) && (need_soft_reset))
1737 hl_device_reset(hdev, 0);
/**
 * hl_cs_signal_sob_wraparound_handler: handle the SOB value wraparound case.
 * If the SOB value reaches the max value, move to the other SOB reserved
 * to the queue.
 *
 * @hdev: pointer to device structure
 * @q_idx: stream queue index
 * @hw_sob: the H/W SOB used in this signal CS.
 * @count: signals count
 * @encaps_sig: tells whether it's reservation for encaps signals or not.
 *
 * Note that this function must be called while hw_queues_lock is taken.
 */
1754 int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
1755 struct hl_hw_sob **hw_sob, u32 count, bool encaps_sig)
1758 struct hl_sync_stream_properties *prop;
1759 struct hl_hw_sob *sob = *hw_sob, *other_sob;
1760 u8 other_sob_offset;
1762 prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
1766 /* check for wraparound */
1767 if (prop->next_sob_val + count >= HL_MAX_SOB_VAL) {
		/*
		 * Decrement as we reached the max value.
		 * The release function won't be called here as we've
		 * just incremented the refcount right before calling this
		 * function.
		 */
		hw_sob_put_err(sob);

		/*
		 * check the other sob value, if it is still in use then fail,
		 * otherwise make the switch
		 */
1780 other_sob_offset = (prop->curr_sob_offset + 1) % HL_RSVD_SOBS;
1781 other_sob = &prop->hw_sob[other_sob_offset];
1783 if (kref_read(&other_sob->kref) != 1) {
1784 dev_err(hdev->dev, "error: Cannot switch SOBs q_idx: %d\n",
		/*
		 * next_sob_val always points to the next available signal
		 * in the sob, so in encaps signals it will be the next one
		 * after reserving the required amount.
		 */
		if (encaps_sig)
			prop->next_sob_val = count + 1;
		else
			prop->next_sob_val = count;
1799 /* only two SOBs are currently in use */
1800 prop->curr_sob_offset = other_sob_offset;
1801 *hw_sob = other_sob;
		/*
		 * Check if other_sob needs reset, then do it before using it
		 * for the reservation or the next signal cs.
		 * We do it here, for both encaps and regular signal cs cases,
		 * in order to avoid possible races of two kref_put of the sob
		 * which can occur at the same time if we move the sob reset
		 * (kref_put) to the cs_do_release function.
		 * In addition, if we have a combination of cs signal and
		 * encaps, and at the point we need to reset the sob there were
		 * no more reservations and only signal cs keep coming, in such
		 * case we need signal_cs to put the refcount and reset the sob.
		 */
1816 if (other_sob->need_reset)
1817 hw_sob_put(other_sob);
1820 /* set reset indication for the sob */
1821 sob->need_reset = true;
1822 hw_sob_get(other_sob);
1825 dev_dbg(hdev->dev, "switched to SOB %d, q_idx: %d\n",
1826 prop->curr_sob_offset, q_idx);
1828 prop->next_sob_val += count;
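/*
 * cs_ioctl_extract_signal_seq() - extract the sequence number of the signal
 * CS that a wait CS depends on: taken directly from the chunk for
 * encapsulated signals, otherwise copied from the user's signal_seq array
 * (currently limited to a single entry).
 */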
1834 static int cs_ioctl_extract_signal_seq(struct hl_device *hdev,
1835 struct hl_cs_chunk *chunk, u64 *signal_seq, struct hl_ctx *ctx,
1836 bool encaps_signals)
1838 u64 *signal_seq_arr = NULL;
1839 u32 size_to_copy, signal_seq_arr_len;
1842 if (encaps_signals) {
1843 *signal_seq = chunk->encaps_signal_seq;
1847 signal_seq_arr_len = chunk->num_signal_seq_arr;
1849 /* currently only one signal seq is supported */
1850 if (signal_seq_arr_len != 1) {
1851 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1852 atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
1854 "Wait for signal CS supports only one signal CS seq\n");
1858 signal_seq_arr = kmalloc_array(signal_seq_arr_len,
1859 sizeof(*signal_seq_arr),
1861 if (!signal_seq_arr)
1862 signal_seq_arr = kmalloc_array(signal_seq_arr_len,
1863 sizeof(*signal_seq_arr),
1865 if (!signal_seq_arr) {
1866 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1867 atomic64_inc(&hdev->aggregated_cs_counters.out_of_mem_drop_cnt);
1871 size_to_copy = signal_seq_arr_len * sizeof(*signal_seq_arr);
1872 if (copy_from_user(signal_seq_arr,
1873 u64_to_user_ptr(chunk->signal_seq_arr),
1875 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1876 atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
1878 "Failed to copy signal seq array from user\n");
1883 /* currently it is guaranteed to have only one signal seq */
1884 *signal_seq = signal_seq_arr[0];
1887 kfree(signal_seq_arr);
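/*
 * cs_ioctl_signal_wait_create_jobs() - allocate the single kernel-CB job that
 * implements a signal or wait CS on the given queue and attach it to the CS.
 */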
1892 static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
1893 struct hl_ctx *ctx, struct hl_cs *cs,
1894 enum hl_queue_type q_type, u32 q_idx, u32 encaps_signal_offset)
1896 struct hl_cs_counters_atomic *cntr;
1897 struct hl_cs_job *job;
1901 cntr = &hdev->aggregated_cs_counters;
1903 job = hl_cs_allocate_job(hdev, q_type, true);
1905 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1906 atomic64_inc(&cntr->out_of_mem_drop_cnt);
1907 dev_err(hdev->dev, "Failed to allocate a new job\n");
1911 if (cs->type == CS_TYPE_WAIT)
1912 cb_size = hdev->asic_funcs->get_wait_cb_size(hdev);
1914 cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);
1916 cb = hl_cb_kernel_create(hdev, cb_size,
1917 q_type == QUEUE_TYPE_HW && hdev->mmu_enable);
1919 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1920 atomic64_inc(&cntr->out_of_mem_drop_cnt);
1928 atomic_inc(&job->user_cb->cs_cnt);
1929 job->user_cb_size = cb_size;
1930 job->hw_queue_id = q_idx;
1932 if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT)
1933 && cs->encaps_signals)
1934 job->encaps_sig_wait_offset = encaps_signal_offset;
	/*
	 * No need for parsing; the user CB is the patched CB.
	 * We call hl_cb_destroy() for two reasons: we don't need the CB in
	 * the CB idr anymore, and to decrement its refcount as it was
	 * incremented inside hl_cb_kernel_create().
	 */
1941 job->patched_cb = job->user_cb;
1942 job->job_cb_size = job->user_cb_size;
1943 hl_cb_destroy(&hdev->kernel_mem_mgr, cb->buf->handle);
1945 /* increment refcount as for external queues we get completion */
1948 cs->jobs_in_queue_cnt[job->hw_queue_id]++;
1951 list_add_tail(&job->cs_node, &cs->job_list);
1953 hl_debugfs_add_job(hdev, job);
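/*
 * cs_ioctl_reserve_signals() - reserve 'count' signals on the sync stream of
 * the given queue by advancing its SOB value, and return an IDR handle that
 * user space later uses to submit or unreserve the encapsulated signals.
 */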
1958 static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
1959 u32 q_idx, u32 count,
1960 u32 *handle_id, u32 *sob_addr,
1963 struct hw_queue_properties *hw_queue_prop;
1964 struct hl_sync_stream_properties *prop;
1965 struct hl_device *hdev = hpriv->hdev;
1966 struct hl_cs_encaps_sig_handle *handle;
1967 struct hl_encaps_signals_mgr *mgr;
1968 struct hl_hw_sob *hw_sob;
1972 if (count >= HL_MAX_SOB_VAL) {
1973 dev_err(hdev->dev, "signals count(%u) exceeds the max SOB value\n",
1979 if (q_idx >= hdev->asic_prop.max_queues) {
1980 dev_err(hdev->dev, "Queue index %d is invalid\n",
1986 hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
1988 if (!hw_queue_prop->supports_sync_stream) {
1990 "Queue index %d does not support sync stream operations\n",
1996 prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
1998 handle = kzalloc(sizeof(*handle), GFP_KERNEL);
2004 handle->count = count;
2006 hl_ctx_get(hpriv->ctx);
2007 handle->ctx = hpriv->ctx;
2008 mgr = &hpriv->ctx->sig_mgr;
2010 spin_lock(&mgr->lock);
2011 hdl_id = idr_alloc(&mgr->handles, handle, 1, 0, GFP_ATOMIC);
2012 spin_unlock(&mgr->lock);
2015 dev_err(hdev->dev, "Failed to allocate IDR for a new signal reservation\n");
2020 handle->id = hdl_id;
2021 handle->q_idx = q_idx;
2022 handle->hdev = hdev;
2023 kref_init(&handle->refcount);
2025 hdev->asic_funcs->hw_queues_lock(hdev);
2027 hw_sob = &prop->hw_sob[prop->curr_sob_offset];
	/*
	 * Increment the SOB value by count per the user request
	 * to reserve those signals.
	 * Check that the amount of signals to reserve does not exceed the max
	 * sob value; if it does, then switch sob.
	 */
2035 rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, count,
2038 dev_err(hdev->dev, "Failed to switch SOB\n");
2039 hdev->asic_funcs->hw_queues_unlock(hdev);
	/* set the hw_sob to the handle after calling the sob wraparound handler,
	 * since the sob could have changed.
	 */
2046 handle->hw_sob = hw_sob;
2048 /* store the current sob value for unreserve validity check, and
2049 * signal offset support
2051 handle->pre_sob_val = prop->next_sob_val - handle->count;
2053 handle->cs_seq = ULLONG_MAX;
2055 *signals_count = prop->next_sob_val;
2056 hdev->asic_funcs->hw_queues_unlock(hdev);
2058 *sob_addr = handle->hw_sob->sob_addr;
2059 *handle_id = hdl_id;
2062 "Signals reserved, sob_id: %d, sob addr: 0x%x, last sob_val: %u, q_idx: %d, hdl_id: %d\n",
2063 hw_sob->sob_id, handle->hw_sob->sob_addr,
2064 prop->next_sob_val - 1, q_idx, hdl_id);
2068 spin_lock(&mgr->lock);
2069 idr_remove(&mgr->handles, hdl_id);
2070 spin_unlock(&mgr->lock);
2073 hl_ctx_put(handle->ctx);
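/*
 * cs_ioctl_unreserve_signals() - undo a previous signal reservation: verify
 * that the SOB value has not moved underneath the handle, roll back
 * next_sob_val and release the handle.
 */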
2080 static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id)
2082 struct hl_cs_encaps_sig_handle *encaps_sig_hdl;
2083 struct hl_sync_stream_properties *prop;
2084 struct hl_device *hdev = hpriv->hdev;
2085 struct hl_encaps_signals_mgr *mgr;
2086 struct hl_hw_sob *hw_sob;
2087 u32 q_idx, sob_addr;
2090 mgr = &hpriv->ctx->sig_mgr;
2092 spin_lock(&mgr->lock);
2093 encaps_sig_hdl = idr_find(&mgr->handles, handle_id);
2094 if (encaps_sig_hdl) {
2095 dev_dbg(hdev->dev, "unreserve signals, handle: %u, SOB:0x%x, count: %u\n",
2096 handle_id, encaps_sig_hdl->hw_sob->sob_addr,
2097 encaps_sig_hdl->count);
2099 hdev->asic_funcs->hw_queues_lock(hdev);
2101 q_idx = encaps_sig_hdl->q_idx;
2102 prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
2103 hw_sob = &prop->hw_sob[prop->curr_sob_offset];
2104 sob_addr = hdev->asic_funcs->get_sob_addr(hdev, hw_sob->sob_id);
2106 /* Check if sob_val got out of sync due to other
2107 * signal submission requests which were handled
2108 * between the reserve-unreserve calls or SOB switch
2109 * upon reaching SOB max value.
2111 if (encaps_sig_hdl->pre_sob_val + encaps_sig_hdl->count
2112 != prop->next_sob_val ||
2113 sob_addr != encaps_sig_hdl->hw_sob->sob_addr) {
2114 dev_err(hdev->dev, "Cannot unreserve signals, SOB val ran out of sync, expected: %u, actual val: %u\n",
2115 encaps_sig_hdl->pre_sob_val,
2116 (prop->next_sob_val - encaps_sig_hdl->count));
2118 hdev->asic_funcs->hw_queues_unlock(hdev);
2124 * Decrement the SOB value by count by user request
2125 * to unreserve those signals
2127 prop->next_sob_val -= encaps_sig_hdl->count;
2129 hdev->asic_funcs->hw_queues_unlock(hdev);
2133 /* Release the id and free allocated memory of the handle */
2134 idr_remove(&mgr->handles, handle_id);
2135 hl_ctx_put(encaps_sig_hdl->ctx);
2136 kfree(encaps_sig_hdl);
2139 dev_err(hdev->dev, "failed to unreserve signals, cannot find handler\n");
2142 spin_unlock(&mgr->lock);
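/*
 * cs_ioctl_signal_wait() - handle SIGNAL / WAIT / COLLECTIVE_WAIT command
 * submissions: validate the target queue, resolve the signal fence (plain or
 * encapsulated), allocate the CS, create the signal/wait jobs and schedule
 * the CS on the hardware queues.
 */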
2147 static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
2148 void __user *chunks, u32 num_chunks,
2149 u64 *cs_seq, u32 flags, u32 timeout,
2150 u32 *signal_sob_addr_offset, u16 *signal_initial_sob_count)
2152 struct hl_cs_encaps_sig_handle *encaps_sig_hdl = NULL;
2153 bool handle_found = false, is_wait_cs = false,
2154 wait_cs_submitted = false,
2155 cs_encaps_signals = false;
2156 struct hl_cs_chunk *cs_chunk_array, *chunk;
2157 bool staged_cs_with_encaps_signals = false;
2158 struct hw_queue_properties *hw_queue_prop;
2159 struct hl_device *hdev = hpriv->hdev;
2160 struct hl_cs_compl *sig_waitcs_cmpl;
2161 u32 q_idx, collective_engine_id = 0;
2162 struct hl_cs_counters_atomic *cntr;
2163 struct hl_fence *sig_fence = NULL;
2164 struct hl_ctx *ctx = hpriv->ctx;
2165 enum hl_queue_type q_type;
2170 cntr = &hdev->aggregated_cs_counters;
2171 *cs_seq = ULLONG_MAX;
2173 rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks,
2178 /* currently it is guaranteed to have only one chunk */
2179 chunk = &cs_chunk_array[0];
2181 if (chunk->queue_index >= hdev->asic_prop.max_queues) {
2182 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2183 atomic64_inc(&cntr->validation_drop_cnt);
2184 dev_err(hdev->dev, "Queue index %d is invalid\n",
2185 chunk->queue_index);
2187 goto free_cs_chunk_array;
2190 q_idx = chunk->queue_index;
2191 hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
2192 q_type = hw_queue_prop->type;
2194 if (!hw_queue_prop->supports_sync_stream) {
2195 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2196 atomic64_inc(&cntr->validation_drop_cnt);
2198 "Queue index %d does not support sync stream operations\n",
2201 goto free_cs_chunk_array;
2204 if (cs_type == CS_TYPE_COLLECTIVE_WAIT) {
2205 if (!(hw_queue_prop->collective_mode == HL_COLLECTIVE_MASTER)) {
2206 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2207 atomic64_inc(&cntr->validation_drop_cnt);
2209 "Queue index %d is invalid\n", q_idx);
2211 goto free_cs_chunk_array;
2214 if (!hdev->nic_ports_mask) {
2215 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2216 atomic64_inc(&cntr->validation_drop_cnt);
2218 "Collective operations not supported when NIC ports are disabled");
2220 goto free_cs_chunk_array;
2223 collective_engine_id = chunk->collective_engine_id;
2226 is_wait_cs = !!(cs_type == CS_TYPE_WAIT ||
2227 cs_type == CS_TYPE_COLLECTIVE_WAIT);
2229 cs_encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);
2232 rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq,
2233 ctx, cs_encaps_signals);
2235 goto free_cs_chunk_array;
2237 if (cs_encaps_signals) {
		/* check if cs sequence has encapsulated
		 * signals handle
		 */
2244 spin_lock(&ctx->sig_mgr.lock);
2245 idp = &ctx->sig_mgr.handles;
2246 idr_for_each_entry(idp, encaps_sig_hdl, id) {
2247 if (encaps_sig_hdl->cs_seq == signal_seq) {
2248 /* get refcount to protect removing this handle from idr,
2249 * needed when multiple wait cs are used with offset
2250 * to wait on reserved encaps signals.
2251 * Since kref_put of this handle is executed outside the
2252 * current lock, it is possible that the handle refcount
				 * is 0 but it has yet to be removed from the list. In this
				 * case we need to consider the handle as not valid.
				 */
2256 if (kref_get_unless_zero(&encaps_sig_hdl->refcount))
2257 handle_found = true;
2261 spin_unlock(&ctx->sig_mgr.lock);
2263 if (!handle_found) {
2264 /* treat as signal CS already finished */
2265 dev_dbg(hdev->dev, "Cannot find encapsulated signals handle for seq 0x%llx\n",
2268 goto free_cs_chunk_array;
2271 /* validate also the signal offset value */
2272 if (chunk->encaps_signal_offset >
2273 encaps_sig_hdl->count) {
2274 dev_err(hdev->dev, "offset(%u) value exceeds max reserved signals count(%u)!\n",
2275 chunk->encaps_signal_offset,
2276 encaps_sig_hdl->count);
2278 goto free_cs_chunk_array;
2282 sig_fence = hl_ctx_get_fence(ctx, signal_seq);
2283 if (IS_ERR(sig_fence)) {
2284 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2285 atomic64_inc(&cntr->validation_drop_cnt);
2287 "Failed to get signal CS with seq 0x%llx\n",
2289 rc = PTR_ERR(sig_fence);
2290 goto free_cs_chunk_array;
2294 /* signal CS already finished */
2296 goto free_cs_chunk_array;
2300 container_of(sig_fence, struct hl_cs_compl, base_fence);
2302 staged_cs_with_encaps_signals = !!
2303 (sig_waitcs_cmpl->type == CS_TYPE_DEFAULT &&
2304 (flags & HL_CS_FLAGS_ENCAP_SIGNALS));
2306 if (sig_waitcs_cmpl->type != CS_TYPE_SIGNAL &&
2307 !staged_cs_with_encaps_signals) {
2308 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2309 atomic64_inc(&cntr->validation_drop_cnt);
2311 "CS seq 0x%llx is not of a signal/encaps-signal CS\n",
2313 hl_fence_put(sig_fence);
2315 goto free_cs_chunk_array;
2318 if (completion_done(&sig_fence->completion)) {
2319 /* signal CS already finished */
2320 hl_fence_put(sig_fence);
2322 goto free_cs_chunk_array;
2326 rc = allocate_cs(hdev, ctx, cs_type, ULLONG_MAX, &cs, flags, timeout);
2329 hl_fence_put(sig_fence);
2331 goto free_cs_chunk_array;
2335 * Save the signal CS fence for later initialization right before
2336 * hanging the wait CS on the queue.
2337 * For the encaps signals case, we save the cs sequence and handle pointer
2338 * for later initialization.
2341 cs->signal_fence = sig_fence;
2342 /* store the handle pointer, so we don't have to
2343 * look for it again later in the flow,
2344 * when we need to set the SOB info in hw_queue.
2346 if (cs->encaps_signals)
2347 cs->encaps_sig_hdl = encaps_sig_hdl;
2350 hl_debugfs_add_cs(cs);
2352 *cs_seq = cs->sequence;
2354 if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_SIGNAL)
2355 rc = cs_ioctl_signal_wait_create_jobs(hdev, ctx, cs, q_type,
2356 q_idx, chunk->encaps_signal_offset);
2357 else if (cs_type == CS_TYPE_COLLECTIVE_WAIT)
2358 rc = hdev->asic_funcs->collective_wait_create_jobs(hdev, ctx,
2359 cs, q_idx, collective_engine_id,
2360 chunk->encaps_signal_offset);
2362 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2363 atomic64_inc(&cntr->validation_drop_cnt);
2368 goto free_cs_object;
2370 if (q_type == QUEUE_TYPE_HW)
2371 INIT_WORK(&cs->finish_work, cs_completion);
2373 rc = hl_hw_queue_schedule_cs(cs);
2375 /* In case the wait cs failed here, it means the signal cs
2376 * already completed. We want to free all of its related objects
2377 * but we don't want to fail the ioctl.
2381 else if (rc != -EAGAIN)
2383 "Failed to submit CS %d.%llu to H/W queues, error %d\n",
2384 ctx->asid, cs->sequence, rc);
2385 goto free_cs_object;
2388 *signal_sob_addr_offset = cs->sob_addr_offset;
2389 *signal_initial_sob_count = cs->initial_sob_count;
2391 rc = HL_CS_STATUS_SUCCESS;
2393 wait_cs_submitted = true;
2397 cs_rollback(hdev, cs);
2398 *cs_seq = ULLONG_MAX;
2399 /* The path below is both for good and erroneous exits */
2401 /* We finished with the CS in this function, so put the ref */
2403 free_cs_chunk_array:
2404 if (!wait_cs_submitted && cs_encaps_signals && handle_found && is_wait_cs)
2405 kref_put(&encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);
2406 kfree(cs_chunk_array);
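/*
 * Illustrative note (not part of the flow above): from the user's point of
 * view a signal/wait pair is built from two CS ioctl calls. The first call
 * carries HL_CS_FLAGS_SIGNAL and a single chunk naming a sync-stream capable
 * queue; its returned sequence is what the second call, carrying
 * HL_CS_FLAGS_WAIT, refers to when cs_ioctl_extract_signal_seq() runs above.
 * If the signal CS has already completed by the time the wait CS arrives,
 * the wait CS is dropped without failing the ioctl, as handled in this
 * function.
 */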
2411 static int cs_ioctl_engine_cores(struct hl_fpriv *hpriv, u64 engine_cores,
2412 u32 num_engine_cores, u32 core_command)
2415 struct hl_device *hdev = hpriv->hdev;
2416 void __user *engine_cores_arr;
2419 if (!num_engine_cores || num_engine_cores > hdev->asic_prop.num_engine_cores) {
2420 dev_err(hdev->dev, "Number of engine cores %d is invalid\n", num_engine_cores);
2424 if (core_command != HL_ENGINE_CORE_RUN && core_command != HL_ENGINE_CORE_HALT) {
2425 dev_err(hdev->dev, "Engine core command is invalid\n");
2429 engine_cores_arr = (void __user *) (uintptr_t) engine_cores;
2430 cores = kmalloc_array(num_engine_cores, sizeof(u32), GFP_KERNEL);
2434 if (copy_from_user(cores, engine_cores_arr, num_engine_cores * sizeof(u32))) {
2435 dev_err(hdev->dev, "Failed to copy core-ids array from user\n");
2440 rc = hdev->asic_funcs->set_engine_cores(hdev, cores, num_engine_cores, core_command);
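/*
 * Illustrative sketch (assumption, derived only from the checks above): the
 * user passes a pointer to an array of u32 core IDs and a run/halt command,
 * e.g.
 *
 *	u32 cores[2] = { 0, 1 };
 *	args.in.engine_cores = (u64) (uintptr_t) cores;
 *	args.in.num_engine_cores = 2;
 *	args.in.core_command = HL_ENGINE_CORE_HALT;
 */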
2446 int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
2448 union hl_cs_args *args = data;
2449 enum hl_cs_type cs_type = 0;
2450 u64 cs_seq = ULONG_MAX;
2451 void __user *chunks;
2452 u32 num_chunks, flags, timeout,
2453 signals_count = 0, sob_addr = 0, handle_id = 0;
2454 u16 sob_initial_count = 0;
2457 rc = hl_cs_sanity_checks(hpriv, args);
2461 rc = hl_cs_ctx_switch(hpriv, args, &cs_seq);
2465 cs_type = hl_cs_get_cs_type(args->in.cs_flags &
2466 ~HL_CS_FLAGS_FORCE_RESTORE);
2467 chunks = (void __user *) (uintptr_t) args->in.chunks_execute;
2468 num_chunks = args->in.num_chunks_execute;
2469 flags = args->in.cs_flags;
2471 /* In case this is a staged CS, the user should supply the CS sequence */
2472 if ((flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
2473 !(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST))
2474 cs_seq = args->in.seq;
2476 timeout = flags & HL_CS_FLAGS_CUSTOM_TIMEOUT
2477 ? msecs_to_jiffies(args->in.timeout * 1000)
2478 : hpriv->hdev->timeout_jiffies;
2481 case CS_TYPE_SIGNAL:
2483 case CS_TYPE_COLLECTIVE_WAIT:
2484 rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks, num_chunks,
2485 &cs_seq, args->in.cs_flags, timeout,
2486 &sob_addr, &sob_initial_count);
2488 case CS_RESERVE_SIGNALS:
2489 rc = cs_ioctl_reserve_signals(hpriv,
2490 args->in.encaps_signals_q_idx,
2491 args->in.encaps_signals_count,
2492 &handle_id, &sob_addr, &signals_count);
2494 case CS_UNRESERVE_SIGNALS:
2495 rc = cs_ioctl_unreserve_signals(hpriv,
2496 args->in.encaps_sig_handle_id);
2498 case CS_TYPE_ENGINE_CORE:
2499 rc = cs_ioctl_engine_cores(hpriv, args->in.engine_cores,
2500 args->in.num_engine_cores, args->in.core_command);
2503 rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq,
2505 args->in.encaps_sig_handle_id,
2506 timeout, &sob_initial_count);
2510 if (rc != -EAGAIN) {
2511 memset(args, 0, sizeof(*args));
2514 case CS_RESERVE_SIGNALS:
2515 args->out.handle_id = handle_id;
2516 args->out.sob_base_addr_offset = sob_addr;
2517 args->out.count = signals_count;
2519 case CS_TYPE_SIGNAL:
2520 args->out.sob_base_addr_offset = sob_addr;
2521 args->out.sob_count_before_submission = sob_initial_count;
2522 args->out.seq = cs_seq;
2524 case CS_TYPE_DEFAULT:
2525 args->out.sob_count_before_submission = sob_initial_count;
2526 args->out.seq = cs_seq;
2529 args->out.seq = cs_seq;
2533 args->out.status = rc;
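/*
 * Output summary (descriptive of the switch above): CS_RESERVE_SIGNALS
 * returns the reservation handle_id, sob_base_addr_offset and signals count;
 * CS_TYPE_SIGNAL returns sob_base_addr_offset, the SOB count before
 * submission and the CS sequence; CS_TYPE_DEFAULT returns the SOB count
 * before submission and the CS sequence; all other types return only the
 * CS sequence.
 */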
2539 static int hl_wait_for_fence(struct hl_ctx *ctx, u64 seq, struct hl_fence *fence,
2540 enum hl_cs_wait_status *status, u64 timeout_us, s64 *timestamp)
2542 struct hl_device *hdev = ctx->hdev;
2543 ktime_t timestamp_kt;
2547 if (IS_ERR(fence)) {
2548 rc = PTR_ERR(fence);
2550 dev_notice_ratelimited(hdev->dev,
2551 "Can't wait on CS %llu because current CS is at seq %llu\n",
2552 seq, ctx->cs_sequence);
2557 if (!hl_pop_cs_outcome(&ctx->outcome_store, seq, ×tamp_kt, &error)) {
2559 "Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
2560 seq, ctx->cs_sequence);
2561 *status = CS_WAIT_STATUS_GONE;
2566 goto report_results;
2570 completion_rc = completion_done(&fence->completion);
2572 unsigned long timeout;
2574 timeout = (timeout_us == MAX_SCHEDULE_TIMEOUT) ?
2575 timeout_us : usecs_to_jiffies(timeout_us);
2577 wait_for_completion_interruptible_timeout(
2578 &fence->completion, timeout);
2581 error = fence->error;
2582 timestamp_kt = fence->timestamp;
2585 if (completion_rc > 0) {
2586 *status = CS_WAIT_STATUS_COMPLETED;
2588 *timestamp = ktime_to_ns(timestamp_kt);
2590 *status = CS_WAIT_STATUS_BUSY;
2593 if (completion_rc == -ERESTARTSYS)
2595 else if (error == -ETIMEDOUT || error == -EIO)
2602 * hl_cs_poll_fences - iterate CS fences to check for CS completion
2604 * @mcs_data: multi-CS internal data
2605 * @mcs_compl: multi-CS completion structure
2607 * @return 0 on success, otherwise non 0 error code
2609 * The function iterates over all CS sequences in the list and sets a bit in
2610 * completion_bitmap for each completed CS.
2611 * While iterating, the function adds the stream map of each fence in the
2612 * fence array to the completion QID stream map, to be used by CSs to
2613 * complete the multi-CS context.
2614 * This function shall be called after taking a context ref
2616 static int hl_cs_poll_fences(struct multi_cs_data *mcs_data, struct multi_cs_completion *mcs_compl)
2618 struct hl_fence **fence_ptr = mcs_data->fence_arr;
2619 struct hl_device *hdev = mcs_data->ctx->hdev;
2620 int i, rc, arr_len = mcs_data->arr_len;
2621 u64 *seq_arr = mcs_data->seq_arr;
2622 ktime_t max_ktime, first_cs_time;
2623 enum hl_cs_wait_status status;
2625 memset(fence_ptr, 0, arr_len * sizeof(struct hl_fence *));
2627 /* get all fences under the same lock */
2628 rc = hl_ctx_get_fences(mcs_data->ctx, seq_arr, fence_ptr, arr_len);
2633 * re-initialize the completion here to handle 2 possible cases:
2634 * 1. A CS will complete the multi-CS prior to clearing the completion, in
2635 *    which case the fence iteration is guaranteed to catch the CS completion.
2636 * 2. The completion will occur after the re-init of the completion, in
2637 *    which case we will wake up immediately in wait_for_completion.
2639 reinit_completion(&mcs_compl->completion);
2642 * Set to the maximum time to verify the timestamp is valid: if this value
2643 * is unchanged at the end, no timestamp was updated.
2645 max_ktime = ktime_set(KTIME_SEC_MAX, 0);
2646 first_cs_time = max_ktime;
2648 for (i = 0; i < arr_len; i++, fence_ptr++) {
2649 struct hl_fence *fence = *fence_ptr;
2652 * In order to prevent the case where we wait until timeout even though a CS
2653 * associated with the multi-CS actually completed, we do things in the below order:
2654 * 1. for each fence, set its QID map in the multi-CS completion QID map. This way
2655 *    any CS can, potentially, complete the multi CS for the specific QID (note
2656 *    that once completion is initialized, calling complete* and then wait on the
2657 *    completion will cause it to return at once)
2658 * 2. only after allowing multi-CS completion for the specific QID do we check whether
2659 *    the specific CS already completed (and thus the wait for completion part will
2660 *    be skipped). If the CS has not completed, it is guaranteed that the completing
2661 *    CS will wake up the completion.
2664 mcs_compl->stream_master_qid_map |= fence->stream_master_qid_map;
2667 * function won't sleep as it is called with timeout 0 (i.e.
2670 rc = hl_wait_for_fence(mcs_data->ctx, seq_arr[i], fence, &status, 0, NULL);
2673 "wait_for_fence error :%d for CS seq %llu\n",
2679 case CS_WAIT_STATUS_BUSY:
2680 /* CS has not finished yet, QID to wait on is already stored */
2682 case CS_WAIT_STATUS_COMPLETED:
2684 * Use mcs_handling_done to avoid the possibility of mcs_data
2685 * returning to the user, indicating the CS completed, before the CS
2686 * finished all of its mcs handling; this avoids a race the next time
2687 * the user waits for mcs.
2688 * Note: when reaching this case the fence is definitely not NULL,
2689 * but the NULL check was added to satisfy static analysis.
2691 if (fence && !fence->mcs_handling_done) {
2693 * In case the multi CS is completed but the MCS handling is not done,
2694 * we "complete" the multi CS to prevent it from waiting
2695 * until time-out; the "multi-CS handling done" check will have
2696 * another chance at the next iteration.
2698 complete_all(&mcs_compl->completion);
2702 mcs_data->completion_bitmap |= BIT(i);
2704 * For all completed CSs we take the earliest timestamp.
2705 * For this we have to validate that the timestamp is
2706 * the earliest of all timestamps so far.
2708 if (fence && mcs_data->update_ts &&
2709 (ktime_compare(fence->timestamp, first_cs_time) < 0))
2710 first_cs_time = fence->timestamp;
2712 case CS_WAIT_STATUS_GONE:
2713 mcs_data->update_ts = false;
2714 mcs_data->gone_cs = true;
2716 * It is possible to get old sequence numbers from the user
2717 * which relate to already completed CSs whose fences are
2718 * already gone. In this case, the CS is set as completed but
2719 * there is no need to consider its QID for mcs completion.
2721 mcs_data->completion_bitmap |= BIT(i);
2724 dev_err(hdev->dev, "Invalid fence status\n");
2731 hl_fences_put(mcs_data->fence_arr, arr_len);
2733 if (mcs_data->update_ts &&
2734 (ktime_compare(first_cs_time, max_ktime) != 0))
2735 mcs_data->timestamp = ktime_to_ns(first_cs_time);
2740 static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, u64 timeout_us, u64 seq,
2741 enum hl_cs_wait_status *status, s64 *timestamp)
2743 struct hl_fence *fence;
2751 fence = hl_ctx_get_fence(ctx, seq);
2753 rc = hl_wait_for_fence(ctx, seq, fence, status, timeout_us, timestamp);
2754 hl_fence_put(fence);
2760 static inline unsigned long hl_usecs64_to_jiffies(const u64 usecs)
2762 if (usecs <= U32_MAX)
2763 return usecs_to_jiffies(usecs);
2766 * If the value in nanoseconds is larger than 64 bits, use the largest 64-bit value.
2769 if (usecs >= ((u64)(U64_MAX / NSEC_PER_USEC)))
2770 return nsecs_to_jiffies(U64_MAX);
2772 return nsecs_to_jiffies(usecs * NSEC_PER_USEC);
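/*
 * Example (descriptive): a timeout of 2500000us fits in 32 bits and is
 * converted directly with usecs_to_jiffies(); a larger value is converted
 * to nanoseconds first, and anything that would overflow 64 bits is clamped
 * to nsecs_to_jiffies(U64_MAX).
 */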
2776 * hl_wait_multi_cs_completion_init - init completion structure
2778 * @hdev: pointer to habanalabs device structure
2782 * @return valid completion struct pointer on success, otherwise error pointer
2784 * Up to MULTI_CS_MAX_USER_CTX calls can be made concurrently to the driver.
2785 * The function gets the first available completion (by marking it "used")
2786 * and initializes its values.
2788 static struct multi_cs_completion *hl_wait_multi_cs_completion_init(struct hl_device *hdev)
2790 struct multi_cs_completion *mcs_compl;
2793 /* find free multi_cs completion structure */
2794 for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
2795 mcs_compl = &hdev->multi_cs_completion[i];
2796 spin_lock(&mcs_compl->lock);
2797 if (!mcs_compl->used) {
2798 mcs_compl->used = 1;
2799 mcs_compl->timestamp = 0;
2801 * Init the QID map to 0 to avoid completion by CSs. The actual QID map
2802 * of the multi-CS CSs will be set incrementally at a later stage
2804 mcs_compl->stream_master_qid_map = 0;
2805 spin_unlock(&mcs_compl->lock);
2808 spin_unlock(&mcs_compl->lock);
2811 if (i == MULTI_CS_MAX_USER_CTX) {
2812 dev_err(hdev->dev, "no available multi-CS completion structure\n");
2813 return ERR_PTR(-ENOMEM);
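/*
 * Usage note (descriptive): callers pair this with
 * hl_wait_multi_cs_completion_fini(). hl_multi_cs_wait_ioctl() below takes
 * a completion slot here, polls the fences, optionally waits, and releases
 * the slot on every exit path.
 */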
2819 * hl_wait_multi_cs_completion_fini - return completion structure and mark it as unused
2822 * @mcs_compl: pointer to the completion structure
2824 static void hl_wait_multi_cs_completion_fini(
2825 struct multi_cs_completion *mcs_compl)
2828 * Free the completion structure; do it under lock to be in sync with the
2829 * thread that signals completion
2831 spin_lock(&mcs_compl->lock);
2832 mcs_compl->used = 0;
2833 spin_unlock(&mcs_compl->lock);
2837 * hl_wait_multi_cs_completion - wait for first CS to complete
2839 * @mcs_data: multi-CS internal data
2841 * @return 0 on success, otherwise non 0 error code
2843 static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data,
2844 struct multi_cs_completion *mcs_compl)
2848 completion_rc = wait_for_completion_interruptible_timeout(&mcs_compl->completion,
2849 mcs_data->timeout_jiffies);
2851 /* update timestamp */
2852 if (completion_rc > 0)
2853 mcs_data->timestamp = mcs_compl->timestamp;
2855 if (completion_rc == -ERESTARTSYS)
2856 return completion_rc;
2858 mcs_data->wait_status = completion_rc;
2864 * hl_multi_cs_completion_init - init array of multi-CS completion structures
2866 * @hdev: pointer to habanalabs device structure
2868 void hl_multi_cs_completion_init(struct hl_device *hdev)
2870 struct multi_cs_completion *mcs_cmpl;
2873 for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
2874 mcs_cmpl = &hdev->multi_cs_completion[i];
2876 spin_lock_init(&mcs_cmpl->lock);
2877 init_completion(&mcs_cmpl->completion);
2882 * hl_multi_cs_wait_ioctl - implementation of the multi-CS wait ioctl
2884 * @hpriv: pointer to the private data of the fd
2885 * @data: pointer to multi-CS wait ioctl in/out args
2888 static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
2890 struct multi_cs_completion *mcs_compl;
2891 struct hl_device *hdev = hpriv->hdev;
2892 struct multi_cs_data mcs_data = {};
2893 union hl_wait_cs_args *args = data;
2894 struct hl_ctx *ctx = hpriv->ctx;
2895 struct hl_fence **fence_arr;
2896 void __user *seq_arr;
2902 if (!hdev->supports_wait_for_multi_cs) {
2903 dev_err(hdev->dev, "Wait for multi CS is not supported\n");
2907 seq_arr_len = args->in.seq_arr_len;
2909 if (seq_arr_len > HL_WAIT_MULTI_CS_LIST_MAX_LEN) {
2910 dev_err(hdev->dev, "Can wait only up to %d CSs, input sequence is of length %u\n",
2911 HL_WAIT_MULTI_CS_LIST_MAX_LEN, seq_arr_len);
2915 /* allocate memory for sequence array */
2917 kmalloc_array(seq_arr_len, sizeof(*cs_seq_arr), GFP_KERNEL);
2921 /* copy CS sequence array from user */
2922 seq_arr = (void __user *) (uintptr_t) args->in.seq;
2923 size_to_copy = seq_arr_len * sizeof(*cs_seq_arr);
2924 if (copy_from_user(cs_seq_arr, seq_arr, size_to_copy)) {
2925 dev_err(hdev->dev, "Failed to copy multi-cs sequence array from user\n");
2930 /* allocate array for the fences */
2931 fence_arr = kmalloc_array(seq_arr_len, sizeof(struct hl_fence *), GFP_KERNEL);
2937 /* initialize the multi-CS internal data */
2939 mcs_data.seq_arr = cs_seq_arr;
2940 mcs_data.fence_arr = fence_arr;
2941 mcs_data.arr_len = seq_arr_len;
2945 /* wait (with timeout) for the first CS to be completed */
2946 mcs_data.timeout_jiffies = hl_usecs64_to_jiffies(args->in.timeout_us);
2947 mcs_compl = hl_wait_multi_cs_completion_init(hdev);
2948 if (IS_ERR(mcs_compl)) {
2949 rc = PTR_ERR(mcs_compl);
2953 /* poll all CS fences, extract timestamp */
2954 mcs_data.update_ts = true;
2955 rc = hl_cs_poll_fences(&mcs_data, mcs_compl);
2957 * skip wait for CS completion when one of the below is true:
2958 * - an error on the poll function
2959 * - one or more CS in the list completed
2960 * - the user called ioctl with timeout 0
2962 if (rc || mcs_data.completion_bitmap || !args->in.timeout_us)
2963 goto completion_fini;
2966 rc = hl_wait_multi_cs_completion(&mcs_data, mcs_compl);
2967 if (rc || (mcs_data.wait_status == 0))
2971 * poll fences once again to update the CS map.
2972 * no timestamp should be updated this time.
2974 mcs_data.update_ts = false;
2975 rc = hl_cs_poll_fences(&mcs_data, mcs_compl);
2977 if (rc || mcs_data.completion_bitmap)
2981 * If hl_wait_multi_cs_completion returned before the timeout (i.e.
2982 * it got a completion), it either got completed by a CS in the multi-CS list
2983 * (in which case the indication will be a non-empty completion_bitmap), or it
2984 * got completed by a CS submitted to one of the shared stream masters but
2985 * not in the multi-CS list (in which case we should wait again, but modify
2986 * the timeout and set the timestamp to zero to let a CS related to the current
2987 * multi-CS set a new, relevant, timestamp)
2989 mcs_data.timeout_jiffies = mcs_data.wait_status;
2990 mcs_compl->timestamp = 0;
2994 hl_wait_multi_cs_completion_fini(mcs_compl);
3003 if (rc == -ERESTARTSYS) {
3004 dev_err_ratelimited(hdev->dev,
3005 "user process got signal while waiting for Multi-CS\n");
3012 /* update output args */
3013 memset(args, 0, sizeof(*args));
3015 if (mcs_data.completion_bitmap) {
3016 args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
3017 args->out.cs_completion_map = mcs_data.completion_bitmap;
3019 /* a non-zero timestamp is valid */
3020 if (mcs_data.timestamp) {
3021 args->out.timestamp_nsec = mcs_data.timestamp;
3022 args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
3025 /* update if some CS was gone */
3026 if (!mcs_data.timestamp)
3027 args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE;
3029 args->out.status = HL_WAIT_CS_STATUS_BUSY;
3035 static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
3037 struct hl_device *hdev = hpriv->hdev;
3038 union hl_wait_cs_args *args = data;
3039 enum hl_cs_wait_status status;
3040 u64 seq = args->in.seq;
3044 rc = _hl_cs_wait_ioctl(hdev, hpriv->ctx, args->in.timeout_us, seq, &status, ×tamp);
3046 if (rc == -ERESTARTSYS) {
3047 dev_err_ratelimited(hdev->dev,
3048 "user process got signal while waiting for CS handle %llu\n",
3053 memset(args, 0, sizeof(*args));
3056 if (rc == -ETIMEDOUT) {
3057 dev_err_ratelimited(hdev->dev,
3058 "CS %llu has timed-out while user process is waiting for it\n",
3060 args->out.status = HL_WAIT_CS_STATUS_TIMEDOUT;
3061 } else if (rc == -EIO) {
3062 dev_err_ratelimited(hdev->dev,
3063 "CS %llu has been aborted while user process is waiting for it\n",
3065 args->out.status = HL_WAIT_CS_STATUS_ABORTED;
3071 args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
3072 args->out.timestamp_nsec = timestamp;
3076 case CS_WAIT_STATUS_GONE:
3077 args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE;
3079 case CS_WAIT_STATUS_COMPLETED:
3080 args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
3082 case CS_WAIT_STATUS_BUSY:
3084 args->out.status = HL_WAIT_CS_STATUS_BUSY;
3091 static int ts_buff_get_kernel_ts_record(struct hl_mmap_mem_buf *buf,
3092 struct hl_cb *cq_cb,
3093 u64 ts_offset, u64 cq_offset, u64 target_value,
3094 spinlock_t *wait_list_lock,
3095 struct hl_user_pending_interrupt **pend)
3097 struct hl_ts_buff *ts_buff = buf->private;
3098 struct hl_user_pending_interrupt *requested_offset_record =
3099 (struct hl_user_pending_interrupt *)ts_buff->kernel_buff_address +
3101 struct hl_user_pending_interrupt *cb_last =
3102 (struct hl_user_pending_interrupt *)ts_buff->kernel_buff_address +
3103 (ts_buff->kernel_buff_size / sizeof(struct hl_user_pending_interrupt));
3104 unsigned long flags, iter_counter = 0;
3105 u64 current_cq_counter;
3107 /* Validate that ts_offset does not exceed the last valid record */
3108 if (requested_offset_record >= cb_last) {
3109 dev_err(buf->mmg->dev, "Ts offset exceeds max CB offset(0x%llx)\n",
3110 (u64)(uintptr_t)cb_last);
3115 spin_lock_irqsave(wait_list_lock, flags);
3117 /* Unregister only if we haven't reached the target value,
3118 * since in that case there will be no handling in irq context
3119 * and it is then safe to delete the node from the interrupt list
3120 * and re-use it for another interrupt
3122 if (requested_offset_record->ts_reg_info.in_use) {
3123 current_cq_counter = *requested_offset_record->cq_kernel_addr;
3124 if (current_cq_counter < requested_offset_record->cq_target_value) {
3125 list_del(&requested_offset_record->wait_list_node);
3126 spin_unlock_irqrestore(wait_list_lock, flags);
3128 hl_mmap_mem_buf_put(requested_offset_record->ts_reg_info.buf);
3129 hl_cb_put(requested_offset_record->ts_reg_info.cq_cb);
3131 dev_dbg(buf->mmg->dev,
3132 "ts node removed from interrupt list, can now be re-used\n");
3134 dev_dbg(buf->mmg->dev,
3135 "ts node in middle of irq handling\n");
3137 /* irq handling is in progress, give it time to finish */
3138 spin_unlock_irqrestore(wait_list_lock, flags);
3139 usleep_range(1, 10);
3140 if (++iter_counter == MAX_TS_ITER_NUM) {
3141 dev_err(buf->mmg->dev,
3142 "handling registration interrupt took too long!!\n");
3149 spin_unlock_irqrestore(wait_list_lock, flags);
3152 /* Fill in the new registration node info */
3153 requested_offset_record->ts_reg_info.in_use = 1;
3154 requested_offset_record->ts_reg_info.buf = buf;
3155 requested_offset_record->ts_reg_info.cq_cb = cq_cb;
3156 requested_offset_record->ts_reg_info.timestamp_kernel_addr =
3157 (u64 *) ts_buff->user_buff_address + ts_offset;
3158 requested_offset_record->cq_kernel_addr =
3159 (u64 *) cq_cb->kernel_address + cq_offset;
3160 requested_offset_record->cq_target_value = target_value;
3162 *pend = requested_offset_record;
3164 dev_dbg(buf->mmg->dev, "Found available node in TS kernel CB %p\n",
3165 requested_offset_record);
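/*
 * Layout note (descriptive of the pointer arithmetic above): the timestamp
 * buffer is used as two parallel arrays indexed by ts_offset. The kernel
 * buffer holds one hl_user_pending_interrupt record per entry, while the
 * user-visible buffer holds the u64 timestamp that is written through
 * timestamp_kernel_addr once the CQ counter reaches the target value.
 */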
3169 static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
3170 struct hl_mem_mgr *cb_mmg, struct hl_mem_mgr *mmg,
3171 u64 timeout_us, u64 cq_counters_handle, u64 cq_counters_offset,
3172 u64 target_value, struct hl_user_interrupt *interrupt,
3173 bool register_ts_record, u64 ts_handle, u64 ts_offset,
3174 u32 *status, u64 *timestamp)
3176 struct hl_user_pending_interrupt *pend;
3177 struct hl_mmap_mem_buf *buf;
3178 struct hl_cb *cq_cb;
3179 unsigned long timeout, flags;
3183 timeout = hl_usecs64_to_jiffies(timeout_us);
3187 cq_cb = hl_cb_get(cb_mmg, cq_counters_handle);
3193 /* Validate the cq offset */
3194 if (((u64 *) cq_cb->kernel_address + cq_counters_offset) >=
3195 ((u64 *) cq_cb->kernel_address + (cq_cb->size / sizeof(u64)))) {
3200 if (register_ts_record) {
3201 dev_dbg(hdev->dev, "Timestamp registration: interrupt id: %u, ts offset: %llu, cq_offset: %llu\n",
3202 interrupt->interrupt_id, ts_offset, cq_counters_offset);
3203 buf = hl_mmap_mem_buf_get(mmg, ts_handle);
3209 /* Find first available record */
3210 rc = ts_buff_get_kernel_ts_record(buf, cq_cb, ts_offset,
3211 cq_counters_offset, target_value,
3212 &interrupt->wait_list_lock, &pend);
3216 pend = kzalloc(sizeof(*pend), GFP_KERNEL);
3221 hl_fence_init(&pend->fence, ULONG_MAX);
3222 pend->cq_kernel_addr = (u64 *) cq_cb->kernel_address + cq_counters_offset;
3223 pend->cq_target_value = target_value;
3226 spin_lock_irqsave(&interrupt->wait_list_lock, flags);
3228 /* We check the completion value as the interrupt could have been received
3229 * before we added the node to the wait list
3231 if (*pend->cq_kernel_addr >= target_value) {
3232 if (register_ts_record)
3233 pend->ts_reg_info.in_use = 0;
3234 spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3236 *status = HL_WAIT_CS_STATUS_COMPLETED;
3238 if (register_ts_record) {
3239 *pend->ts_reg_info.timestamp_kernel_addr = ktime_get_ns();
3242 pend->fence.timestamp = ktime_get();
3245 } else if (!timeout_us) {
3246 spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3247 *status = HL_WAIT_CS_STATUS_BUSY;
3248 pend->fence.timestamp = ktime_get();
3252 /* Add the pending user interrupt to the relevant list for the interrupt
3253 * handler to monitor.
3254 * Note that we cannot keep the list sorted by target value
3255 * (which would shorten the list traversal), since the
3256 * same list can hold nodes for different cq counter handles.
3258 list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head);
3259 spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3261 if (register_ts_record) {
3262 rc = *status = HL_WAIT_CS_STATUS_COMPLETED;
3263 goto ts_registration_exit;
3266 /* Wait for interrupt handler to signal completion */
3267 completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion,
3269 if (completion_rc > 0) {
3270 *status = HL_WAIT_CS_STATUS_COMPLETED;
3272 if (completion_rc == -ERESTARTSYS) {
3273 dev_err_ratelimited(hdev->dev,
3274 "user process got signal while waiting for interrupt ID %d\n",
3275 interrupt->interrupt_id);
3277 *status = HL_WAIT_CS_STATUS_ABORTED;
3279 if (pend->fence.error == -EIO) {
3280 dev_err_ratelimited(hdev->dev,
3281 "interrupt based wait ioctl aborted(error:%d) due to a reset cycle initiated\n",
3284 *status = HL_WAIT_CS_STATUS_ABORTED;
3286 /* The wait has timed-out. We don't know anything beyond that
3287 * because the workload wasn't submitted through the driver.
3288 * Therefore, from driver's perspective, the workload is still
3292 *status = HL_WAIT_CS_STATUS_BUSY;
3298 * We remove the node from the list here, and not in the irq handler,
3299 * for the completion-timeout case. If it's a registration
3300 * for a ts record, the node will be deleted in the irq handler after
3301 * we reach the target value.
3303 spin_lock_irqsave(&interrupt->wait_list_lock, flags);
3304 list_del(&pend->wait_list_node);
3305 spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3308 *timestamp = ktime_to_ns(pend->fence.timestamp);
3311 ts_registration_exit:
3317 hl_mmap_mem_buf_put(buf);
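/*
 * Flow note (descriptive): when HL_WAIT_CS_FLAGS_REGISTER_INTERRUPT is set,
 * the ioctl registers the timestamp record and returns right away with
 * HL_WAIT_CS_STATUS_COMPLETED; the timestamp itself is delivered later
 * through the registered record. Without that flag the ioctl blocks on the
 * pending fence until it is signalled, the timeout expires or the process
 * receives a signal.
 */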
3326 static int _hl_interrupt_wait_ioctl_user_addr(struct hl_device *hdev, struct hl_ctx *ctx,
3327 u64 timeout_us, u64 user_address,
3328 u64 target_value, struct hl_user_interrupt *interrupt,
3332 struct hl_user_pending_interrupt *pend;
3333 unsigned long timeout, flags;
3334 u64 completion_value;
3338 timeout = hl_usecs64_to_jiffies(timeout_us);
3342 pend = kzalloc(sizeof(*pend), GFP_KERNEL);
3348 hl_fence_init(&pend->fence, ULONG_MAX);
3350 /* Add pending user interrupt to relevant list for the interrupt
3351 * handler to monitor
3353 spin_lock_irqsave(&interrupt->wait_list_lock, flags);
3354 list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head);
3355 spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3357 /* We check the completion value as the interrupt could have been received
3358 * before we added the node to the wait list
3360 if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) {
3361 dev_err(hdev->dev, "Failed to copy completion value from user\n");
3363 goto remove_pending_user_interrupt;
3366 if (completion_value >= target_value) {
3367 *status = HL_WAIT_CS_STATUS_COMPLETED;
3368 /* There was no interrupt, we assume the completion is now. */
3369 pend->fence.timestamp = ktime_get();
3371 *status = HL_WAIT_CS_STATUS_BUSY;
3374 if (!timeout_us || (*status == HL_WAIT_CS_STATUS_COMPLETED))
3375 goto remove_pending_user_interrupt;
3378 /* Wait for interrupt handler to signal completion */
3379 completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion,
3382 /* If the timeout did not expire, we need to perform the comparison.
3383 * If the comparison fails, keep waiting until the timeout expires
3385 if (completion_rc > 0) {
3386 spin_lock_irqsave(&interrupt->wait_list_lock, flags);
3387 /* reinit_completion must be called before we check the user
3388 * completion value; otherwise, if the interrupt is received after
3389 * the comparison and before the next wait_for_completion,
3390 * we will reach the timeout and fail
3392 reinit_completion(&pend->fence.completion);
3393 spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3395 if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) {
3396 dev_err(hdev->dev, "Failed to copy completion value from user\n");
3399 goto remove_pending_user_interrupt;
3402 if (completion_value >= target_value) {
3403 *status = HL_WAIT_CS_STATUS_COMPLETED;
3404 } else if (pend->fence.error) {
3405 dev_err_ratelimited(hdev->dev,
3406 "interrupt based wait ioctl aborted(error:%d) due to a reset cycle initiated\n",
3408 /* set the command completion status as ABORTED */
3409 *status = HL_WAIT_CS_STATUS_ABORTED;
3411 timeout = completion_rc;
3414 } else if (completion_rc == -ERESTARTSYS) {
3415 dev_err_ratelimited(hdev->dev,
3416 "user process got signal while waiting for interrupt ID %d\n",
3417 interrupt->interrupt_id);
3420 /* The wait has timed-out. We don't know anything beyond that
3421 * because the workload wasn't submitted through the driver.
3422 * Therefore, from driver's perspective, the workload is still
3426 *status = HL_WAIT_CS_STATUS_BUSY;
3429 remove_pending_user_interrupt:
3430 spin_lock_irqsave(&interrupt->wait_list_lock, flags);
3431 list_del(&pend->wait_list_node);
3432 spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3434 *timestamp = ktime_to_ns(pend->fence.timestamp);
3442 static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
3444 u16 interrupt_id, first_interrupt, last_interrupt;
3445 struct hl_device *hdev = hpriv->hdev;
3446 struct asic_fixed_properties *prop;
3447 struct hl_user_interrupt *interrupt;
3448 union hl_wait_cs_args *args = data;
3449 u32 status = HL_WAIT_CS_STATUS_BUSY;
3453 prop = &hdev->asic_prop;
3455 if (!(prop->user_interrupt_count + prop->user_dec_intr_count)) {
3456 dev_err(hdev->dev, "no user interrupts allowed");
3460 interrupt_id = FIELD_GET(HL_WAIT_CS_FLAGS_INTERRUPT_MASK, args->in.flags);
3462 first_interrupt = prop->first_available_user_interrupt;
3463 last_interrupt = prop->first_available_user_interrupt + prop->user_interrupt_count - 1;
3465 if (interrupt_id < prop->user_dec_intr_count) {
3467 /* Check if the requested core is enabled */
3468 if (!(prop->decoder_enabled_mask & BIT(interrupt_id))) {
3469 dev_err(hdev->dev, "interrupt on a disabled core(%u) not allowed",
3474 interrupt = &hdev->user_interrupt[interrupt_id];
3476 } else if (interrupt_id >= first_interrupt && interrupt_id <= last_interrupt) {
3478 int_idx = interrupt_id - first_interrupt + prop->user_dec_intr_count;
3479 interrupt = &hdev->user_interrupt[int_idx];
3481 } else if (interrupt_id == HL_COMMON_USER_CQ_INTERRUPT_ID) {
3482 interrupt = &hdev->common_user_cq_interrupt;
3483 } else if (interrupt_id == HL_COMMON_DEC_INTERRUPT_ID) {
3484 interrupt = &hdev->common_decoder_interrupt;
3486 dev_err(hdev->dev, "invalid user interrupt %u", interrupt_id);
3490 if (args->in.flags & HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ)
3491 rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx, &hpriv->mem_mgr, &hpriv->mem_mgr,
3492 args->in.interrupt_timeout_us, args->in.cq_counters_handle,
3493 args->in.cq_counters_offset,
3494 args->in.target, interrupt,
3495 !!(args->in.flags & HL_WAIT_CS_FLAGS_REGISTER_INTERRUPT),
3496 args->in.timestamp_handle, args->in.timestamp_offset,
3497 &status, ×tamp);
3499 rc = _hl_interrupt_wait_ioctl_user_addr(hdev, hpriv->ctx,
3500 args->in.interrupt_timeout_us, args->in.addr,
3501 args->in.target, interrupt, &status,
3506 memset(args, 0, sizeof(*args));
3507 args->out.status = status;
3510 args->out.timestamp_nsec = timestamp;
3511 args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
3517 int hl_wait_ioctl(struct hl_fpriv *hpriv, void *data)
3519 struct hl_device *hdev = hpriv->hdev;
3520 union hl_wait_cs_args *args = data;
3521 u32 flags = args->in.flags;
3524 /* If the device is not operational, or if an error has happened and user should release the
3525 * device, there is no point in waiting for any command submission or user interrupt.
3527 if (!hl_device_operational(hpriv->hdev, NULL) || hdev->reset_info.watchdog_active)
3530 if (flags & HL_WAIT_CS_FLAGS_INTERRUPT)
3531 rc = hl_interrupt_wait_ioctl(hpriv, data);
3532 else if (flags & HL_WAIT_CS_FLAGS_MULTI_CS)
3533 rc = hl_multi_cs_wait_ioctl(hpriv, data);
3535 rc = hl_cs_wait_ioctl(hpriv, data);
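/*
 * Dispatch summary (descriptive): HL_WAIT_CS_FLAGS_INTERRUPT selects the
 * user-interrupt wait, HL_WAIT_CS_FLAGS_MULTI_CS selects the multi-CS wait,
 * and otherwise the plain single-CS wait on in.seq is used. All three report
 * their result through out.status and, when a valid timestamp exists, set
 * HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD in out.flags alongside
 * out.timestamp_nsec.
 */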