drivers/misc/habanalabs/common/command_submission.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4  * Copyright 2016-2021 HabanaLabs, Ltd.
5  * All Rights Reserved.
6  */
7
8 #include <uapi/drm/habanalabs_accel.h>
9 #include "habanalabs.h"
10
11 #include <linux/uaccess.h>
12 #include <linux/slab.h>
13
14 #define HL_CS_FLAGS_TYPE_MASK   (HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT | \
15                         HL_CS_FLAGS_COLLECTIVE_WAIT | HL_CS_FLAGS_RESERVE_SIGNALS_ONLY | \
16                         HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY | HL_CS_FLAGS_ENGINE_CORE_COMMAND)
17
18
19 #define MAX_TS_ITER_NUM 10
20
21 /**
22  * enum hl_cs_wait_status - cs wait status
23  * @CS_WAIT_STATUS_BUSY: cs was not completed yet
24  * @CS_WAIT_STATUS_COMPLETED: cs completed
25  * @CS_WAIT_STATUS_GONE: cs completed but fence is already gone
26  */
27 enum hl_cs_wait_status {
28         CS_WAIT_STATUS_BUSY,
29         CS_WAIT_STATUS_COMPLETED,
30         CS_WAIT_STATUS_GONE
31 };
32
33 static void job_wq_completion(struct work_struct *work);
34 static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, u64 timeout_us, u64 seq,
35                                 enum hl_cs_wait_status *status, s64 *timestamp);
36 static void cs_do_release(struct kref *ref);
37
38 static void hl_push_cs_outcome(struct hl_device *hdev,
39                                struct hl_cs_outcome_store *outcome_store,
40                                u64 seq, ktime_t ts, int error)
41 {
42         struct hl_cs_outcome *node;
43         unsigned long flags;
44
45         /*
46          * CS outcome store supports the following operations:
47          * push outcome - store a recent CS outcome in the store
48          * pop outcome - retrieve a SPECIFIC (by seq) CS outcome from the store
49          * It uses 2 lists: used list and free list.
50          * It has a pre-allocated amount of nodes, each node stores
51          * a single CS outcome.
52          * Initially, all the nodes are in the free list.
53          * On push outcome, a node (any) is taken from the free list, its
54          * information is filled in, and the node is moved to the used list.
55          * It is possible that there are no nodes left in the free list.
56          * In this case, we will lose some information about old outcomes. We
57          * will pop the OLDEST node from the used list, and make it free.
58          * On pop, the node is searched for in the used list (using a search
59          * index).
60          * If found, the node is then removed from the used list, and moved
61          * back to the free list. The outcome data that the node contained is
62          * returned to the user.
63          */
64
65         spin_lock_irqsave(&outcome_store->db_lock, flags);
66
67         if (list_empty(&outcome_store->free_list)) {
68                 node = list_last_entry(&outcome_store->used_list,
69                                        struct hl_cs_outcome, list_link);
70                 hash_del(&node->map_link);
71                 dev_dbg(hdev->dev, "CS %llu outcome was lost\n", node->seq);
72         } else {
73                 node = list_last_entry(&outcome_store->free_list,
74                                        struct hl_cs_outcome, list_link);
75         }
76
77         list_del_init(&node->list_link);
78
79         node->seq = seq;
80         node->ts = ts;
81         node->error = error;
82
83         list_add(&node->list_link, &outcome_store->used_list);
84         hash_add(outcome_store->outcome_map, &node->map_link, node->seq);
85
86         spin_unlock_irqrestore(&outcome_store->db_lock, flags);
87 }
88
89 static bool hl_pop_cs_outcome(struct hl_cs_outcome_store *outcome_store,
90                                u64 seq, ktime_t *ts, int *error)
91 {
92         struct hl_cs_outcome *node;
93         unsigned long flags;
94
95         spin_lock_irqsave(&outcome_store->db_lock, flags);
96
97         hash_for_each_possible(outcome_store->outcome_map, node, map_link, seq)
98                 if (node->seq == seq) {
99                         *ts = node->ts;
100                         *error = node->error;
101
102                         hash_del(&node->map_link);
103                         list_del_init(&node->list_link);
104                         list_add(&node->list_link, &outcome_store->free_list);
105
106                         spin_unlock_irqrestore(&outcome_store->db_lock, flags);
107
108                         return true;
109                 }
110
111         spin_unlock_irqrestore(&outcome_store->db_lock, flags);
112
113         return false;
114 }
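/*
 * Illustrative sketch of the intended push/pop pairing above (hypothetical
 * caller, not part of the driver flow), assuming a valid hdev, an initialized
 * outcome store 'store' and a completed CS with sequence 'seq':
 *
 *	ktime_t ts;
 *	int err;
 *
 *	// Producer side: record the outcome of a completed CS.
 *	hl_push_cs_outcome(hdev, store, seq, ktime_get(), 0);
 *
 *	// Consumer side: a later wait flow can retrieve it exactly once.
 *	if (hl_pop_cs_outcome(store, seq, &ts, &err))
 *		dev_dbg(hdev->dev, "CS %llu outcome: err %d\n", seq, err);
 *
 *	// A second pop of the same seq returns false, since the node has
 *	// already been moved back to the free list.
 */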
115
116 static void hl_sob_reset(struct kref *ref)
117 {
118         struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob,
119                                                         kref);
120         struct hl_device *hdev = hw_sob->hdev;
121
122         dev_dbg(hdev->dev, "reset sob id %u\n", hw_sob->sob_id);
123
124         hdev->asic_funcs->reset_sob(hdev, hw_sob);
125
126         hw_sob->need_reset = false;
127 }
128
129 void hl_sob_reset_error(struct kref *ref)
130 {
131         struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob,
132                                                         kref);
133         struct hl_device *hdev = hw_sob->hdev;
134
135         dev_crit(hdev->dev,
136                 "SOB release shouldn't be called here, q_idx: %d, sob_id: %d\n",
137                 hw_sob->q_idx, hw_sob->sob_id);
138 }
139
140 void hw_sob_put(struct hl_hw_sob *hw_sob)
141 {
142         if (hw_sob)
143                 kref_put(&hw_sob->kref, hl_sob_reset);
144 }
145
146 static void hw_sob_put_err(struct hl_hw_sob *hw_sob)
147 {
148         if (hw_sob)
149                 kref_put(&hw_sob->kref, hl_sob_reset_error);
150 }
151
152 void hw_sob_get(struct hl_hw_sob *hw_sob)
153 {
154         if (hw_sob)
155                 kref_get(&hw_sob->kref);
156 }
157
158 /**
159  * hl_gen_sob_mask() - Generates a sob mask to be used in a monitor arm packet
160  * @sob_base: sob base id
161  * @sob_mask: sob user mask, each bit represents a sob offset from sob base
162  * @mask: generated mask
163  *
164  * Return: 0 if given parameters are valid
165  */
166 int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask)
167 {
168         int i;
169
170         if (sob_mask == 0)
171                 return -EINVAL;
172
173         if (sob_mask == 0x1) {
174                 *mask = ~(1 << (sob_base & 0x7));
175         } else {
176                 /* find msb in order to verify sob range is valid */
177                 for (i = BITS_PER_BYTE - 1 ; i >= 0 ; i--)
178                         if (BIT(i) & sob_mask)
179                                 break;
180
181                 if (i > (HL_MAX_SOBS_PER_MONITOR - (sob_base & 0x7) - 1))
182                         return -EINVAL;
183
184                 *mask = ~sob_mask;
185         }
186
187         return 0;
188 }
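/*
 * Worked example for hl_gen_sob_mask() (illustrative values, assuming
 * HL_MAX_SOBS_PER_MONITOR is 8):
 *
 *	sob_base = 10 -> offset inside the 8-SOB group is 10 & 0x7 = 2.
 *	sob_mask = 0x1 -> single SOB, mask = ~(1 << 2) = 0xfb.
 *	sob_mask = 0x3 -> SOBs at offsets 0..1 from sob_base; msb is bit 1,
 *			  1 <= 8 - 2 - 1, so the range is valid and
 *			  mask = ~0x3 = 0xfc.
 *	sob_mask = 0xff with the same sob_base -> msb is bit 7,
 *			  7 > 8 - 2 - 1, so -EINVAL is returned.
 */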
189
190 static void hl_fence_release(struct kref *kref)
191 {
192         struct hl_fence *fence =
193                 container_of(kref, struct hl_fence, refcount);
194         struct hl_cs_compl *hl_cs_cmpl =
195                 container_of(fence, struct hl_cs_compl, base_fence);
196
197         kfree(hl_cs_cmpl);
198 }
199
200 void hl_fence_put(struct hl_fence *fence)
201 {
202         if (IS_ERR_OR_NULL(fence))
203                 return;
204         kref_put(&fence->refcount, hl_fence_release);
205 }
206
207 void hl_fences_put(struct hl_fence **fence, int len)
208 {
209         int i;
210
211         for (i = 0; i < len; i++, fence++)
212                 hl_fence_put(*fence);
213 }
214
215 void hl_fence_get(struct hl_fence *fence)
216 {
217         if (fence)
218                 kref_get(&fence->refcount);
219 }
220
221 static void hl_fence_init(struct hl_fence *fence, u64 sequence)
222 {
223         kref_init(&fence->refcount);
224         fence->cs_sequence = sequence;
225         fence->error = 0;
226         fence->timestamp = ktime_set(0, 0);
227         fence->mcs_handling_done = false;
228         init_completion(&fence->completion);
229 }
230
231 void cs_get(struct hl_cs *cs)
232 {
233         kref_get(&cs->refcount);
234 }
235
236 static int cs_get_unless_zero(struct hl_cs *cs)
237 {
238         return kref_get_unless_zero(&cs->refcount);
239 }
240
241 static void cs_put(struct hl_cs *cs)
242 {
243         kref_put(&cs->refcount, cs_do_release);
244 }
245
246 static void cs_job_do_release(struct kref *ref)
247 {
248         struct hl_cs_job *job = container_of(ref, struct hl_cs_job, refcount);
249
250         kfree(job);
251 }
252
253 static void hl_cs_job_put(struct hl_cs_job *job)
254 {
255         kref_put(&job->refcount, cs_job_do_release);
256 }
257
258 bool cs_needs_completion(struct hl_cs *cs)
259 {
260         /* In case this is a staged CS, only the last CS in sequence should
261          * get a completion; any non-staged CS will always get a completion
262          */
263         if (cs->staged_cs && !cs->staged_last)
264                 return false;
265
266         return true;
267 }
268
269 bool cs_needs_timeout(struct hl_cs *cs)
270 {
271         /* In case this is a staged CS, only the first CS in sequence should
272          * get a timeout; any non-staged CS will always get a timeout
273          */
274         if (cs->staged_cs && !cs->staged_first)
275                 return false;
276
277         return true;
278 }
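/*
 * Illustrative example of the two helpers above, for a hypothetical 3-part
 * staged submission CS0 (staged_first), CS1 and CS2 (staged_last):
 *
 *	cs_needs_timeout():    CS0 -> true,  CS1 -> false, CS2 -> false
 *	cs_needs_completion(): CS0 -> false, CS1 -> false, CS2 -> true
 *
 * A regular (non-staged) CS returns true from both helpers.
 */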
279
280 static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
281 {
282         /*
283          * Patched CB is created for external queue jobs, and for H/W queue
284          * jobs if the user CB was allocated by the driver and the MMU is disabled.
285          */
286         return (job->queue_type == QUEUE_TYPE_EXT ||
287                         (job->queue_type == QUEUE_TYPE_HW &&
288                                         job->is_kernel_allocated_cb &&
289                                         !hdev->mmu_enable));
290 }
291
292 /*
293  * cs_parser - parse the user command submission
294  *
295  * @hpriv: pointer to the private data of the fd
296  * @job: pointer to the job that holds the command submission info
297  *
298  * The function parses the command submission of the user. It calls the
299  * ASIC specific parser, which returns a list of memory blocks to send
300  * to the device as different command buffers
301  *
302  */
303 static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
304 {
305         struct hl_device *hdev = hpriv->hdev;
306         struct hl_cs_parser parser;
307         int rc;
308
309         parser.ctx_id = job->cs->ctx->asid;
310         parser.cs_sequence = job->cs->sequence;
311         parser.job_id = job->id;
312
313         parser.hw_queue_id = job->hw_queue_id;
314         parser.job_userptr_list = &job->userptr_list;
315         parser.patched_cb = NULL;
316         parser.user_cb = job->user_cb;
317         parser.user_cb_size = job->user_cb_size;
318         parser.queue_type = job->queue_type;
319         parser.is_kernel_allocated_cb = job->is_kernel_allocated_cb;
320         job->patched_cb = NULL;
321         parser.completion = cs_needs_completion(job->cs);
322
323         rc = hdev->asic_funcs->cs_parser(hdev, &parser);
324
325         if (is_cb_patched(hdev, job)) {
326                 if (!rc) {
327                         job->patched_cb = parser.patched_cb;
328                         job->job_cb_size = parser.patched_cb_size;
329                         job->contains_dma_pkt = parser.contains_dma_pkt;
330                         atomic_inc(&job->patched_cb->cs_cnt);
331                 }
332
333                 /*
334                  * Whether the parsing worked or not, we don't need the
335                  * original CB anymore because it was already parsed and
336                  * won't be accessed again for this CS
337                  */
338                 atomic_dec(&job->user_cb->cs_cnt);
339                 hl_cb_put(job->user_cb);
340                 job->user_cb = NULL;
341         } else if (!rc) {
342                 job->job_cb_size = job->user_cb_size;
343         }
344
345         return rc;
346 }
347
348 static void hl_complete_job(struct hl_device *hdev, struct hl_cs_job *job)
349 {
350         struct hl_cs *cs = job->cs;
351
352         if (is_cb_patched(hdev, job)) {
353                 hl_userptr_delete_list(hdev, &job->userptr_list);
354
355                 /*
356                  * We might arrive here from rollback and patched CB wasn't
357                  * created, so we need to check it's not NULL
358                  */
359                 if (job->patched_cb) {
360                         atomic_dec(&job->patched_cb->cs_cnt);
361                         hl_cb_put(job->patched_cb);
362                 }
363         }
364
365         /* For H/W queue jobs, if a user CB was allocated by the driver and the MMU
366          * is enabled, the user CB isn't released in cs_parser() and thus should be
367          * released here. This is also true for INT queue jobs which were
368          * allocated by the driver.
369          */
370         if ((job->is_kernel_allocated_cb &&
371                 ((job->queue_type == QUEUE_TYPE_HW && hdev->mmu_enable) ||
372                                 job->queue_type == QUEUE_TYPE_INT))) {
373                 atomic_dec(&job->user_cb->cs_cnt);
374                 hl_cb_put(job->user_cb);
375         }
376
377         /*
378          * This is the only place where there can be multiple threads
379          * modifying the list at the same time
380          */
381         spin_lock(&cs->job_lock);
382         list_del(&job->cs_node);
383         spin_unlock(&cs->job_lock);
384
385         hl_debugfs_remove_job(hdev, job);
386
387         /* We decrement reference only for a CS that gets completion
388          * because the reference was incremented only for this kind of CS
389          * right before it was scheduled.
390          *
391          * In staged submission, only the last CS marked as 'staged_last'
392          * gets completion, hence its release function will be called from here.
393          * As for all the other CS's in the staged submission which do not get
394          * a completion, their CS reference will be decremented by the
395          * 'staged_last' CS during the CS release flow.
396          * All relevant PQ CI counters will be incremented during the CS release
397          * flow by calling 'hl_hw_queue_update_ci'.
398          */
399         if (cs_needs_completion(cs) &&
400                 (job->queue_type == QUEUE_TYPE_EXT || job->queue_type == QUEUE_TYPE_HW))
401                 cs_put(cs);
402
403         hl_cs_job_put(job);
404 }
405
406 /*
407  * hl_staged_cs_find_first - locate the first CS in this staged submission
408  *
409  * @hdev: pointer to device structure
410  * @cs_seq: staged submission sequence number
411  *
412  * @note: This function must be called under 'hdev->cs_mirror_lock'
413  *
414  * Find and return a CS pointer with the given sequence
415  */
416 struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq)
417 {
418         struct hl_cs *cs;
419
420         list_for_each_entry_reverse(cs, &hdev->cs_mirror_list, mirror_node)
421                 if (cs->staged_cs && cs->staged_first &&
422                                 cs->sequence == cs_seq)
423                         return cs;
424
425         return NULL;
426 }
427
428 /*
429  * is_staged_cs_last_exists - returns true if the last CS in sequence exists
430  *
431  * @hdev: pointer to device structure
432  * @cs: staged submission member
433  *
434  */
435 bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs)
436 {
437         struct hl_cs *last_entry;
438
439         last_entry = list_last_entry(&cs->staged_cs_node, struct hl_cs,
440                                                                 staged_cs_node);
441
442         if (last_entry->staged_last)
443                 return true;
444
445         return false;
446 }
447
448 /*
449  * staged_cs_get - get CS reference if this CS is a part of a staged CS
450  *
451  * @hdev: pointer to device structure
452  * @cs: current CS
454  *
455  * Increment CS reference for every CS in this staged submission except for
456  * the CS which gets a completion.
457  */
458 static void staged_cs_get(struct hl_device *hdev, struct hl_cs *cs)
459 {
460         /* Only the last CS in this staged submission will get a completion.
461          * We must increment the reference for all other CS's in this
462          * staged submission.
463          * Once we get a completion we will release the whole staged submission.
464          */
465         if (!cs->staged_last)
466                 cs_get(cs);
467 }
468
469 /*
470  * staged_cs_put - put a CS in case it is part of staged submission
471  *
472  * @hdev: pointer to device structure
473  * @cs: CS to put
474  *
475  * This function decrements a CS reference (for a non completion CS)
476  */
477 static void staged_cs_put(struct hl_device *hdev, struct hl_cs *cs)
478 {
479         /* We release all CS's in a staged submission except the last
480          * CS, whose reference we never incremented.
481          */
482         if (!cs_needs_completion(cs))
483                 cs_put(cs);
484 }
485
486 static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs)
487 {
488         struct hl_cs *next = NULL, *iter, *first_cs;
489
490         if (!cs_needs_timeout(cs))
491                 return;
492
493         spin_lock(&hdev->cs_mirror_lock);
494
495         /* We need to handle TDR only once for the complete staged submission.
496          * Hence, we choose the CS that reaches this function first, which is
497          * the CS marked as 'staged_last'.
498          * In case a single staged cs was submitted which has both first and last
499          * indications, then "hl_staged_cs_find_first" below will return NULL, since we
500          * removed the cs node from the list before getting here;
501          * in such cases just continue with the cs to cancel its TDR work.
502          */
503         if (cs->staged_cs && cs->staged_last) {
504                 first_cs = hl_staged_cs_find_first(hdev, cs->staged_sequence);
505                 if (first_cs)
506                         cs = first_cs;
507         }
508
509         spin_unlock(&hdev->cs_mirror_lock);
510
511         /* Don't cancel TDR in case this CS has timed out, because we might be
512          * running from the TDR context
513          */
514         if (cs->timedout || hdev->timeout_jiffies == MAX_SCHEDULE_TIMEOUT)
515                 return;
516
517         if (cs->tdr_active)
518                 cancel_delayed_work_sync(&cs->work_tdr);
519
520         spin_lock(&hdev->cs_mirror_lock);
521
522         /* queue TDR for next CS */
523         list_for_each_entry(iter, &hdev->cs_mirror_list, mirror_node)
524                 if (cs_needs_timeout(iter)) {
525                         next = iter;
526                         break;
527                 }
528
529         if (next && !next->tdr_active) {
530                 next->tdr_active = true;
531                 schedule_delayed_work(&next->work_tdr, next->timeout_jiffies);
532         }
533
534         spin_unlock(&hdev->cs_mirror_lock);
535 }
536
537 /*
538  * force_complete_multi_cs - complete all contexts that wait on multi-CS
539  *
540  * @hdev: pointer to habanalabs device structure
541  */
542 static void force_complete_multi_cs(struct hl_device *hdev)
543 {
544         int i;
545
546         for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
547                 struct multi_cs_completion *mcs_compl;
548
549                 mcs_compl = &hdev->multi_cs_completion[i];
550
551                 spin_lock(&mcs_compl->lock);
552
553                 if (!mcs_compl->used) {
554                         spin_unlock(&mcs_compl->lock);
555                         continue;
556                 }
557
558                 /* When calling force complete, no context should be waiting on
559                  * multi-CS.
560                  * We are calling the function as a protection for such a case,
561                  * to free any pending context and print an error message
562                  */
563                 dev_err(hdev->dev,
564                                 "multi-CS completion context %d still waiting when calling force completion\n",
565                                 i);
566                 complete_all(&mcs_compl->completion);
567                 spin_unlock(&mcs_compl->lock);
568         }
569 }
570
571 /*
572  * complete_multi_cs - complete all waiting entities on multi-CS
573  *
574  * @hdev: pointer to habanalabs device structure
575  * @cs: CS structure
576  * The function signals a waiting entity that has an overlapping stream masters
577  * with the completed CS.
578  * For example:
579  * - a completed CS worked on stream master QID 4, multi CS completion
580  *   is actively waiting on stream master QIDs 3, 5. don't send signal as no
581  *   common stream master QID
582  * - a completed CS worked on stream master QID 4, multi CS completion
583  *   is actively waiting on stream master QIDs 3, 4. send signal as stream
584  *   master QID 4 is common
585  */
586 static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
587 {
588         struct hl_fence *fence = cs->fence;
589         int i;
590
591         /* In case of a staged CS, handle multi-CS completion only for the first CS */
592         if (cs->staged_cs && !cs->staged_first)
593                 return;
594
595         for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
596                 struct multi_cs_completion *mcs_compl;
597
598                 mcs_compl = &hdev->multi_cs_completion[i];
599                 if (!mcs_compl->used)
600                         continue;
601
602                 spin_lock(&mcs_compl->lock);
603
604                 /*
605                  * complete if:
606                  * 1. still waiting for completion
607                  * 2. the completed CS has at least one overlapping stream
608                  *    master with the stream masters in the completion
609                  */
610                 if (mcs_compl->used &&
611                                 (fence->stream_master_qid_map &
612                                         mcs_compl->stream_master_qid_map)) {
613                         /* extract the timestamp only of first completed CS */
614                         if (!mcs_compl->timestamp)
615                                 mcs_compl->timestamp = ktime_to_ns(fence->timestamp);
616
617                         complete_all(&mcs_compl->completion);
618
619                         /*
620                          * Setting mcs_handling_done inside the lock ensures
621                          * that at least one fence has mcs_handling_done set to
622                          * true before the wait for mcs finishes. This ensures at
623                          * least one CS will be set as completed when polling
624                          * mcs fences.
625                          */
626                         fence->mcs_handling_done = true;
627                 }
628
629                 spin_unlock(&mcs_compl->lock);
630         }
631         /* In case CS completed without mcs completion initialized */
632         fence->mcs_handling_done = true;
633 }
634
635 static inline void cs_release_sob_reset_handler(struct hl_device *hdev,
636                                         struct hl_cs *cs,
637                                         struct hl_cs_compl *hl_cs_cmpl)
638 {
639         /* Skip this handler if the cs wasn't submitted, to avoid putting
640          * the hw_sob twice, since this case was already handled at this point;
641          * also skip if the hw_sob pointer wasn't set.
642          */
643         if (!hl_cs_cmpl->hw_sob || !cs->submitted)
644                 return;
645
646         spin_lock(&hl_cs_cmpl->lock);
647
648         /*
649          * We get a refcount upon reservation of signals or signal/wait cs for the
650          * hw_sob object, and need to put it when the first staged cs
651          * (which contains the encaps signals) or cs signal/wait is completed.
652          */
653         if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
654                         (hl_cs_cmpl->type == CS_TYPE_WAIT) ||
655                         (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT) ||
656                         (!!hl_cs_cmpl->encaps_signals)) {
657                 dev_dbg(hdev->dev,
658                                 "CS 0x%llx type %d finished, sob_id: %d, sob_val: %u\n",
659                                 hl_cs_cmpl->cs_seq,
660                                 hl_cs_cmpl->type,
661                                 hl_cs_cmpl->hw_sob->sob_id,
662                                 hl_cs_cmpl->sob_val);
663
664                 hw_sob_put(hl_cs_cmpl->hw_sob);
665
666                 if (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)
667                         hdev->asic_funcs->reset_sob_group(hdev,
668                                         hl_cs_cmpl->sob_group);
669         }
670
671         spin_unlock(&hl_cs_cmpl->lock);
672 }
673
674 static void cs_do_release(struct kref *ref)
675 {
676         struct hl_cs *cs = container_of(ref, struct hl_cs, refcount);
677         struct hl_device *hdev = cs->ctx->hdev;
678         struct hl_cs_job *job, *tmp;
679         struct hl_cs_compl *hl_cs_cmpl =
680                         container_of(cs->fence, struct hl_cs_compl, base_fence);
681
682         cs->completed = true;
683
684         /*
685          * Although reaching here means that all external jobs have
686          * finished, because each one of them took a refcnt on the CS, we still
687          * need to go over the internal jobs and complete them. Otherwise, we
688          * will have leaked memory and, what's worse, the CS object (and
689          * potentially the CTX object) could be released while the JOB
690          * still holds a pointer to them (but no reference).
691          */
692         list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
693                 hl_complete_job(hdev, job);
694
695         if (!cs->submitted) {
696                 /*
697                  * In case the wait for signal CS was submitted, the fence put
698                  * occurs in init_signal_wait_cs() or collective_wait_init_cs()
699                  * right before hanging on the PQ.
700                  */
701                 if (cs->type == CS_TYPE_WAIT ||
702                                 cs->type == CS_TYPE_COLLECTIVE_WAIT)
703                         hl_fence_put(cs->signal_fence);
704
705                 goto out;
706         }
707
708         /* Need to update CI for all queue jobs that do not get a completion */
709         hl_hw_queue_update_ci(cs);
710
711         /* remove CS from CS mirror list */
712         spin_lock(&hdev->cs_mirror_lock);
713         list_del_init(&cs->mirror_node);
714         spin_unlock(&hdev->cs_mirror_lock);
715
716         cs_handle_tdr(hdev, cs);
717
718         if (cs->staged_cs) {
719                 /* the completion CS decrements reference for the entire
720                  * staged submission
721                  */
722                 if (cs->staged_last) {
723                         struct hl_cs *staged_cs, *tmp_cs;
724
725                         list_for_each_entry_safe(staged_cs, tmp_cs,
726                                         &cs->staged_cs_node, staged_cs_node)
727                                 staged_cs_put(hdev, staged_cs);
728                 }
729
730                 /* A staged CS will be a member in the list only after it
731                  * was submitted. We used 'cs_mirror_lock' when inserting
732                  * it into the list, so we will use it again when removing it
733                  */
734                 if (cs->submitted) {
735                         spin_lock(&hdev->cs_mirror_lock);
736                         list_del(&cs->staged_cs_node);
737                         spin_unlock(&hdev->cs_mirror_lock);
738                 }
739
740                 /* decrement refcount to handle when first staged cs
741                  * with encaps signals is completed.
742                  */
743                 if (hl_cs_cmpl->encaps_signals)
744                         kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount,
745                                         hl_encaps_release_handle_and_put_ctx);
746         }
747
748         if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) && cs->encaps_signals)
749                 kref_put(&cs->encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);
750
751 out:
752         /* Must be called before hl_ctx_put because inside we use ctx to get
753          * the device
754          */
755         hl_debugfs_remove_cs(cs);
756
757         hdev->shadow_cs_queue[cs->sequence & (hdev->asic_prop.max_pending_cs - 1)] = NULL;
758
759         /* We need to mark an error for a CS that wasn't submitted because in that case
760          * the hl fence release flow is different. Mainly, we don't need
761          * to handle hw_sob for signal/wait
762          */
763         if (cs->timedout)
764                 cs->fence->error = -ETIMEDOUT;
765         else if (cs->aborted)
766                 cs->fence->error = -EIO;
767         else if (!cs->submitted)
768                 cs->fence->error = -EBUSY;
769
770         if (unlikely(cs->skip_reset_on_timeout)) {
771                 dev_err(hdev->dev,
772                         "Command submission %llu completed after %llu (s)\n",
773                         cs->sequence,
774                         div_u64(jiffies - cs->submission_time_jiffies, HZ));
775         }
776
777         if (cs->timestamp) {
778                 cs->fence->timestamp = ktime_get();
779                 hl_push_cs_outcome(hdev, &cs->ctx->outcome_store, cs->sequence,
780                                    cs->fence->timestamp, cs->fence->error);
781         }
782
783         hl_ctx_put(cs->ctx);
784
785         complete_all(&cs->fence->completion);
786         complete_multi_cs(hdev, cs);
787
788         cs_release_sob_reset_handler(hdev, cs, hl_cs_cmpl);
789
790         hl_fence_put(cs->fence);
791
792         kfree(cs->jobs_in_queue_cnt);
793         kfree(cs);
794 }
795
796 static void cs_timedout(struct work_struct *work)
797 {
798         struct hl_device *hdev;
799         u64 event_mask = 0x0;
800         int rc;
801         struct hl_cs *cs = container_of(work, struct hl_cs,
802                                                  work_tdr.work);
803         bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false;
804
805         rc = cs_get_unless_zero(cs);
806         if (!rc)
807                 return;
808
809         if ((!cs->submitted) || (cs->completed)) {
810                 cs_put(cs);
811                 return;
812         }
813
814         hdev = cs->ctx->hdev;
815
816         if (likely(!skip_reset_on_timeout)) {
817                 if (hdev->reset_on_lockup)
818                         device_reset = true;
819                 else
820                         hdev->reset_info.needs_reset = true;
821
822                 /* Mark the CS as timed out so we won't try to cancel its TDR */
823                 cs->timedout = true;
824         }
825
826         /* Save only the first CS timeout parameters */
827         rc = atomic_cmpxchg(&hdev->captured_err_info.cs_timeout.write_enable, 1, 0);
828         if (rc) {
829                 hdev->captured_err_info.cs_timeout.timestamp = ktime_get();
830                 hdev->captured_err_info.cs_timeout.seq = cs->sequence;
831                 event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT;
832         }
833
834         switch (cs->type) {
835         case CS_TYPE_SIGNAL:
836                 dev_err(hdev->dev,
837                         "Signal command submission %llu has not finished in time!\n",
838                         cs->sequence);
839                 break;
840
841         case CS_TYPE_WAIT:
842                 dev_err(hdev->dev,
843                         "Wait command submission %llu has not finished in time!\n",
844                         cs->sequence);
845                 break;
846
847         case CS_TYPE_COLLECTIVE_WAIT:
848                 dev_err(hdev->dev,
849                         "Collective Wait command submission %llu has not finished in time!\n",
850                         cs->sequence);
851                 break;
852
853         default:
854                 dev_err(hdev->dev,
855                         "Command submission %llu has not finished in time!\n",
856                         cs->sequence);
857                 break;
858         }
859
860         rc = hl_state_dump(hdev);
861         if (rc)
862                 dev_err(hdev->dev, "Error during system state dump %d\n", rc);
863
864         cs_put(cs);
865
866         if (device_reset) {
867                 event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
868                 hl_device_cond_reset(hdev, HL_DRV_RESET_TDR, event_mask);
869         } else if (event_mask) {
870                 hl_notifier_event_send_all(hdev, event_mask);
871         }
872 }
873
874 static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
875                         enum hl_cs_type cs_type, u64 user_sequence,
876                         struct hl_cs **cs_new, u32 flags, u32 timeout)
877 {
878         struct hl_cs_counters_atomic *cntr;
879         struct hl_fence *other = NULL;
880         struct hl_cs_compl *cs_cmpl;
881         struct hl_cs *cs;
882         int rc;
883
884         cntr = &hdev->aggregated_cs_counters;
885
886         cs = kzalloc(sizeof(*cs), GFP_ATOMIC);
887         if (!cs)
888                 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
889
890         if (!cs) {
891                 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
892                 atomic64_inc(&cntr->out_of_mem_drop_cnt);
893                 return -ENOMEM;
894         }
895
896         /* increment refcnt for context */
897         hl_ctx_get(ctx);
898
899         cs->ctx = ctx;
900         cs->submitted = false;
901         cs->completed = false;
902         cs->type = cs_type;
903         cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP);
904         cs->encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);
905         cs->timeout_jiffies = timeout;
906         cs->skip_reset_on_timeout =
907                 hdev->reset_info.skip_reset_on_timeout ||
908                 !!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT);
909         cs->submission_time_jiffies = jiffies;
910         INIT_LIST_HEAD(&cs->job_list);
911         INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout);
912         kref_init(&cs->refcount);
913         spin_lock_init(&cs->job_lock);
914
915         cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_ATOMIC);
916         if (!cs_cmpl)
917                 cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_KERNEL);
918
919         if (!cs_cmpl) {
920                 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
921                 atomic64_inc(&cntr->out_of_mem_drop_cnt);
922                 rc = -ENOMEM;
923                 goto free_cs;
924         }
925
926         cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
927                         sizeof(*cs->jobs_in_queue_cnt), GFP_ATOMIC);
928         if (!cs->jobs_in_queue_cnt)
929                 cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
930                                 sizeof(*cs->jobs_in_queue_cnt), GFP_KERNEL);
931
932         if (!cs->jobs_in_queue_cnt) {
933                 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
934                 atomic64_inc(&cntr->out_of_mem_drop_cnt);
935                 rc = -ENOMEM;
936                 goto free_cs_cmpl;
937         }
938
939         cs_cmpl->hdev = hdev;
940         cs_cmpl->type = cs->type;
941         spin_lock_init(&cs_cmpl->lock);
942         cs->fence = &cs_cmpl->base_fence;
943
944         spin_lock(&ctx->cs_lock);
945
946         cs_cmpl->cs_seq = ctx->cs_sequence;
947         other = ctx->cs_pending[cs_cmpl->cs_seq &
948                                 (hdev->asic_prop.max_pending_cs - 1)];
949
950         if (other && !completion_done(&other->completion)) {
951                 /* If the following statement is true, it means we have reached
952                  * a point in which only part of the staged submission was
953                  * submitted and we don't have enough room in the 'cs_pending'
954                  * array for the rest of the submission.
955                  * This causes a deadlock because this CS will never be
956                  * completed as it depends on future CS's for completion.
957                  */
958                 if (other->cs_sequence == user_sequence)
959                         dev_crit_ratelimited(hdev->dev,
960                                 "Staged CS %llu deadlock due to lack of resources",
961                                 user_sequence);
962
963                 dev_dbg_ratelimited(hdev->dev,
964                         "Rejecting CS because of too many in-flight CSs\n");
965                 atomic64_inc(&ctx->cs_counters.max_cs_in_flight_drop_cnt);
966                 atomic64_inc(&cntr->max_cs_in_flight_drop_cnt);
967                 rc = -EAGAIN;
968                 goto free_fence;
969         }
970
971         /* init hl_fence */
972         hl_fence_init(&cs_cmpl->base_fence, cs_cmpl->cs_seq);
973
974         cs->sequence = cs_cmpl->cs_seq;
975
976         ctx->cs_pending[cs_cmpl->cs_seq &
977                         (hdev->asic_prop.max_pending_cs - 1)] =
978                                                         &cs_cmpl->base_fence;
979         ctx->cs_sequence++;
980
981         hl_fence_get(&cs_cmpl->base_fence);
982
983         hl_fence_put(other);
984
985         spin_unlock(&ctx->cs_lock);
986
987         *cs_new = cs;
988
989         return 0;
990
991 free_fence:
992         spin_unlock(&ctx->cs_lock);
993         kfree(cs->jobs_in_queue_cnt);
994 free_cs_cmpl:
995         kfree(cs_cmpl);
996 free_cs:
997         kfree(cs);
998         hl_ctx_put(ctx);
999         return rc;
1000 }
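/*
 * Illustrative note on the 'cs_pending' indexing above (hypothetical
 * numbers): with asic_prop.max_pending_cs == 64 (a power of two, so the
 * '& (max_pending_cs - 1)' turns the array into a ring), CS sequence 70
 * maps to slot 70 & 63 == 6. If the fence previously stored in slot 6
 * (that of CS 6) has not completed yet, allocate_cs() rejects the new CS
 * with -EAGAIN instead of overwriting a live fence.
 */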
1001
1002 static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
1003 {
1004         struct hl_cs_job *job, *tmp;
1005
1006         staged_cs_put(hdev, cs);
1007
1008         list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
1009                 hl_complete_job(hdev, job);
1010 }
1011
1012 /*
1013  * release_reserved_encaps_signals() - release reserved encapsulated signals.
1014  * @hdev: pointer to habanalabs device structure
1015  *
1016  * Release reserved encapsulated signals which weren't un-reserved, or for which a CS with
1017  * encapsulated signals wasn't submitted and thus weren't released as part of CS roll-back.
1018  * For these signals we also need to put the refcount of the H/W SOB which was taken at the
1019  * reservation.
1020  */
1021 static void release_reserved_encaps_signals(struct hl_device *hdev)
1022 {
1023         struct hl_ctx *ctx = hl_get_compute_ctx(hdev);
1024         struct hl_cs_encaps_sig_handle *handle;
1025         struct hl_encaps_signals_mgr *mgr;
1026         u32 id;
1027
1028         if (!ctx)
1029                 return;
1030
1031         mgr = &ctx->sig_mgr;
1032
1033         idr_for_each_entry(&mgr->handles, handle, id)
1034                 if (handle->cs_seq == ULLONG_MAX)
1035                         kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob_ctx);
1036
1037         hl_ctx_put(ctx);
1038 }
1039
1040 void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
1041 {
1042         int i;
1043         struct hl_cs *cs, *tmp;
1044
1045         if (!skip_wq_flush) {
1046                 flush_workqueue(hdev->ts_free_obj_wq);
1047
1048                 /* flush all completions before iterating over the CS mirror list in
1049                  * order to avoid a race with the release functions
1050                  */
1051                 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
1052                         flush_workqueue(hdev->cq_wq[i]);
1053
1054                 flush_workqueue(hdev->cs_cmplt_wq);
1055         }
1056
1057         /* Make sure we don't have leftovers in the CS mirror list */
1058         list_for_each_entry_safe(cs, tmp, &hdev->cs_mirror_list, mirror_node) {
1059                 cs_get(cs);
1060                 cs->aborted = true;
1061                 dev_warn_ratelimited(hdev->dev, "Killing CS %d.%llu\n",
1062                                         cs->ctx->asid, cs->sequence);
1063                 cs_rollback(hdev, cs);
1064                 cs_put(cs);
1065         }
1066
1067         force_complete_multi_cs(hdev);
1068
1069         release_reserved_encaps_signals(hdev);
1070 }
1071
1072 static void
1073 wake_pending_user_interrupt_threads(struct hl_user_interrupt *interrupt)
1074 {
1075         struct hl_user_pending_interrupt *pend, *temp;
1076         unsigned long flags;
1077
1078         spin_lock_irqsave(&interrupt->wait_list_lock, flags);
1079         list_for_each_entry_safe(pend, temp, &interrupt->wait_list_head, wait_list_node) {
1080                 if (pend->ts_reg_info.buf) {
1081                         list_del(&pend->wait_list_node);
1082                         hl_mmap_mem_buf_put(pend->ts_reg_info.buf);
1083                         hl_cb_put(pend->ts_reg_info.cq_cb);
1084                 } else {
1085                         pend->fence.error = -EIO;
1086                         complete_all(&pend->fence.completion);
1087                 }
1088         }
1089         spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
1090 }
1091
1092 void hl_release_pending_user_interrupts(struct hl_device *hdev)
1093 {
1094         struct asic_fixed_properties *prop = &hdev->asic_prop;
1095         struct hl_user_interrupt *interrupt;
1096         int i;
1097
1098         if (!prop->user_interrupt_count)
1099                 return;
1100
1101         /* We iterate through the user interrupt requests and wake up all
1102          * user threads waiting for interrupt completion. We iterate the
1103          * list under a lock; this is why all user threads, once awake,
1104          * will wait on the same lock and will release the waiting object upon
1105          * unlock.
1106          */
1107
1108         for (i = 0 ; i < prop->user_interrupt_count ; i++) {
1109                 interrupt = &hdev->user_interrupt[i];
1110                 wake_pending_user_interrupt_threads(interrupt);
1111         }
1112
1113         interrupt = &hdev->common_user_cq_interrupt;
1114         wake_pending_user_interrupt_threads(interrupt);
1115
1116         interrupt = &hdev->common_decoder_interrupt;
1117         wake_pending_user_interrupt_threads(interrupt);
1118 }
1119
1120 static void force_complete_cs(struct hl_device *hdev)
1121 {
1122         struct hl_cs *cs;
1123
1124         spin_lock(&hdev->cs_mirror_lock);
1125
1126         list_for_each_entry(cs, &hdev->cs_mirror_list, mirror_node) {
1127                 cs->fence->error = -EIO;
1128                 complete_all(&cs->fence->completion);
1129         }
1130
1131         spin_unlock(&hdev->cs_mirror_lock);
1132 }
1133
1134 void hl_abort_waitings_for_completion(struct hl_device *hdev)
1135 {
1136         force_complete_cs(hdev);
1137         force_complete_multi_cs(hdev);
1138         hl_release_pending_user_interrupts(hdev);
1139 }
1140
1141 static void job_wq_completion(struct work_struct *work)
1142 {
1143         struct hl_cs_job *job = container_of(work, struct hl_cs_job,
1144                                                 finish_work);
1145         struct hl_cs *cs = job->cs;
1146         struct hl_device *hdev = cs->ctx->hdev;
1147
1148         /* job is no longer needed */
1149         hl_complete_job(hdev, job);
1150 }
1151
1152 static void cs_completion(struct work_struct *work)
1153 {
1154         struct hl_cs *cs = container_of(work, struct hl_cs, finish_work);
1155         struct hl_device *hdev = cs->ctx->hdev;
1156         struct hl_cs_job *job, *tmp;
1157
1158         list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
1159                 hl_complete_job(hdev, job);
1160 }
1161
1162 static int validate_queue_index(struct hl_device *hdev,
1163                                 struct hl_cs_chunk *chunk,
1164                                 enum hl_queue_type *queue_type,
1165                                 bool *is_kernel_allocated_cb)
1166 {
1167         struct asic_fixed_properties *asic = &hdev->asic_prop;
1168         struct hw_queue_properties *hw_queue_prop;
1169
1170         /* This must be checked here to prevent out-of-bounds access to
1171          * hw_queues_props array
1172          */
1173         if (chunk->queue_index >= asic->max_queues) {
1174                 dev_err(hdev->dev, "Queue index %d is invalid\n",
1175                         chunk->queue_index);
1176                 return -EINVAL;
1177         }
1178
1179         hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];
1180
1181         if (hw_queue_prop->type == QUEUE_TYPE_NA) {
1182                 dev_err(hdev->dev, "Queue index %d is not applicable\n",
1183                         chunk->queue_index);
1184                 return -EINVAL;
1185         }
1186
1187         if (hw_queue_prop->binned) {
1188                 dev_err(hdev->dev, "Queue index %d is binned out\n",
1189                         chunk->queue_index);
1190                 return -EINVAL;
1191         }
1192
1193         if (hw_queue_prop->driver_only) {
1194                 dev_err(hdev->dev,
1195                         "Queue index %d is restricted for the kernel driver\n",
1196                         chunk->queue_index);
1197                 return -EINVAL;
1198         }
1199
1200         /* When hw queue type isn't QUEUE_TYPE_HW,
1201          * USER_ALLOC_CB flag shall be treated as "don't care".
1202          */
1203         if (hw_queue_prop->type == QUEUE_TYPE_HW) {
1204                 if (chunk->cs_chunk_flags & HL_CS_CHUNK_FLAGS_USER_ALLOC_CB) {
1205                         if (!(hw_queue_prop->cb_alloc_flags & CB_ALLOC_USER)) {
1206                                 dev_err(hdev->dev,
1207                                         "Queue index %d doesn't support user CB\n",
1208                                         chunk->queue_index);
1209                                 return -EINVAL;
1210                         }
1211
1212                         *is_kernel_allocated_cb = false;
1213                 } else {
1214                         if (!(hw_queue_prop->cb_alloc_flags &
1215                                         CB_ALLOC_KERNEL)) {
1216                                 dev_err(hdev->dev,
1217                                         "Queue index %d doesn't support kernel CB\n",
1218                                         chunk->queue_index);
1219                                 return -EINVAL;
1220                         }
1221
1222                         *is_kernel_allocated_cb = true;
1223                 }
1224         } else {
1225                 *is_kernel_allocated_cb = !!(hw_queue_prop->cb_alloc_flags
1226                                                 & CB_ALLOC_KERNEL);
1227         }
1228
1229         *queue_type = hw_queue_prop->type;
1230         return 0;
1231 }
1232
1233 static struct hl_cb *get_cb_from_cs_chunk(struct hl_device *hdev,
1234                                         struct hl_mem_mgr *mmg,
1235                                         struct hl_cs_chunk *chunk)
1236 {
1237         struct hl_cb *cb;
1238
1239         cb = hl_cb_get(mmg, chunk->cb_handle);
1240         if (!cb) {
1241                 dev_err(hdev->dev, "CB handle 0x%llx invalid\n", chunk->cb_handle);
1242                 return NULL;
1243         }
1244
1245         if ((chunk->cb_size < 8) || (chunk->cb_size > cb->size)) {
1246                 dev_err(hdev->dev, "CB size %u invalid\n", chunk->cb_size);
1247                 goto release_cb;
1248         }
1249
1250         atomic_inc(&cb->cs_cnt);
1251
1252         return cb;
1253
1254 release_cb:
1255         hl_cb_put(cb);
1256         return NULL;
1257 }
1258
1259 struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
1260                 enum hl_queue_type queue_type, bool is_kernel_allocated_cb)
1261 {
1262         struct hl_cs_job *job;
1263
1264         job = kzalloc(sizeof(*job), GFP_ATOMIC);
1265         if (!job)
1266                 job = kzalloc(sizeof(*job), GFP_KERNEL);
1267
1268         if (!job)
1269                 return NULL;
1270
1271         kref_init(&job->refcount);
1272         job->queue_type = queue_type;
1273         job->is_kernel_allocated_cb = is_kernel_allocated_cb;
1274
1275         if (is_cb_patched(hdev, job))
1276                 INIT_LIST_HEAD(&job->userptr_list);
1277
1278         if (job->queue_type == QUEUE_TYPE_EXT)
1279                 INIT_WORK(&job->finish_work, job_wq_completion);
1280
1281         return job;
1282 }
1283
1284 static enum hl_cs_type hl_cs_get_cs_type(u32 cs_type_flags)
1285 {
1286         if (cs_type_flags & HL_CS_FLAGS_SIGNAL)
1287                 return CS_TYPE_SIGNAL;
1288         else if (cs_type_flags & HL_CS_FLAGS_WAIT)
1289                 return CS_TYPE_WAIT;
1290         else if (cs_type_flags & HL_CS_FLAGS_COLLECTIVE_WAIT)
1291                 return CS_TYPE_COLLECTIVE_WAIT;
1292         else if (cs_type_flags & HL_CS_FLAGS_RESERVE_SIGNALS_ONLY)
1293                 return CS_RESERVE_SIGNALS;
1294         else if (cs_type_flags & HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY)
1295                 return CS_UNRESERVE_SIGNALS;
1296         else if (cs_type_flags & HL_CS_FLAGS_ENGINE_CORE_COMMAND)
1297                 return CS_TYPE_ENGINE_CORE;
1298         else
1299                 return CS_TYPE_DEFAULT;
1300 }
1301
1302 static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args)
1303 {
1304         struct hl_device *hdev = hpriv->hdev;
1305         struct hl_ctx *ctx = hpriv->ctx;
1306         u32 cs_type_flags, num_chunks;
1307         enum hl_device_status status;
1308         enum hl_cs_type cs_type;
1309         bool is_sync_stream;
1310
1311         if (!hl_device_operational(hdev, &status)) {
1312                 return -EBUSY;
1313         }
1314
1315         if ((args->in.cs_flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
1316                         !hdev->supports_staged_submission) {
1317                 dev_err(hdev->dev, "staged submission not supported");
1318                 return -EPERM;
1319         }
1320
1321         cs_type_flags = args->in.cs_flags & HL_CS_FLAGS_TYPE_MASK;
1322
1323         if (unlikely(cs_type_flags && !is_power_of_2(cs_type_flags))) {
1324                 dev_err(hdev->dev,
1325                         "CS type flags are mutually exclusive, context %d\n",
1326                         ctx->asid);
1327                 return -EINVAL;
1328         }
1329
1330         cs_type = hl_cs_get_cs_type(cs_type_flags);
1331         num_chunks = args->in.num_chunks_execute;
1332
1333         is_sync_stream = (cs_type == CS_TYPE_SIGNAL || cs_type == CS_TYPE_WAIT ||
1334                         cs_type == CS_TYPE_COLLECTIVE_WAIT);
1335
1336         if (unlikely(is_sync_stream && !hdev->supports_sync_stream)) {
1337                 dev_err(hdev->dev, "Sync stream CS is not supported\n");
1338                 return -EINVAL;
1339         }
1340
1341         if (cs_type == CS_TYPE_DEFAULT) {
1342                 if (!num_chunks) {
1343                         dev_err(hdev->dev, "Got execute CS with 0 chunks, context %d\n", ctx->asid);
1344                         return -EINVAL;
1345                 }
1346         } else if (is_sync_stream && num_chunks != 1) {
1347                 dev_err(hdev->dev,
1348                         "Sync stream CS mandates one chunk only, context %d\n",
1349                         ctx->asid);
1350                 return -EINVAL;
1351         }
1352
1353         return 0;
1354 }
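/*
 * Illustrative note on the type-flags check above (hypothetical flags):
 * passing both HL_CS_FLAGS_SIGNAL and HL_CS_FLAGS_WAIT sets two bits in
 * cs_type_flags, fails the is_power_of_2() test and the CS is rejected
 * with -EINVAL, since the CS type flags are mutually exclusive.
 */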
1355
1356 static int hl_cs_copy_chunk_array(struct hl_device *hdev,
1357                                         struct hl_cs_chunk **cs_chunk_array,
1358                                         void __user *chunks, u32 num_chunks,
1359                                         struct hl_ctx *ctx)
1360 {
1361         u32 size_to_copy;
1362
1363         if (num_chunks > HL_MAX_JOBS_PER_CS) {
1364                 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1365                 atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
1366                 dev_err(hdev->dev,
1367                         "Number of chunks can NOT be larger than %d\n",
1368                         HL_MAX_JOBS_PER_CS);
1369                 return -EINVAL;
1370         }
1371
1372         *cs_chunk_array = kmalloc_array(num_chunks, sizeof(**cs_chunk_array),
1373                                         GFP_ATOMIC);
1374         if (!*cs_chunk_array)
1375                 *cs_chunk_array = kmalloc_array(num_chunks,
1376                                         sizeof(**cs_chunk_array), GFP_KERNEL);
1377         if (!*cs_chunk_array) {
1378                 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1379                 atomic64_inc(&hdev->aggregated_cs_counters.out_of_mem_drop_cnt);
1380                 return -ENOMEM;
1381         }
1382
1383         size_to_copy = num_chunks * sizeof(struct hl_cs_chunk);
1384         if (copy_from_user(*cs_chunk_array, chunks, size_to_copy)) {
1385                 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1386                 atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
1387                 dev_err(hdev->dev, "Failed to copy cs chunk array from user\n");
1388                 kfree(*cs_chunk_array);
1389                 return -EFAULT;
1390         }
1391
1392         return 0;
1393 }
1394
1395 static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs,
1396                                 u64 sequence, u32 flags,
1397                                 u32 encaps_signal_handle)
1398 {
1399         if (!(flags & HL_CS_FLAGS_STAGED_SUBMISSION))
1400                 return 0;
1401
1402         cs->staged_last = !!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_LAST);
1403         cs->staged_first = !!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST);
1404
1405         if (cs->staged_first) {
1406                 /* Staged CS sequence is the first CS sequence */
1407                 INIT_LIST_HEAD(&cs->staged_cs_node);
1408                 cs->staged_sequence = cs->sequence;
1409
1410                 if (cs->encaps_signals)
1411                         cs->encaps_sig_hdl_id = encaps_signal_handle;
1412         } else {
1413                 /* User sequence will be validated in 'hl_hw_queue_schedule_cs'
1414                  * under the cs_mirror_lock
1415                  */
1416                 cs->staged_sequence = sequence;
1417         }
1418
1419         /* Increment CS reference if needed */
1420         staged_cs_get(hdev, cs);
1421
1422         cs->staged_cs = true;
1423
1424         return 0;
1425 }
1426
1427 static u32 get_stream_master_qid_mask(struct hl_device *hdev, u32 qid)
1428 {
1429         int i;
1430
1431         for (i = 0; i < hdev->stream_master_qid_arr_size; i++)
1432                 if (qid == hdev->stream_master_qid_arr[i])
1433                         return BIT(i);
1434
1435         return 0;
1436 }
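/*
 * Illustrative example (hypothetical array contents): if
 * stream_master_qid_arr is { 4, 8, 12, 16 }, then
 * get_stream_master_qid_mask(hdev, 12) returns BIT(2) == 0x4, while a QID
 * that is not a stream master returns 0 and contributes nothing to the
 * per-CS stream_master_qid_map.
 */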
1437
1438 static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
1439                                 u32 num_chunks, u64 *cs_seq, u32 flags,
1440                                 u32 encaps_signals_handle, u32 timeout,
1441                                 u16 *signal_initial_sob_count)
1442 {
1443         bool staged_mid, int_queues_only = true, using_hw_queues = false;
1444         struct hl_device *hdev = hpriv->hdev;
1445         struct hl_cs_chunk *cs_chunk_array;
1446         struct hl_cs_counters_atomic *cntr;
1447         struct hl_ctx *ctx = hpriv->ctx;
1448         struct hl_cs_job *job;
1449         struct hl_cs *cs;
1450         struct hl_cb *cb;
1451         u64 user_sequence;
1452         u8 stream_master_qid_map = 0;
1453         int rc, i;
1454
1455         cntr = &hdev->aggregated_cs_counters;
1456         user_sequence = *cs_seq;
1457         *cs_seq = ULLONG_MAX;
1458
1459         rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks,
1460                         hpriv->ctx);
1461         if (rc)
1462                 goto out;
1463
1464         if ((flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
1465                         !(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST))
1466                 staged_mid = true;
1467         else
1468                 staged_mid = false;
1469
1470         rc = allocate_cs(hdev, hpriv->ctx, CS_TYPE_DEFAULT,
1471                         staged_mid ? user_sequence : ULLONG_MAX, &cs, flags,
1472                         timeout);
1473         if (rc)
1474                 goto free_cs_chunk_array;
1475
1476         *cs_seq = cs->sequence;
1477
1478         hl_debugfs_add_cs(cs);
1479
1480         rc = cs_staged_submission(hdev, cs, user_sequence, flags,
1481                                                 encaps_signals_handle);
1482         if (rc)
1483                 goto free_cs_object;
1484
1485         /* If this is a staged submission we must return the staged sequence
1486          * rather than the internal CS sequence
1487          */
1488         if (cs->staged_cs)
1489                 *cs_seq = cs->staged_sequence;
1490
1491         /* Validate ALL the CS chunks before submitting the CS */
1492         for (i = 0 ; i < num_chunks ; i++) {
1493                 struct hl_cs_chunk *chunk = &cs_chunk_array[i];
1494                 enum hl_queue_type queue_type;
1495                 bool is_kernel_allocated_cb;
1496
1497                 rc = validate_queue_index(hdev, chunk, &queue_type,
1498                                                 &is_kernel_allocated_cb);
1499                 if (rc) {
1500                         atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1501                         atomic64_inc(&cntr->validation_drop_cnt);
1502                         goto free_cs_object;
1503                 }
1504
1505                 if (is_kernel_allocated_cb) {
1506                         cb = get_cb_from_cs_chunk(hdev, &hpriv->mem_mgr, chunk);
1507                         if (!cb) {
1508                                 atomic64_inc(
1509                                         &ctx->cs_counters.validation_drop_cnt);
1510                                 atomic64_inc(&cntr->validation_drop_cnt);
1511                                 rc = -EINVAL;
1512                                 goto free_cs_object;
1513                         }
1514                 } else {
1515                         cb = (struct hl_cb *) (uintptr_t) chunk->cb_handle;
1516                 }
1517
1518                 if (queue_type == QUEUE_TYPE_EXT ||
1519                                                 queue_type == QUEUE_TYPE_HW) {
1520                         int_queues_only = false;
1521
1522                         /*
1523                          * store which streams are being used for external/HW
1524                          * queues of this CS
1525                          */
1526                         if (hdev->supports_wait_for_multi_cs)
1527                                 stream_master_qid_map |=
1528                                         get_stream_master_qid_mask(hdev,
1529                                                         chunk->queue_index);
1530                 }
1531
1532                 if (queue_type == QUEUE_TYPE_HW)
1533                         using_hw_queues = true;
1534
1535                 job = hl_cs_allocate_job(hdev, queue_type,
1536                                                 is_kernel_allocated_cb);
1537                 if (!job) {
1538                         atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1539                         atomic64_inc(&cntr->out_of_mem_drop_cnt);
1540                         dev_err(hdev->dev, "Failed to allocate a new job\n");
1541                         rc = -ENOMEM;
1542                         if (is_kernel_allocated_cb)
1543                                 goto release_cb;
1544
1545                         goto free_cs_object;
1546                 }
1547
1548                 job->id = i + 1;
1549                 job->cs = cs;
1550                 job->user_cb = cb;
1551                 job->user_cb_size = chunk->cb_size;
1552                 job->hw_queue_id = chunk->queue_index;
1553
1554                 cs->jobs_in_queue_cnt[job->hw_queue_id]++;
1555                 cs->jobs_cnt++;
1556
1557                 list_add_tail(&job->cs_node, &cs->job_list);
1558
1559                 /*
1560                  * Increment the CS reference. When the CS reference is 0, the CS is
1561                  * done and can be signaled to the user and all its resources freed.
1562                  * Only increment for JOBs on external or H/W queues, because
1563                  * only for those JOBs do we get a completion
1564                  */
1565                 if (cs_needs_completion(cs) &&
1566                         (job->queue_type == QUEUE_TYPE_EXT ||
1567                                 job->queue_type == QUEUE_TYPE_HW))
1568                         cs_get(cs);
1569
1570                 hl_debugfs_add_job(hdev, job);
1571
1572                 rc = cs_parser(hpriv, job);
1573                 if (rc) {
1574                         atomic64_inc(&ctx->cs_counters.parsing_drop_cnt);
1575                         atomic64_inc(&cntr->parsing_drop_cnt);
1576                         dev_err(hdev->dev,
1577                                 "Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n",
1578                                 cs->ctx->asid, cs->sequence, job->id, rc);
1579                         goto free_cs_object;
1580                 }
1581         }
1582
1583         /* We allow a CS with any queue type combination as long as it does
1584          * not get a completion
1585          */
1586         if (int_queues_only && cs_needs_completion(cs)) {
1587                 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1588                 atomic64_inc(&cntr->validation_drop_cnt);
1589                 dev_err(hdev->dev,
1590                         "Reject CS %d.%llu since it contains only internal queues jobs and needs completion\n",
1591                         cs->ctx->asid, cs->sequence);
1592                 rc = -EINVAL;
1593                 goto free_cs_object;
1594         }
1595
1596         if (using_hw_queues)
1597                 INIT_WORK(&cs->finish_work, cs_completion);
1598
1599         /*
1600          * store the (external/HW queues) streams used by the CS in the
1601          * fence object for multi-CS completion
1602          */
1603         if (hdev->supports_wait_for_multi_cs)
1604                 cs->fence->stream_master_qid_map = stream_master_qid_map;
1605
1606         rc = hl_hw_queue_schedule_cs(cs);
1607         if (rc) {
1608                 if (rc != -EAGAIN)
1609                         dev_err(hdev->dev,
1610                                 "Failed to submit CS %d.%llu to H/W queues, error %d\n",
1611                                 cs->ctx->asid, cs->sequence, rc);
1612                 goto free_cs_object;
1613         }
1614
1615         *signal_initial_sob_count = cs->initial_sob_count;
1616
1617         rc = HL_CS_STATUS_SUCCESS;
1618         goto put_cs;
1619
1620 release_cb:
1621         atomic_dec(&cb->cs_cnt);
1622         hl_cb_put(cb);
1623 free_cs_object:
1624         cs_rollback(hdev, cs);
1625         *cs_seq = ULLONG_MAX;
1626         /* The path below is both for good and erroneous exits */
1627 put_cs:
1628         /* We finished with the CS in this function, so put the ref */
1629         cs_put(cs);
1630 free_cs_chunk_array:
1631         kfree(cs_chunk_array);
1632 out:
1633         return rc;
1634 }
1635
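/*
 * hl_cs_ctx_switch - perform the one-time context-switch and restore phase
 * for the context (if needed) before the execution CS, and wait for the
 * restore CS to complete.
 */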
1636 static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
1637                                 u64 *cs_seq)
1638 {
1639         struct hl_device *hdev = hpriv->hdev;
1640         struct hl_ctx *ctx = hpriv->ctx;
1641         bool need_soft_reset = false;
1642         int rc = 0, do_ctx_switch = 0;
1643         void __user *chunks;
1644         u32 num_chunks, tmp;
1645         u16 sob_count;
1646         int ret;
1647
1648         if (hdev->supports_ctx_switch)
1649                 do_ctx_switch = atomic_cmpxchg(&ctx->thread_ctx_switch_token, 1, 0);
1650
1651         if (do_ctx_switch || (args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) {
1652                 mutex_lock(&hpriv->restore_phase_mutex);
1653
1654                 if (do_ctx_switch) {
1655                         rc = hdev->asic_funcs->context_switch(hdev, ctx->asid);
1656                         if (rc) {
1657                                 dev_err_ratelimited(hdev->dev,
1658                                         "Failed to switch to context %d, rejecting CS! %d\n",
1659                                         ctx->asid, rc);
1660                                 /*
1661                                  * If we timed out, or if the device is not IDLE
1662                                  * while we want to do a context switch (-EBUSY),
1663                                  * we need to soft-reset because the QMAN is
1664                                  * probably stuck. However, we can't call reset
1665                                  * here directly because of a deadlock, so we
1666                                  * need to do it at the very end of this
1667                                  * function
1668                                  */
1669                                 if ((rc == -ETIMEDOUT) || (rc == -EBUSY))
1670                                         need_soft_reset = true;
1671                                 mutex_unlock(&hpriv->restore_phase_mutex);
1672                                 goto out;
1673                         }
1674                 }
1675
1676                 hdev->asic_funcs->restore_phase_topology(hdev);
1677
1678                 chunks = (void __user *) (uintptr_t) args->in.chunks_restore;
1679                 num_chunks = args->in.num_chunks_restore;
1680
1681                 if (!num_chunks) {
1682                         dev_dbg(hdev->dev,
1683                                 "Need to run restore phase but restore CS is empty\n");
1684                         rc = 0;
1685                 } else {
1686                         rc = cs_ioctl_default(hpriv, chunks, num_chunks,
1687                                         cs_seq, 0, 0, hdev->timeout_jiffies, &sob_count);
1688                 }
1689
1690                 mutex_unlock(&hpriv->restore_phase_mutex);
1691
1692                 if (rc) {
1693                         dev_err(hdev->dev,
1694                                 "Failed to submit restore CS for context %d (%d)\n",
1695                                 ctx->asid, rc);
1696                         goto out;
1697                 }
1698
1699                 /* Need to wait for restore completion before execution phase */
1700                 if (num_chunks) {
1701                         enum hl_cs_wait_status status;
1702 wait_again:
1703                         ret = _hl_cs_wait_ioctl(hdev, ctx,
1704                                         jiffies_to_usecs(hdev->timeout_jiffies),
1705                                         *cs_seq, &status, NULL);
1706                         if (ret) {
1707                                 if (ret == -ERESTARTSYS) {
1708                                         usleep_range(100, 200);
1709                                         goto wait_again;
1710                                 }
1711
1712                                 dev_err(hdev->dev,
1713                                         "Restore CS for context %d failed to complete %d\n",
1714                                         ctx->asid, ret);
1715                                 rc = -ENOEXEC;
1716                                 goto out;
1717                         }
1718                 }
1719
1720                 if (hdev->supports_ctx_switch)
1721                         ctx->thread_ctx_switch_wait_token = 1;
1722
1723         } else if (hdev->supports_ctx_switch && !ctx->thread_ctx_switch_wait_token) {
1724                 rc = hl_poll_timeout_memory(hdev,
1725                         &ctx->thread_ctx_switch_wait_token, tmp, (tmp == 1),
1726                         100, jiffies_to_usecs(hdev->timeout_jiffies), false);
1727
1728                 if (rc == -ETIMEDOUT) {
1729                         dev_err(hdev->dev,
1730                                 "context switch phase timeout (%d)\n", tmp);
1731                         goto out;
1732                 }
1733         }
1734
1735 out:
1736         if ((rc == -ETIMEDOUT || rc == -EBUSY) && (need_soft_reset))
1737                 hl_device_reset(hdev, 0);
1738
1739         return rc;
1740 }
1741
1742 /*
1743  * hl_cs_signal_sob_wraparound_handler: handle the SOB value wraparound case.
1744  * If the SOB value reaches the max value, move to the other SOB reserved
1745  * for the queue.
1746  * @hdev: pointer to device structure
1747  * @q_idx: stream queue index
1748  * @hw_sob: the H/W SOB used in this signal CS.
1749  * @count: signals count
1750  * @encaps_sig: tells whether it's a reservation for encaps signals or not.
1751  *
1752  * Note that this function must be called while hw_queues_lock is taken.
1753  */
1754 int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
1755                         struct hl_hw_sob **hw_sob, u32 count, bool encaps_sig)
1756
1757 {
1758         struct hl_sync_stream_properties *prop;
1759         struct hl_hw_sob *sob = *hw_sob, *other_sob;
1760         u8 other_sob_offset;
1761
1762         prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
1763
1764         hw_sob_get(sob);
1765
1766         /* check for wraparound */
1767         if (prop->next_sob_val + count >= HL_MAX_SOB_VAL) {
1768                 /*
1769                  * Decrement as we reached the max value.
1770                  * The release function won't be called here as we've
1771                  * just incremented the refcount right before calling this
1772                  * function.
1773                  */
1774                 hw_sob_put_err(sob);
1775
1776                 /*
1777                  * check the other sob value: if it is still in use then fail,
1778                  * otherwise make the switch
1779                  */
1780                 other_sob_offset = (prop->curr_sob_offset + 1) % HL_RSVD_SOBS;
1781                 other_sob = &prop->hw_sob[other_sob_offset];
1782
1783                 if (kref_read(&other_sob->kref) != 1) {
1784                         dev_err(hdev->dev, "error: Cannot switch SOBs q_idx: %d\n",
1785                                                                 q_idx);
1786                         return -EINVAL;
1787                 }
1788
1789                 /*
1790                  * next_sob_val always points to the next available signal
1791                  * in the sob, so in encaps signals it will be the next one
1792                  * after reserving the required amount.
1793                  */
1794                 if (encaps_sig)
1795                         prop->next_sob_val = count + 1;
1796                 else
1797                         prop->next_sob_val = count;
1798
1799                 /* only two SOBs are currently in use */
1800                 prop->curr_sob_offset = other_sob_offset;
1801                 *hw_sob = other_sob;
1802
1803                 /*
1804                  * Check if other_sob needs a reset, and if so do it before using
1805                  * it for the reservation or the next signal cs.
1806                  * We do it here, for both encaps and regular signal cs cases,
1807                  * in order to avoid a possible race of two kref_put calls on
1808                  * the sob, which could occur at the same time if we moved the
1809                  * sob reset (kref_put) to the cs_do_release function.
1810                  * In addition, if we have a combination of signal cs and
1811                  * encaps, and at the point we need to reset the sob there are
1812                  * no more reservations and only signal cs keep coming, then
1813                  * we need the signal cs to put the refcount and
1814                  * reset the sob.
1815                  */
1816                 if (other_sob->need_reset)
1817                         hw_sob_put(other_sob);
1818
1819                 if (encaps_sig) {
1820                         /* set reset indication for the sob */
1821                         sob->need_reset = true;
1822                         hw_sob_get(other_sob);
1823                 }
1824
1825                 dev_dbg(hdev->dev, "switched to SOB %d, q_idx: %d\n",
1826                                 prop->curr_sob_offset, q_idx);
1827         } else {
1828                 prop->next_sob_val += count;
1829         }
1830
1831         return 0;
1832 }
1833
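/*
 * cs_ioctl_extract_signal_seq - extract the signal CS sequence to wait on
 * from the user chunk. For encaps signals the sequence is taken directly
 * from the chunk; otherwise it is copied from the user's signal sequence
 * array, which currently must contain exactly one entry.
 */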
1834 static int cs_ioctl_extract_signal_seq(struct hl_device *hdev,
1835                 struct hl_cs_chunk *chunk, u64 *signal_seq, struct hl_ctx *ctx,
1836                 bool encaps_signals)
1837 {
1838         u64 *signal_seq_arr = NULL;
1839         u32 size_to_copy, signal_seq_arr_len;
1840         int rc = 0;
1841
1842         if (encaps_signals) {
1843                 *signal_seq = chunk->encaps_signal_seq;
1844                 return 0;
1845         }
1846
1847         signal_seq_arr_len = chunk->num_signal_seq_arr;
1848
1849         /* currently only one signal seq is supported */
1850         if (signal_seq_arr_len != 1) {
1851                 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1852                 atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
1853                 dev_err(hdev->dev,
1854                         "Wait for signal CS supports only one signal CS seq\n");
1855                 return -EINVAL;
1856         }
1857
1858         signal_seq_arr = kmalloc_array(signal_seq_arr_len,
1859                                         sizeof(*signal_seq_arr),
1860                                         GFP_ATOMIC);
1861         if (!signal_seq_arr)
1862                 signal_seq_arr = kmalloc_array(signal_seq_arr_len,
1863                                         sizeof(*signal_seq_arr),
1864                                         GFP_KERNEL);
1865         if (!signal_seq_arr) {
1866                 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1867                 atomic64_inc(&hdev->aggregated_cs_counters.out_of_mem_drop_cnt);
1868                 return -ENOMEM;
1869         }
1870
1871         size_to_copy = signal_seq_arr_len * sizeof(*signal_seq_arr);
1872         if (copy_from_user(signal_seq_arr,
1873                                 u64_to_user_ptr(chunk->signal_seq_arr),
1874                                 size_to_copy)) {
1875                 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1876                 atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
1877                 dev_err(hdev->dev,
1878                         "Failed to copy signal seq array from user\n");
1879                 rc = -EFAULT;
1880                 goto out;
1881         }
1882
1883         /* currently it is guaranteed to have only one signal seq */
1884         *signal_seq = signal_seq_arr[0];
1885
1886 out:
1887         kfree(signal_seq_arr);
1888
1889         return rc;
1890 }
1891
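/*
 * cs_ioctl_signal_wait_create_jobs - create the single kernel-CB job that
 * implements a signal or wait CS and add it to the CS job list.
 */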
1892 static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
1893                 struct hl_ctx *ctx, struct hl_cs *cs,
1894                 enum hl_queue_type q_type, u32 q_idx, u32 encaps_signal_offset)
1895 {
1896         struct hl_cs_counters_atomic *cntr;
1897         struct hl_cs_job *job;
1898         struct hl_cb *cb;
1899         u32 cb_size;
1900
1901         cntr = &hdev->aggregated_cs_counters;
1902
1903         job = hl_cs_allocate_job(hdev, q_type, true);
1904         if (!job) {
1905                 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1906                 atomic64_inc(&cntr->out_of_mem_drop_cnt);
1907                 dev_err(hdev->dev, "Failed to allocate a new job\n");
1908                 return -ENOMEM;
1909         }
1910
1911         if (cs->type == CS_TYPE_WAIT)
1912                 cb_size = hdev->asic_funcs->get_wait_cb_size(hdev);
1913         else
1914                 cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);
1915
1916         cb = hl_cb_kernel_create(hdev, cb_size,
1917                                 q_type == QUEUE_TYPE_HW && hdev->mmu_enable);
1918         if (!cb) {
1919                 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1920                 atomic64_inc(&cntr->out_of_mem_drop_cnt);
1921                 kfree(job);
1922                 return -EFAULT;
1923         }
1924
1925         job->id = 0;
1926         job->cs = cs;
1927         job->user_cb = cb;
1928         atomic_inc(&job->user_cb->cs_cnt);
1929         job->user_cb_size = cb_size;
1930         job->hw_queue_id = q_idx;
1931
1932         if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT)
1933                         && cs->encaps_signals)
1934                 job->encaps_sig_wait_offset = encaps_signal_offset;
1935         /*
1936          * No need for parsing - the user CB is already the patched CB.
1937          * We call hl_cb_destroy() for two reasons: we don't need the CB in
1938          * the CB idr anymore, and to decrement its refcount as it was
1939          * incremented inside hl_cb_kernel_create().
1940          */
1941         job->patched_cb = job->user_cb;
1942         job->job_cb_size = job->user_cb_size;
1943         hl_cb_destroy(&hdev->kernel_mem_mgr, cb->buf->handle);
1944
1945         /* increment refcount as for external queues we get completion */
1946         cs_get(cs);
1947
1948         cs->jobs_in_queue_cnt[job->hw_queue_id]++;
1949         cs->jobs_cnt++;
1950
1951         list_add_tail(&job->cs_node, &cs->job_list);
1952
1953         hl_debugfs_add_job(hdev, job);
1954
1955         return 0;
1956 }
1957
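/*
 * cs_ioctl_reserve_signals - reserve 'count' signals on the given stream
 * queue for encapsulated signals: allocate a handle, advance the queue's
 * SOB value and return the handle id, SOB address and signals count to
 * the caller.
 */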
1958 static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
1959                                 u32 q_idx, u32 count,
1960                                 u32 *handle_id, u32 *sob_addr,
1961                                 u32 *signals_count)
1962 {
1963         struct hw_queue_properties *hw_queue_prop;
1964         struct hl_sync_stream_properties *prop;
1965         struct hl_device *hdev = hpriv->hdev;
1966         struct hl_cs_encaps_sig_handle *handle;
1967         struct hl_encaps_signals_mgr *mgr;
1968         struct hl_hw_sob *hw_sob;
1969         int hdl_id;
1970         int rc = 0;
1971
1972         if (count >= HL_MAX_SOB_VAL) {
1973                 dev_err(hdev->dev, "signals count(%u) exceeds the max SOB value\n",
1974                                                 count);
1975                 rc = -EINVAL;
1976                 goto out;
1977         }
1978
1979         if (q_idx >= hdev->asic_prop.max_queues) {
1980                 dev_err(hdev->dev, "Queue index %d is invalid\n",
1981                         q_idx);
1982                 rc = -EINVAL;
1983                 goto out;
1984         }
1985
1986         hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
1987
1988         if (!hw_queue_prop->supports_sync_stream) {
1989                 dev_err(hdev->dev,
1990                         "Queue index %d does not support sync stream operations\n",
1991                                                                         q_idx);
1992                 rc = -EINVAL;
1993                 goto out;
1994         }
1995
1996         prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
1997
1998         handle = kzalloc(sizeof(*handle), GFP_KERNEL);
1999         if (!handle) {
2000                 rc = -ENOMEM;
2001                 goto out;
2002         }
2003
2004         handle->count = count;
2005
2006         hl_ctx_get(hpriv->ctx);
2007         handle->ctx = hpriv->ctx;
2008         mgr = &hpriv->ctx->sig_mgr;
2009
2010         spin_lock(&mgr->lock);
2011         hdl_id = idr_alloc(&mgr->handles, handle, 1, 0, GFP_ATOMIC);
2012         spin_unlock(&mgr->lock);
2013
2014         if (hdl_id < 0) {
2015                 dev_err(hdev->dev, "Failed to allocate IDR for a new signal reservation\n");
2016                 rc = -EINVAL;
2017                 goto put_ctx;
2018         }
2019
2020         handle->id = hdl_id;
2021         handle->q_idx = q_idx;
2022         handle->hdev = hdev;
2023         kref_init(&handle->refcount);
2024
2025         hdev->asic_funcs->hw_queues_lock(hdev);
2026
2027         hw_sob = &prop->hw_sob[prop->curr_sob_offset];
2028
2029         /*
2030          * Increment the SOB value by the user-requested count
2031          * to reserve those signals.
2032          * Check that the amount of signals to reserve does not exceed the max
2033          * sob value; if it does, switch to the other sob.
2034          */
2035         rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, count,
2036                                                                 true);
2037         if (rc) {
2038                 dev_err(hdev->dev, "Failed to switch SOB\n");
2039                 hdev->asic_funcs->hw_queues_unlock(hdev);
2040                 rc = -EINVAL;
2041                 goto remove_idr;
2042         }
2043         /* Set the hw_sob in the handle only after calling the sob wraparound
2044          * handler, since the sob could have changed.
2045          */
2046         handle->hw_sob = hw_sob;
2047
2048         /* store the current sob value for unreserve validity check, and
2049          * signal offset support
2050          */
2051         handle->pre_sob_val = prop->next_sob_val - handle->count;
2052
2053         handle->cs_seq = ULLONG_MAX;
2054
2055         *signals_count = prop->next_sob_val;
2056         hdev->asic_funcs->hw_queues_unlock(hdev);
2057
2058         *sob_addr = handle->hw_sob->sob_addr;
2059         *handle_id = hdl_id;
2060
2061         dev_dbg(hdev->dev,
2062                 "Signals reserved, sob_id: %d, sob addr: 0x%x, last sob_val: %u, q_idx: %d, hdl_id: %d\n",
2063                         hw_sob->sob_id, handle->hw_sob->sob_addr,
2064                         prop->next_sob_val - 1, q_idx, hdl_id);
2065         goto out;
2066
2067 remove_idr:
2068         spin_lock(&mgr->lock);
2069         idr_remove(&mgr->handles, hdl_id);
2070         spin_unlock(&mgr->lock);
2071
2072 put_ctx:
2073         hl_ctx_put(handle->ctx);
2074         kfree(handle);
2075
2076 out:
2077         return rc;
2078 }
2079
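/*
 * cs_ioctl_unreserve_signals - release a previous signals reservation: roll
 * back the queue's SOB value, drop the SOB refcount and free the handle,
 * as long as the SOB value did not change since the reservation.
 */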
2080 static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id)
2081 {
2082         struct hl_cs_encaps_sig_handle *encaps_sig_hdl;
2083         struct hl_sync_stream_properties *prop;
2084         struct hl_device *hdev = hpriv->hdev;
2085         struct hl_encaps_signals_mgr *mgr;
2086         struct hl_hw_sob *hw_sob;
2087         u32 q_idx, sob_addr;
2088         int rc = 0;
2089
2090         mgr = &hpriv->ctx->sig_mgr;
2091
2092         spin_lock(&mgr->lock);
2093         encaps_sig_hdl = idr_find(&mgr->handles, handle_id);
2094         if (encaps_sig_hdl) {
2095                 dev_dbg(hdev->dev, "unreserve signals, handle: %u, SOB:0x%x, count: %u\n",
2096                                 handle_id, encaps_sig_hdl->hw_sob->sob_addr,
2097                                         encaps_sig_hdl->count);
2098
2099                 hdev->asic_funcs->hw_queues_lock(hdev);
2100
2101                 q_idx = encaps_sig_hdl->q_idx;
2102                 prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
2103                 hw_sob = &prop->hw_sob[prop->curr_sob_offset];
2104                 sob_addr = hdev->asic_funcs->get_sob_addr(hdev, hw_sob->sob_id);
2105
2106                 /* Check if sob_val got out of sync due to other
2107                  * signal submission requests which were handled
2108                  * between the reserve-unreserve calls, or due to a SOB switch
2109                  * upon reaching the SOB max value.
2110                  */
2111                 if (encaps_sig_hdl->pre_sob_val + encaps_sig_hdl->count
2112                                 != prop->next_sob_val ||
2113                                 sob_addr != encaps_sig_hdl->hw_sob->sob_addr) {
2114                         dev_err(hdev->dev, "Cannot unreserve signals, SOB val ran out of sync, expected: %u, actual val: %u\n",
2115                                 encaps_sig_hdl->pre_sob_val,
2116                                 (prop->next_sob_val - encaps_sig_hdl->count));
2117
2118                         hdev->asic_funcs->hw_queues_unlock(hdev);
2119                         rc = -EINVAL;
2120                         goto out;
2121                 }
2122
2123                 /*
2124                  * Decrement the SOB value by the user-requested count
2125                  * to unreserve those signals
2126                  */
2127                 prop->next_sob_val -= encaps_sig_hdl->count;
2128
2129                 hdev->asic_funcs->hw_queues_unlock(hdev);
2130
2131                 hw_sob_put(hw_sob);
2132
2133                 /* Release the id and free allocated memory of the handle */
2134                 idr_remove(&mgr->handles, handle_id);
2135                 hl_ctx_put(encaps_sig_hdl->ctx);
2136                 kfree(encaps_sig_hdl);
2137         } else {
2138                 rc = -EINVAL;
2139                 dev_err(hdev->dev, "failed to unreserve signals, cannot find handle\n");
2140         }
2141 out:
2142         spin_unlock(&mgr->lock);
2143
2144         return rc;
2145 }
2146
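/*
 * cs_ioctl_signal_wait - handle submission of a signal, wait or collective
 * wait CS: validate the (single) chunk, look up the signal fence or encaps
 * signals handle for wait CSs, create the corresponding job and schedule
 * the CS on the H/W queues.
 */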
2147 static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
2148                                 void __user *chunks, u32 num_chunks,
2149                                 u64 *cs_seq, u32 flags, u32 timeout,
2150                                 u32 *signal_sob_addr_offset, u16 *signal_initial_sob_count)
2151 {
2152         struct hl_cs_encaps_sig_handle *encaps_sig_hdl = NULL;
2153         bool handle_found = false, is_wait_cs = false,
2154                         wait_cs_submitted = false,
2155                         cs_encaps_signals = false;
2156         struct hl_cs_chunk *cs_chunk_array, *chunk;
2157         bool staged_cs_with_encaps_signals = false;
2158         struct hw_queue_properties *hw_queue_prop;
2159         struct hl_device *hdev = hpriv->hdev;
2160         struct hl_cs_compl *sig_waitcs_cmpl;
2161         u32 q_idx, collective_engine_id = 0;
2162         struct hl_cs_counters_atomic *cntr;
2163         struct hl_fence *sig_fence = NULL;
2164         struct hl_ctx *ctx = hpriv->ctx;
2165         enum hl_queue_type q_type;
2166         struct hl_cs *cs;
2167         u64 signal_seq;
2168         int rc;
2169
2170         cntr = &hdev->aggregated_cs_counters;
2171         *cs_seq = ULLONG_MAX;
2172
2173         rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks,
2174                         ctx);
2175         if (rc)
2176                 goto out;
2177
2178         /* currently it is guaranteed to have only one chunk */
2179         chunk = &cs_chunk_array[0];
2180
2181         if (chunk->queue_index >= hdev->asic_prop.max_queues) {
2182                 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2183                 atomic64_inc(&cntr->validation_drop_cnt);
2184                 dev_err(hdev->dev, "Queue index %d is invalid\n",
2185                         chunk->queue_index);
2186                 rc = -EINVAL;
2187                 goto free_cs_chunk_array;
2188         }
2189
2190         q_idx = chunk->queue_index;
2191         hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
2192         q_type = hw_queue_prop->type;
2193
2194         if (!hw_queue_prop->supports_sync_stream) {
2195                 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2196                 atomic64_inc(&cntr->validation_drop_cnt);
2197                 dev_err(hdev->dev,
2198                         "Queue index %d does not support sync stream operations\n",
2199                         q_idx);
2200                 rc = -EINVAL;
2201                 goto free_cs_chunk_array;
2202         }
2203
2204         if (cs_type == CS_TYPE_COLLECTIVE_WAIT) {
2205                 if (!(hw_queue_prop->collective_mode == HL_COLLECTIVE_MASTER)) {
2206                         atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2207                         atomic64_inc(&cntr->validation_drop_cnt);
2208                         dev_err(hdev->dev,
2209                                 "Queue index %d is invalid\n", q_idx);
2210                         rc = -EINVAL;
2211                         goto free_cs_chunk_array;
2212                 }
2213
2214                 if (!hdev->nic_ports_mask) {
2215                         atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2216                         atomic64_inc(&cntr->validation_drop_cnt);
2217                         dev_err(hdev->dev,
2218                                 "Collective operations not supported when NIC ports are disabled");
2219                         rc = -EINVAL;
2220                         goto free_cs_chunk_array;
2221                 }
2222
2223                 collective_engine_id = chunk->collective_engine_id;
2224         }
2225
2226         is_wait_cs = !!(cs_type == CS_TYPE_WAIT ||
2227                         cs_type == CS_TYPE_COLLECTIVE_WAIT);
2228
2229         cs_encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);
2230
2231         if (is_wait_cs) {
2232                 rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq,
2233                                 ctx, cs_encaps_signals);
2234                 if (rc)
2235                         goto free_cs_chunk_array;
2236
2237                 if (cs_encaps_signals) {
2238                         /* check if the cs sequence has an encapsulated
2239                          * signals handle
2240                          */
2241                         struct idr *idp;
2242                         u32 id;
2243
2244                         spin_lock(&ctx->sig_mgr.lock);
2245                         idp = &ctx->sig_mgr.handles;
2246                         idr_for_each_entry(idp, encaps_sig_hdl, id) {
2247                                 if (encaps_sig_hdl->cs_seq == signal_seq) {
2248                                         /* Get a refcount to protect removing this handle from the idr,
2249                                          * needed when multiple wait cs are used with an offset
2250                                          * to wait on reserved encaps signals.
2251                                          * Since kref_put of this handle is executed outside the
2252                                          * current lock, it is possible that the handle refcount
2253                                          * is 0 but it has yet to be removed from the list. In this
2254                                          * case we need to consider the handle as not valid.
2255                                          */
2256                                         if (kref_get_unless_zero(&encaps_sig_hdl->refcount))
2257                                                 handle_found = true;
2258                                         break;
2259                                 }
2260                         }
2261                         spin_unlock(&ctx->sig_mgr.lock);
2262
2263                         if (!handle_found) {
2264                                 /* treat as signal CS already finished */
2265                                 dev_dbg(hdev->dev, "Cannot find encapsulated signals handle for seq 0x%llx\n",
2266                                                 signal_seq);
2267                                 rc = 0;
2268                                 goto free_cs_chunk_array;
2269                         }
2270
2271                         /* validate also the signal offset value */
2272                         if (chunk->encaps_signal_offset >
2273                                         encaps_sig_hdl->count) {
2274                                 dev_err(hdev->dev, "offset(%u) value exceeds max reserved signals count(%u)!\n",
2275                                                 chunk->encaps_signal_offset,
2276                                                 encaps_sig_hdl->count);
2277                                 rc = -EINVAL;
2278                                 goto free_cs_chunk_array;
2279                         }
2280                 }
2281
2282                 sig_fence = hl_ctx_get_fence(ctx, signal_seq);
2283                 if (IS_ERR(sig_fence)) {
2284                         atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2285                         atomic64_inc(&cntr->validation_drop_cnt);
2286                         dev_err(hdev->dev,
2287                                 "Failed to get signal CS with seq 0x%llx\n",
2288                                 signal_seq);
2289                         rc = PTR_ERR(sig_fence);
2290                         goto free_cs_chunk_array;
2291                 }
2292
2293                 if (!sig_fence) {
2294                         /* signal CS already finished */
2295                         rc = 0;
2296                         goto free_cs_chunk_array;
2297                 }
2298
2299                 sig_waitcs_cmpl =
2300                         container_of(sig_fence, struct hl_cs_compl, base_fence);
2301
2302                 staged_cs_with_encaps_signals = !!
2303                                 (sig_waitcs_cmpl->type == CS_TYPE_DEFAULT &&
2304                                 (flags & HL_CS_FLAGS_ENCAP_SIGNALS));
2305
2306                 if (sig_waitcs_cmpl->type != CS_TYPE_SIGNAL &&
2307                                 !staged_cs_with_encaps_signals) {
2308                         atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2309                         atomic64_inc(&cntr->validation_drop_cnt);
2310                         dev_err(hdev->dev,
2311                                 "CS seq 0x%llx is not of a signal/encaps-signal CS\n",
2312                                 signal_seq);
2313                         hl_fence_put(sig_fence);
2314                         rc = -EINVAL;
2315                         goto free_cs_chunk_array;
2316                 }
2317
2318                 if (completion_done(&sig_fence->completion)) {
2319                         /* signal CS already finished */
2320                         hl_fence_put(sig_fence);
2321                         rc = 0;
2322                         goto free_cs_chunk_array;
2323                 }
2324         }
2325
2326         rc = allocate_cs(hdev, ctx, cs_type, ULLONG_MAX, &cs, flags, timeout);
2327         if (rc) {
2328                 if (is_wait_cs)
2329                         hl_fence_put(sig_fence);
2330
2331                 goto free_cs_chunk_array;
2332         }
2333
2334         /*
2335          * Save the signal CS fence for later initialization right before
2336          * hanging the wait CS on the queue.
2337          * For the encaps signals case, we save the cs sequence and handle pointer
2338          * for later initialization.
2339          */
2340         if (is_wait_cs) {
2341                 cs->signal_fence = sig_fence;
2342                 /* Store the handle pointer, so we don't have to
2343                  * look for it again later in the flow,
2344                  * when we need to set the SOB info in hw_queue.
2345                  */
2346                 if (cs->encaps_signals)
2347                         cs->encaps_sig_hdl = encaps_sig_hdl;
2348         }
2349
2350         hl_debugfs_add_cs(cs);
2351
2352         *cs_seq = cs->sequence;
2353
2354         if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_SIGNAL)
2355                 rc = cs_ioctl_signal_wait_create_jobs(hdev, ctx, cs, q_type,
2356                                 q_idx, chunk->encaps_signal_offset);
2357         else if (cs_type == CS_TYPE_COLLECTIVE_WAIT)
2358                 rc = hdev->asic_funcs->collective_wait_create_jobs(hdev, ctx,
2359                                 cs, q_idx, collective_engine_id,
2360                                 chunk->encaps_signal_offset);
2361         else {
2362                 atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2363                 atomic64_inc(&cntr->validation_drop_cnt);
2364                 rc = -EINVAL;
2365         }
2366
2367         if (rc)
2368                 goto free_cs_object;
2369
2370         if (q_type == QUEUE_TYPE_HW)
2371                 INIT_WORK(&cs->finish_work, cs_completion);
2372
2373         rc = hl_hw_queue_schedule_cs(cs);
2374         if (rc) {
2375                 /* In case a wait cs failed here, it means the signal cs
2376                  * already completed. We want to free all its related objects,
2377                  * but we don't want to fail the ioctl.
2378                  */
2379                 if (is_wait_cs)
2380                         rc = 0;
2381                 else if (rc != -EAGAIN)
2382                         dev_err(hdev->dev,
2383                                 "Failed to submit CS %d.%llu to H/W queues, error %d\n",
2384                                 ctx->asid, cs->sequence, rc);
2385                 goto free_cs_object;
2386         }
2387
2388         *signal_sob_addr_offset = cs->sob_addr_offset;
2389         *signal_initial_sob_count = cs->initial_sob_count;
2390
2391         rc = HL_CS_STATUS_SUCCESS;
2392         if (is_wait_cs)
2393                 wait_cs_submitted = true;
2394         goto put_cs;
2395
2396 free_cs_object:
2397         cs_rollback(hdev, cs);
2398         *cs_seq = ULLONG_MAX;
2399         /* The path below is both for good and erroneous exits */
2400 put_cs:
2401         /* We finished with the CS in this function, so put the ref */
2402         cs_put(cs);
2403 free_cs_chunk_array:
2404         if (!wait_cs_submitted && cs_encaps_signals && handle_found && is_wait_cs)
2405                 kref_put(&encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);
2406         kfree(cs_chunk_array);
2407 out:
2408         return rc;
2409 }
2410
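/*
 * cs_ioctl_engine_cores - copy the engine core-ids array from user-space and
 * ask the ASIC to run or halt the given engine cores.
 */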
2411 static int cs_ioctl_engine_cores(struct hl_fpriv *hpriv, u64 engine_cores,
2412                                                 u32 num_engine_cores, u32 core_command)
2413 {
2414         int rc;
2415         struct hl_device *hdev = hpriv->hdev;
2416         void __user *engine_cores_arr;
2417         u32 *cores;
2418
2419         if (!num_engine_cores || num_engine_cores > hdev->asic_prop.num_engine_cores) {
2420                 dev_err(hdev->dev, "Number of engine cores %d is invalid\n", num_engine_cores);
2421                 return -EINVAL;
2422         }
2423
2424         if (core_command != HL_ENGINE_CORE_RUN && core_command != HL_ENGINE_CORE_HALT) {
2425                 dev_err(hdev->dev, "Engine core command is invalid\n");
2426                 return -EINVAL;
2427         }
2428
2429         engine_cores_arr = (void __user *) (uintptr_t) engine_cores;
2430         cores = kmalloc_array(num_engine_cores, sizeof(u32), GFP_KERNEL);
2431         if (!cores)
2432                 return -ENOMEM;
2433
2434         if (copy_from_user(cores, engine_cores_arr, num_engine_cores * sizeof(u32))) {
2435                 dev_err(hdev->dev, "Failed to copy core-ids array from user\n");
2436                 kfree(cores);
2437                 return -EFAULT;
2438         }
2439
2440         rc = hdev->asic_funcs->set_engine_cores(hdev, cores, num_engine_cores, core_command);
2441         kfree(cores);
2442
2443         return rc;
2444 }
2445
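/*
 * hl_cs_ioctl - main entry point of the CS IOCTL: run sanity checks and the
 * context-switch phase, dispatch to the relevant handler according to the
 * CS type, and fill the output arguments for the user.
 */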
2446 int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
2447 {
2448         union hl_cs_args *args = data;
2449         enum hl_cs_type cs_type = 0;
2450         u64 cs_seq = ULLONG_MAX;
2451         void __user *chunks;
2452         u32 num_chunks, flags, timeout,
2453                 signals_count = 0, sob_addr = 0, handle_id = 0;
2454         u16 sob_initial_count = 0;
2455         int rc;
2456
2457         rc = hl_cs_sanity_checks(hpriv, args);
2458         if (rc)
2459                 goto out;
2460
2461         rc = hl_cs_ctx_switch(hpriv, args, &cs_seq);
2462         if (rc)
2463                 goto out;
2464
2465         cs_type = hl_cs_get_cs_type(args->in.cs_flags &
2466                                         ~HL_CS_FLAGS_FORCE_RESTORE);
2467         chunks = (void __user *) (uintptr_t) args->in.chunks_execute;
2468         num_chunks = args->in.num_chunks_execute;
2469         flags = args->in.cs_flags;
2470
2471         /* In case this is a staged CS, user should supply the CS sequence */
2472         if ((flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
2473                         !(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST))
2474                 cs_seq = args->in.seq;
2475
2476         timeout = flags & HL_CS_FLAGS_CUSTOM_TIMEOUT
2477                         ? msecs_to_jiffies(args->in.timeout * 1000)
2478                         : hpriv->hdev->timeout_jiffies;
2479
2480         switch (cs_type) {
2481         case CS_TYPE_SIGNAL:
2482         case CS_TYPE_WAIT:
2483         case CS_TYPE_COLLECTIVE_WAIT:
2484                 rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks, num_chunks,
2485                                         &cs_seq, args->in.cs_flags, timeout,
2486                                         &sob_addr, &sob_initial_count);
2487                 break;
2488         case CS_RESERVE_SIGNALS:
2489                 rc = cs_ioctl_reserve_signals(hpriv,
2490                                         args->in.encaps_signals_q_idx,
2491                                         args->in.encaps_signals_count,
2492                                         &handle_id, &sob_addr, &signals_count);
2493                 break;
2494         case CS_UNRESERVE_SIGNALS:
2495                 rc = cs_ioctl_unreserve_signals(hpriv,
2496                                         args->in.encaps_sig_handle_id);
2497                 break;
2498         case CS_TYPE_ENGINE_CORE:
2499                 rc = cs_ioctl_engine_cores(hpriv, args->in.engine_cores,
2500                                 args->in.num_engine_cores, args->in.core_command);
2501                 break;
2502         default:
2503                 rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq,
2504                                                 args->in.cs_flags,
2505                                                 args->in.encaps_sig_handle_id,
2506                                                 timeout, &sob_initial_count);
2507                 break;
2508         }
2509 out:
2510         if (rc != -EAGAIN) {
2511                 memset(args, 0, sizeof(*args));
2512
2513                 switch (cs_type) {
2514                 case CS_RESERVE_SIGNALS:
2515                         args->out.handle_id = handle_id;
2516                         args->out.sob_base_addr_offset = sob_addr;
2517                         args->out.count = signals_count;
2518                         break;
2519                 case CS_TYPE_SIGNAL:
2520                         args->out.sob_base_addr_offset = sob_addr;
2521                         args->out.sob_count_before_submission = sob_initial_count;
2522                         args->out.seq = cs_seq;
2523                         break;
2524                 case CS_TYPE_DEFAULT:
2525                         args->out.sob_count_before_submission = sob_initial_count;
2526                         args->out.seq = cs_seq;
2527                         break;
2528                 default:
2529                         args->out.seq = cs_seq;
2530                         break;
2531                 }
2532
2533                 args->out.status = rc;
2534         }
2535
2536         return rc;
2537 }
2538
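/*
 * hl_wait_for_fence - wait (or poll, if timeout_us is 0) on a single CS fence
 * and report its status. If the fence is already gone, the stored CS outcome
 * is consulted instead.
 */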
2539 static int hl_wait_for_fence(struct hl_ctx *ctx, u64 seq, struct hl_fence *fence,
2540                                 enum hl_cs_wait_status *status, u64 timeout_us, s64 *timestamp)
2541 {
2542         struct hl_device *hdev = ctx->hdev;
2543         ktime_t timestamp_kt;
2544         long completion_rc;
2545         int rc = 0, error;
2546
2547         if (IS_ERR(fence)) {
2548                 rc = PTR_ERR(fence);
2549                 if (rc == -EINVAL)
2550                         dev_notice_ratelimited(hdev->dev,
2551                                 "Can't wait on CS %llu because current CS is at seq %llu\n",
2552                                 seq, ctx->cs_sequence);
2553                 return rc;
2554         }
2555
2556         if (!fence) {
2557                 if (!hl_pop_cs_outcome(&ctx->outcome_store, seq, &timestamp_kt, &error)) {
2558                         dev_dbg(hdev->dev,
2559                                 "Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
2560                                 seq, ctx->cs_sequence);
2561                         *status = CS_WAIT_STATUS_GONE;
2562                         return 0;
2563                 }
2564
2565                 completion_rc = 1;
2566                 goto report_results;
2567         }
2568
2569         if (!timeout_us) {
2570                 completion_rc = completion_done(&fence->completion);
2571         } else {
2572                 unsigned long timeout;
2573
2574                 timeout = (timeout_us == MAX_SCHEDULE_TIMEOUT) ?
2575                                 timeout_us : usecs_to_jiffies(timeout_us);
2576                 completion_rc =
2577                         wait_for_completion_interruptible_timeout(
2578                                 &fence->completion, timeout);
2579         }
2580
2581         error = fence->error;
2582         timestamp_kt = fence->timestamp;
2583
2584 report_results:
2585         if (completion_rc > 0) {
2586                 *status = CS_WAIT_STATUS_COMPLETED;
2587                 if (timestamp)
2588                         *timestamp = ktime_to_ns(timestamp_kt);
2589         } else {
2590                 *status = CS_WAIT_STATUS_BUSY;
2591         }
2592
2593         if (completion_rc == -ERESTARTSYS)
2594                 rc = completion_rc;
2595         else if (error == -ETIMEDOUT || error == -EIO)
2596                 rc = error;
2597
2598         return rc;
2599 }
2600
2601 /*
2602  * hl_cs_poll_fences - iterate CS fences to check for CS completion
2603  *
2604  * @mcs_data: multi-CS internal data
2605  * @mcs_compl: multi-CS completion structure
2606  *
2607  * @return 0 on success, otherwise non 0 error code
2608  *
2609  * The function iterates over all CS sequences in the list and sets a bit in
2610  * completion_bitmap for each completed CS.
2611  * While iterating, the function adds the stream map of each fence in the fence
2612  * array to the completion QID stream map, to be used by CSs to perform
2613  * completion to the multi-CS context.
2614  * This function shall be called after taking a context ref.
2615  */
2616 static int hl_cs_poll_fences(struct multi_cs_data *mcs_data, struct multi_cs_completion *mcs_compl)
2617 {
2618         struct hl_fence **fence_ptr = mcs_data->fence_arr;
2619         struct hl_device *hdev = mcs_data->ctx->hdev;
2620         int i, rc, arr_len = mcs_data->arr_len;
2621         u64 *seq_arr = mcs_data->seq_arr;
2622         ktime_t max_ktime, first_cs_time;
2623         enum hl_cs_wait_status status;
2624
2625         memset(fence_ptr, 0, arr_len * sizeof(struct hl_fence *));
2626
2627         /* get all fences under the same lock */
2628         rc = hl_ctx_get_fences(mcs_data->ctx, seq_arr, fence_ptr, arr_len);
2629         if (rc)
2630                 return rc;
2631
2632         /*
2633          * re-initialize the completion here to handle 2 possible cases:
2634          * 1. A CS will complete the multi-CS prior to clearing the completion, in
2635          *    which case the fence iteration is guaranteed to catch the CS completion.
2636          * 2. The completion will occur after re-init of the completion,
2637          *    in which case we will wake up immediately in wait_for_completion.
2638          */
2639         reinit_completion(&mcs_compl->completion);
2640
2641         /*
2642          * Set to the maximum time to verify the timestamp is valid: if at the end
2643          * this value is maintained, no timestamp was updated
2644          */
2645         max_ktime = ktime_set(KTIME_SEC_MAX, 0);
2646         first_cs_time = max_ktime;
2647
2648         for (i = 0; i < arr_len; i++, fence_ptr++) {
2649                 struct hl_fence *fence = *fence_ptr;
2650
2651                 /*
2652                  * In order to prevent the case where we wait until timeout even though a CS associated
2653                  * with the multi-CS actually completed, we do things in the below order:
2654                  * 1. For each fence, set its QID map in the multi-CS completion QID map. This way
2655                  *    any CS can, potentially, complete the multi CS for the specific QID (note
2656                  *    that once the completion is initialized, calling complete* and then waiting on the
2657                  *    completion will cause it to return at once)
2658                  * 2. Only after allowing multi-CS completion for the specific QID do we check whether
2659                  *    the specific CS already completed (and thus the wait-for-completion part will
2660                  *    be skipped). If the CS has not completed, it is guaranteed that the completing CS will
2661                  *    wake up the completion.
2662                  */
2663                 if (fence)
2664                         mcs_compl->stream_master_qid_map |= fence->stream_master_qid_map;
2665
2666                 /*
2667                  * The function won't sleep as it is called with timeout 0 (i.e.
2668                  * poll the fence)
2669                  */
2670                 rc = hl_wait_for_fence(mcs_data->ctx, seq_arr[i], fence, &status, 0, NULL);
2671                 if (rc) {
2672                         dev_err(hdev->dev,
2673                                 "wait_for_fence error :%d for CS seq %llu\n",
2674                                                                 rc, seq_arr[i]);
2675                         break;
2676                 }
2677
2678                 switch (status) {
2679                 case CS_WAIT_STATUS_BUSY:
2680                         /* CS did not finish, QID to wait on already stored */
2681                         break;
2682                 case CS_WAIT_STATUS_COMPLETED:
2683                         /*
2684                          * Use mcs_handling_done to avoid the possibility of mcs_data
2685                          * being returned to the user, indicating the CS completed, before it
2686                          * finished all of its mcs handling, to avoid a race the next time
2687                          * the user waits for mcs.
2688                          * Note: when reaching this case the fence is definitely not NULL,
2689                          *       but the NULL check was added to overcome static analysis
2690                          */
2691                         if (fence && !fence->mcs_handling_done) {
2692                                 /*
2693                                  * In case the multi CS is completed but MCS handling is not done,
2694                                  * we "complete" the multi CS to prevent it from waiting
2695                                  * until time-out, and the "multi-CS handling done" will have
2696                                  * another chance at the next iteration
2697                                  */
2698                                 complete_all(&mcs_compl->completion);
2699                                 break;
2700                         }
2701
2702                         mcs_data->completion_bitmap |= BIT(i);
2703                         /*
2704                          * For all completed CSs we take the earliest timestamp.
2705                          * For this we have to validate that the timestamp is the
2706                          * earliest of all timestamps so far.
2707                          */
2708                         if (fence && mcs_data->update_ts &&
2709                                         (ktime_compare(fence->timestamp, first_cs_time) < 0))
2710                                 first_cs_time = fence->timestamp;
2711                         break;
2712                 case CS_WAIT_STATUS_GONE:
2713                         mcs_data->update_ts = false;
2714                         mcs_data->gone_cs = true;
2715                         /*
2716                          * It is possible to get old sequence numbers from the user
2717                          * which relate to already completed CSs whose fences are
2718                          * already gone. In this case, the CS is set as completed but
2719                          * there is no need to consider its QID for mcs completion.
2720                          */
2721                         mcs_data->completion_bitmap |= BIT(i);
2722                         break;
2723                 default:
2724                         dev_err(hdev->dev, "Invalid fence status\n");
2725                         rc = -EINVAL;
2726                         break;
2727                 }
2728
2729         }
2730
2731         hl_fences_put(mcs_data->fence_arr, arr_len);
2732
2733         if (mcs_data->update_ts &&
2734                         (ktime_compare(first_cs_time, max_ktime) != 0))
2735                 mcs_data->timestamp = ktime_to_ns(first_cs_time);
2736
2737         return rc;
2738 }
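
/*
 * Illustrative sketch only (not driver code; the variable names are
 * hypothetical) of how a caller of hl_cs_poll_fences() can interpret the
 * resulting completion bitmap:
 *
 *	rc = hl_cs_poll_fences(&mcs_data, mcs_compl);
 *	if (!rc) {
 *		// bit i set means the CS at seq_arr[i] completed (or is gone)
 *		for (i = 0 ; i < mcs_data.arr_len ; i++)
 *			if (mcs_data.completion_bitmap & BIT(i))
 *				pr_debug("CS %llu done\n", mcs_data.seq_arr[i]);
 *	}
 */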
2739
2740 static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, u64 timeout_us, u64 seq,
2741                                 enum hl_cs_wait_status *status, s64 *timestamp)
2742 {
2743         struct hl_fence *fence;
2744         int rc = 0;
2745
2746         if (timestamp)
2747                 *timestamp = 0;
2748
2749         hl_ctx_get(ctx);
2750
2751         fence = hl_ctx_get_fence(ctx, seq);
2752
2753         rc = hl_wait_for_fence(ctx, seq, fence, status, timeout_us, timestamp);
2754         hl_fence_put(fence);
2755         hl_ctx_put(ctx);
2756
2757         return rc;
2758 }
2759
2760 static inline unsigned long hl_usecs64_to_jiffies(const u64 usecs)
2761 {
2762         if (usecs <= U32_MAX)
2763                 return usecs_to_jiffies(usecs);
2764
2765         /*
2766          * If the value in nanoseconds would overflow 64 bits, use the largest
2767          * 64-bit value.
2768          */
2769         if (usecs >= ((u64)(U64_MAX / NSEC_PER_USEC)))
2770                 return nsecs_to_jiffies(U64_MAX);
2771
2772         return nsecs_to_jiffies(usecs * NSEC_PER_USEC);
2773 }
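
/*
 * Illustrative examples only (not driver code) of how hl_usecs64_to_jiffies()
 * behaves around its two thresholds; the input values are arbitrary:
 *
 *	hl_usecs64_to_jiffies(1000);			// <= U32_MAX: usecs_to_jiffies()
 *	hl_usecs64_to_jiffies(10ULL * U32_MAX);		// > U32_MAX: converted to ns,
 *							//   then nsecs_to_jiffies()
 *	hl_usecs64_to_jiffies(U64_MAX);			// would overflow in ns, so it
 *							//   saturates to nsecs_to_jiffies(U64_MAX)
 */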
2774
2775 /*
2776  * hl_wait_multi_cs_completion_init - init completion structure
2777  *
2778  * @hdev: pointer to habanalabs device structure
2781  *
2782  * @return a valid completion struct pointer on success, otherwise an error pointer
2783  *
2784  * Up to MULTI_CS_MAX_USER_CTX calls can be issued concurrently to the driver.
2785  * The function takes the first available completion structure (by marking it
2786  * "used") and initializes its values.
2787  */
2788 static struct multi_cs_completion *hl_wait_multi_cs_completion_init(struct hl_device *hdev)
2789 {
2790         struct multi_cs_completion *mcs_compl;
2791         int i;
2792
2793         /* find free multi_cs completion structure */
2794         for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
2795                 mcs_compl = &hdev->multi_cs_completion[i];
2796                 spin_lock(&mcs_compl->lock);
2797                 if (!mcs_compl->used) {
2798                         mcs_compl->used = 1;
2799                         mcs_compl->timestamp = 0;
2800                         /*
2801                          * Init the QID map to 0 to avoid completion by CSs. The actual QID
2802                          * map of the multi-CS CSs will be set incrementally at a later stage.
2803                          */
2804                         mcs_compl->stream_master_qid_map = 0;
2805                         spin_unlock(&mcs_compl->lock);
2806                         break;
2807                 }
2808                 spin_unlock(&mcs_compl->lock);
2809         }
2810
2811         if (i == MULTI_CS_MAX_USER_CTX) {
2812                 dev_err(hdev->dev, "no available multi-CS completion structure\n");
2813                 return ERR_PTR(-ENOMEM);
2814         }
2815         return mcs_compl;
2816 }
2817
2818 /*
2819  * hl_wait_multi_cs_completion_fini - return completion structure and set as
2820  *                                    unused
2821  *
2822  * @mcs_compl: pointer to the completion structure
2823  */
2824 static void hl_wait_multi_cs_completion_fini(
2825                                         struct multi_cs_completion *mcs_compl)
2826 {
2827         /*
2828          * Free the completion structure; do it under the lock to stay in sync
2829          * with the thread that signals completion.
2830          */
2831         spin_lock(&mcs_compl->lock);
2832         mcs_compl->used = 0;
2833         spin_unlock(&mcs_compl->lock);
2834 }
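
/*
 * Minimal usage sketch (not driver code) of the init/fini pair above, as used
 * by the multi-CS wait flow below; error handling is elided:
 *
 *	struct multi_cs_completion *mcs_compl;
 *
 *	mcs_compl = hl_wait_multi_cs_completion_init(hdev);
 *	if (IS_ERR(mcs_compl))
 *		return PTR_ERR(mcs_compl);
 *
 *	// ... poll fences and/or wait for the completion ...
 *
 *	hl_wait_multi_cs_completion_fini(mcs_compl);
 */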
2835
2836 /*
2837  * hl_wait_multi_cs_completion - wait for first CS to complete
2838  *
2839  * @mcs_data: multi-CS internal data
2840  * @mcs_compl: multi-CS completion structure to wait on
2841  * @return 0 on success, otherwise non 0 error code
2842  */
2843 static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data,
2844                                                 struct multi_cs_completion *mcs_compl)
2845 {
2846         long completion_rc;
2847
2848         completion_rc = wait_for_completion_interruptible_timeout(&mcs_compl->completion,
2849                                                                         mcs_data->timeout_jiffies);
2850
2851         /* update timestamp */
2852         if (completion_rc > 0)
2853                 mcs_data->timestamp = mcs_compl->timestamp;
2854
2855         if (completion_rc == -ERESTARTSYS)
2856                 return completion_rc;
2857
2858         mcs_data->wait_status = completion_rc;
2859
2860         return 0;
2861 }
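
/*
 * Illustrative sketch (not driver code) of the caller contract of
 * hl_wait_multi_cs_completion(): on return with rc == 0, mcs_data->wait_status
 * holds the value returned by wait_for_completion_interruptible_timeout():
 *
 *	rc = hl_wait_multi_cs_completion(&mcs_data, mcs_compl);
 *	if (rc)
 *		return rc;			// interrupted by a signal
 *	if (mcs_data.wait_status == 0)
 *		// timed out
 *	else
 *		// completed early; wait_status is the time left in jiffies and
 *		// can be reused as the timeout of the next wait
 */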
2862
2863 /*
2864  * hl_multi_cs_completion_init - init array of multi-CS completion structures
2865  *
2866  * @hdev: pointer to habanalabs device structure
2867  */
2868 void hl_multi_cs_completion_init(struct hl_device *hdev)
2869 {
2870         struct multi_cs_completion *mcs_cmpl;
2871         int i;
2872
2873         for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
2874                 mcs_cmpl = &hdev->multi_cs_completion[i];
2875                 mcs_cmpl->used = 0;
2876                 spin_lock_init(&mcs_cmpl->lock);
2877                 init_completion(&mcs_cmpl->completion);
2878         }
2879 }
2880
2881 /*
2882  * hl_multi_cs_wait_ioctl - implementation of the multi-CS wait ioctl
2883  *
2884  * @hpriv: pointer to the private data of the fd
2885  * @data: pointer to multi-CS wait ioctl in/out args
2886  *
2887  */
2888 static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
2889 {
2890         struct multi_cs_completion *mcs_compl;
2891         struct hl_device *hdev = hpriv->hdev;
2892         struct multi_cs_data mcs_data = {};
2893         union hl_wait_cs_args *args = data;
2894         struct hl_ctx *ctx = hpriv->ctx;
2895         struct hl_fence **fence_arr;
2896         void __user *seq_arr;
2897         u32 size_to_copy;
2898         u64 *cs_seq_arr;
2899         u8 seq_arr_len;
2900         int rc;
2901
2902         if (!hdev->supports_wait_for_multi_cs) {
2903                 dev_err(hdev->dev, "Wait for multi CS is not supported\n");
2904                 return -EPERM;
2905         }
2906
2907         seq_arr_len = args->in.seq_arr_len;
2908
2909         if (seq_arr_len > HL_WAIT_MULTI_CS_LIST_MAX_LEN) {
2910                 dev_err(hdev->dev, "Can wait only up to %d CSs, input sequence is of length %u\n",
2911                                 HL_WAIT_MULTI_CS_LIST_MAX_LEN, seq_arr_len);
2912                 return -EINVAL;
2913         }
2914
2915         /* allocate memory for sequence array */
2916         cs_seq_arr =
2917                 kmalloc_array(seq_arr_len, sizeof(*cs_seq_arr), GFP_KERNEL);
2918         if (!cs_seq_arr)
2919                 return -ENOMEM;
2920
2921         /* copy CS sequence array from user */
2922         seq_arr = (void __user *) (uintptr_t) args->in.seq;
2923         size_to_copy = seq_arr_len * sizeof(*cs_seq_arr);
2924         if (copy_from_user(cs_seq_arr, seq_arr, size_to_copy)) {
2925                 dev_err(hdev->dev, "Failed to copy multi-cs sequence array from user\n");
2926                 rc = -EFAULT;
2927                 goto free_seq_arr;
2928         }
2929
2930         /* allocate array for the fences */
2931         fence_arr = kmalloc_array(seq_arr_len, sizeof(struct hl_fence *), GFP_KERNEL);
2932         if (!fence_arr) {
2933                 rc = -ENOMEM;
2934                 goto free_seq_arr;
2935         }
2936
2937         /* initialize the multi-CS internal data */
2938         mcs_data.ctx = ctx;
2939         mcs_data.seq_arr = cs_seq_arr;
2940         mcs_data.fence_arr = fence_arr;
2941         mcs_data.arr_len = seq_arr_len;
2942
2943         hl_ctx_get(ctx);
2944
2945         /* wait (with timeout) for the first CS to be completed */
2946         mcs_data.timeout_jiffies = hl_usecs64_to_jiffies(args->in.timeout_us);
2947         mcs_compl = hl_wait_multi_cs_completion_init(hdev);
2948         if (IS_ERR(mcs_compl)) {
2949                 rc = PTR_ERR(mcs_compl);
2950                 goto put_ctx;
2951         }
2952
2953         /* poll all CS fences, extract timestamp */
2954         mcs_data.update_ts = true;
2955         rc = hl_cs_poll_fences(&mcs_data, mcs_compl);
2956         /*
2957          * skip wait for CS completion when one of the below is true:
2958          * - an error on the poll function
2959          * - one or more CS in the list completed
2960          * - the user called ioctl with timeout 0
2961          */
2962         if (rc || mcs_data.completion_bitmap || !args->in.timeout_us)
2963                 goto completion_fini;
2964
2965         while (true) {
2966                 rc = hl_wait_multi_cs_completion(&mcs_data, mcs_compl);
2967                 if (rc || (mcs_data.wait_status == 0))
2968                         break;
2969
2970                 /*
2971                  * poll fences once again to update the CS map.
2972                  * no timestamp should be updated this time.
2973                  */
2974                 mcs_data.update_ts = false;
2975                 rc = hl_cs_poll_fences(&mcs_data, mcs_compl);
2976
2977                 if (rc || mcs_data.completion_bitmap)
2978                         break;
2979
2980                 /*
2981                  * If hl_wait_multi_cs_completion returned before the timeout (i.e.
2982                  * it got a completion), it was completed either by a CS in the multi-CS
2983                  * list (in which case the indication will be a non-empty completion_bitmap),
2984                  * or by a CS submitted to one of the shared stream masters but not in
2985                  * the multi-CS list. In the latter case we should wait again, but with
2986                  * the remaining timeout and with the timestamp reset to zero, to let a
2987                  * CS related to the current multi-CS set a new, relevant, timestamp.
2988                  */
2989                 mcs_data.timeout_jiffies = mcs_data.wait_status;
2990                 mcs_compl->timestamp = 0;
2991         }
2992
2993 completion_fini:
2994         hl_wait_multi_cs_completion_fini(mcs_compl);
2995
2996 put_ctx:
2997         hl_ctx_put(ctx);
2998         kfree(fence_arr);
2999
3000 free_seq_arr:
3001         kfree(cs_seq_arr);
3002
3003         if (rc == -ERESTARTSYS) {
3004                 dev_err_ratelimited(hdev->dev,
3005                                 "user process got signal while waiting for Multi-CS\n");
3006                 rc = -EINTR;
3007         }
3008
3009         if (rc)
3010                 return rc;
3011
3012         /* update output args */
3013         memset(args, 0, sizeof(*args));
3014
3015         if (mcs_data.completion_bitmap) {
3016                 args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
3017                 args->out.cs_completion_map = mcs_data.completion_bitmap;
3018
3019                 /* if the timestamp is not 0, it is valid */
3020                 if (mcs_data.timestamp) {
3021                         args->out.timestamp_nsec = mcs_data.timestamp;
3022                         args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
3023                 }
3024
3025                 /* if there is no valid timestamp, some CS was already gone */
3026                 if (!mcs_data.timestamp)
3027                         args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE;
3028         } else {
3029                 args->out.status = HL_WAIT_CS_STATUS_BUSY;
3030         }
3031
3032         return 0;
3033 }
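
/*
 * Rough user-space sketch (illustrative only; the ioctl request name and fd
 * handling are assumptions, see uapi/drm/habanalabs_accel.h) of issuing the
 * multi-CS wait handled above:
 *
 *	union hl_wait_cs_args args = {0};
 *	uint64_t seqs[2] = { seq0, seq1 };	// previously returned CS seqs
 *
 *	args.in.flags = HL_WAIT_CS_FLAGS_MULTI_CS;
 *	args.in.seq = (uint64_t)(uintptr_t)seqs;
 *	args.in.seq_arr_len = 2;
 *	args.in.timeout_us = 1000000;
 *	ioctl(fd, HL_IOCTL_WAIT_CS, &args);	// assumed request macro
 *	// args.out.cs_completion_map: bit i set if seqs[i] completed
 */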
3034
3035 static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
3036 {
3037         struct hl_device *hdev = hpriv->hdev;
3038         union hl_wait_cs_args *args = data;
3039         enum hl_cs_wait_status status;
3040         u64 seq = args->in.seq;
3041         s64 timestamp;
3042         int rc;
3043
3044         rc = _hl_cs_wait_ioctl(hdev, hpriv->ctx, args->in.timeout_us, seq, &status, &timestamp);
3045
3046         if (rc == -ERESTARTSYS) {
3047                 dev_err_ratelimited(hdev->dev,
3048                         "user process got signal while waiting for CS handle %llu\n",
3049                         seq);
3050                 return -EINTR;
3051         }
3052
3053         memset(args, 0, sizeof(*args));
3054
3055         if (rc) {
3056                 if (rc == -ETIMEDOUT) {
3057                         dev_err_ratelimited(hdev->dev,
3058                                 "CS %llu has timed-out while user process is waiting for it\n",
3059                                 seq);
3060                         args->out.status = HL_WAIT_CS_STATUS_TIMEDOUT;
3061                 } else if (rc == -EIO) {
3062                         dev_err_ratelimited(hdev->dev,
3063                                 "CS %llu has been aborted while user process is waiting for it\n",
3064                                 seq);
3065                         args->out.status = HL_WAIT_CS_STATUS_ABORTED;
3066                 }
3067                 return rc;
3068         }
3069
3070         if (timestamp) {
3071                 args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
3072                 args->out.timestamp_nsec = timestamp;
3073         }
3074
3075         switch (status) {
3076         case CS_WAIT_STATUS_GONE:
3077                 args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE;
3078                 fallthrough;
3079         case CS_WAIT_STATUS_COMPLETED:
3080                 args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
3081                 break;
3082         case CS_WAIT_STATUS_BUSY:
3083         default:
3084                 args->out.status = HL_WAIT_CS_STATUS_BUSY;
3085                 break;
3086         }
3087
3088         return 0;
3089 }
3090
3091 static int ts_buff_get_kernel_ts_record(struct hl_mmap_mem_buf *buf,
3092                                         struct hl_cb *cq_cb,
3093                                         u64 ts_offset, u64 cq_offset, u64 target_value,
3094                                         spinlock_t *wait_list_lock,
3095                                         struct hl_user_pending_interrupt **pend)
3096 {
3097         struct hl_ts_buff *ts_buff = buf->private;
3098         struct hl_user_pending_interrupt *requested_offset_record =
3099                                 (struct hl_user_pending_interrupt *)ts_buff->kernel_buff_address +
3100                                 ts_offset;
3101         struct hl_user_pending_interrupt *cb_last =
3102                         (struct hl_user_pending_interrupt *)ts_buff->kernel_buff_address +
3103                         (ts_buff->kernel_buff_size / sizeof(struct hl_user_pending_interrupt));
3104         unsigned long flags, iter_counter = 0;
3105         u64 current_cq_counter;
3106
3107         /* Validate that ts_offset does not exceed the last valid record */
3108         if (requested_offset_record >= cb_last) {
3109                 dev_err(buf->mmg->dev, "Ts offset exceeds max CB offset(0x%llx)\n",
3110                                                                 (u64)(uintptr_t)cb_last);
3111                 return -EINVAL;
3112         }
3113
3114 start_over:
3115         spin_lock_irqsave(wait_list_lock, flags);
3116
3117         /* Unregister only if we didn't reach the target value,
3118          * since in that case there will be no handling in irq context
3119          * and it is therefore safe to delete the node from the interrupt
3120          * list and then re-use it for another interrupt.
3121          */
3122         if (requested_offset_record->ts_reg_info.in_use) {
3123                 current_cq_counter = *requested_offset_record->cq_kernel_addr;
3124                 if (current_cq_counter < requested_offset_record->cq_target_value) {
3125                         list_del(&requested_offset_record->wait_list_node);
3126                         spin_unlock_irqrestore(wait_list_lock, flags);
3127
3128                         hl_mmap_mem_buf_put(requested_offset_record->ts_reg_info.buf);
3129                         hl_cb_put(requested_offset_record->ts_reg_info.cq_cb);
3130
3131                         dev_dbg(buf->mmg->dev,
3132                                 "ts node removed from interrupt list now can re-use\n");
3133                 } else {
3134                         dev_dbg(buf->mmg->dev,
3135                                 "ts node in middle of irq handling\n");
3136
3137                         /* irq handling is in progress, give it time to finish */
3138                         spin_unlock_irqrestore(wait_list_lock, flags);
3139                         usleep_range(1, 10);
3140                         if (++iter_counter == MAX_TS_ITER_NUM) {
3141                                 dev_err(buf->mmg->dev,
3142                                         "handling registration interrupt took too long!!\n");
3143                                 return -EINVAL;
3144                         }
3145
3146                         goto start_over;
3147                 }
3148         } else {
3149                 spin_unlock_irqrestore(wait_list_lock, flags);
3150         }
3151
3152         /* Fill up the new registration node info */
3153         requested_offset_record->ts_reg_info.in_use = 1;
3154         requested_offset_record->ts_reg_info.buf = buf;
3155         requested_offset_record->ts_reg_info.cq_cb = cq_cb;
3156         requested_offset_record->ts_reg_info.timestamp_kernel_addr =
3157                         (u64 *) ts_buff->user_buff_address + ts_offset;
3158         requested_offset_record->cq_kernel_addr =
3159                         (u64 *) cq_cb->kernel_address + cq_offset;
3160         requested_offset_record->cq_target_value = target_value;
3161
3162         *pend = requested_offset_record;
3163
3164         dev_dbg(buf->mmg->dev, "Found available node in TS kernel CB %p\n",
3165                 requested_offset_record);
3166         return 0;
3167 }
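
/*
 * Layout sketch (illustrative only) of the indexing done above: the timestamp
 * buffer is treated as an array of hl_user_pending_interrupt records and
 * ts_offset is an index into that array, not a byte offset:
 *
 *	record      = (struct hl_user_pending_interrupt *)kernel_buff_address
 *			+ ts_offset;
 *	num_records = kernel_buff_size / sizeof(struct hl_user_pending_interrupt);
 *	// valid range: 0 <= ts_offset < num_records
 */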
3168
3169 static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
3170                                 struct hl_mem_mgr *cb_mmg, struct hl_mem_mgr *mmg,
3171                                 u64 timeout_us, u64 cq_counters_handle, u64 cq_counters_offset,
3172                                 u64 target_value, struct hl_user_interrupt *interrupt,
3173                                 bool register_ts_record, u64 ts_handle, u64 ts_offset,
3174                                 u32 *status, u64 *timestamp)
3175 {
3176         struct hl_user_pending_interrupt *pend;
3177         struct hl_mmap_mem_buf *buf;
3178         struct hl_cb *cq_cb;
3179         unsigned long timeout, flags;
3180         long completion_rc;
3181         int rc = 0;
3182
3183         timeout = hl_usecs64_to_jiffies(timeout_us);
3184
3185         hl_ctx_get(ctx);
3186
3187         cq_cb = hl_cb_get(cb_mmg, cq_counters_handle);
3188         if (!cq_cb) {
3189                 rc = -EINVAL;
3190                 goto put_ctx;
3191         }
3192
3193         /* Validate the cq offset */
3194         if (((u64 *) cq_cb->kernel_address + cq_counters_offset) >=
3195                         ((u64 *) cq_cb->kernel_address + (cq_cb->size / sizeof(u64)))) {
3196                 rc = -EINVAL;
3197                 goto put_cq_cb;
3198         }
3199
3200         if (register_ts_record) {
3201                 dev_dbg(hdev->dev, "Timestamp registration: interrupt id: %u, ts offset: %llu, cq_offset: %llu\n",
3202                                         interrupt->interrupt_id, ts_offset, cq_counters_offset);
3203                 buf = hl_mmap_mem_buf_get(mmg, ts_handle);
3204                 if (!buf) {
3205                         rc = -EINVAL;
3206                         goto put_cq_cb;
3207                 }
3208
3209                 /* Find first available record */
3210                 rc = ts_buff_get_kernel_ts_record(buf, cq_cb, ts_offset,
3211                                                 cq_counters_offset, target_value,
3212                                                 &interrupt->wait_list_lock, &pend);
3213                 if (rc)
3214                         goto put_ts_buff;
3215         } else {
3216                 pend = kzalloc(sizeof(*pend), GFP_KERNEL);
3217                 if (!pend) {
3218                         rc = -ENOMEM;
3219                         goto put_cq_cb;
3220                 }
3221                 hl_fence_init(&pend->fence, ULONG_MAX);
3222                 pend->cq_kernel_addr = (u64 *) cq_cb->kernel_address + cq_counters_offset;
3223                 pend->cq_target_value = target_value;
3224         }
3225
3226         spin_lock_irqsave(&interrupt->wait_list_lock, flags);
3227
3228         /* We check the completion value, as the interrupt could have been
3229          * received before we added the node to the wait list
3230          */
3231         if (*pend->cq_kernel_addr >= target_value) {
3232                 if (register_ts_record)
3233                         pend->ts_reg_info.in_use = 0;
3234                 spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3235
3236                 *status = HL_WAIT_CS_STATUS_COMPLETED;
3237
3238                 if (register_ts_record) {
3239                         *pend->ts_reg_info.timestamp_kernel_addr = ktime_get_ns();
3240                         goto put_ts_buff;
3241                 } else {
3242                         pend->fence.timestamp = ktime_get();
3243                         goto set_timestamp;
3244                 }
3245         } else if (!timeout_us) {
3246                 spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3247                 *status = HL_WAIT_CS_STATUS_BUSY;
3248                 pend->fence.timestamp = ktime_get();
3249                 goto set_timestamp;
3250         }
3251
3252         /* Add pending user interrupt to relevant list for the interrupt
3253          * handler to monitor.
3254          * Note that we cannot keep the list sorted by target value (in order
3255          * to shorten the traversal loop), since the same list can hold nodes
3256          * for different cq counter handles.
3257          */
3258         list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head);
3259         spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3260
3261         if (register_ts_record) {
3262                 rc = *status = HL_WAIT_CS_STATUS_COMPLETED;
3263                 goto ts_registration_exit;
3264         }
3265
3266         /* Wait for interrupt handler to signal completion */
3267         completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion,
3268                                                                 timeout);
3269         if (completion_rc > 0) {
3270                 *status = HL_WAIT_CS_STATUS_COMPLETED;
3271         } else {
3272                 if (completion_rc == -ERESTARTSYS) {
3273                         dev_err_ratelimited(hdev->dev,
3274                                         "user process got signal while waiting for interrupt ID %d\n",
3275                                         interrupt->interrupt_id);
3276                         rc = -EINTR;
3277                         *status = HL_WAIT_CS_STATUS_ABORTED;
3278                 } else {
3279                         if (pend->fence.error == -EIO) {
3280                                 dev_err_ratelimited(hdev->dev,
3281                                                 "interrupt based wait ioctl aborted(error:%d) due to a reset cycle initiated\n",
3282                                                 pend->fence.error);
3283                                 rc = -EIO;
3284                                 *status = HL_WAIT_CS_STATUS_ABORTED;
3285                         } else {
3286                                 /* The wait has timed-out. We don't know anything beyond that
3287                                  * because the workload wasn't submitted through the driver.
3288                                  * Therefore, from driver's perspective, the workload is still
3289                                  * executing.
3290                                  */
3291                                 rc = 0;
3292                                 *status = HL_WAIT_CS_STATUS_BUSY;
3293                         }
3294                 }
3295         }
3296
3297         /*
3298          * We remove the node from the list here, rather than in the irq handler,
3299          * to cover the completion timeout case. If it is a registration
3300          * of a ts record, the node will be deleted in the irq handler after
3301          * we reach the target value.
3302          */
3303         spin_lock_irqsave(&interrupt->wait_list_lock, flags);
3304         list_del(&pend->wait_list_node);
3305         spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3306
3307 set_timestamp:
3308         *timestamp = ktime_to_ns(pend->fence.timestamp);
3309         kfree(pend);
3310         hl_cb_put(cq_cb);
3311 ts_registration_exit:
3312         hl_ctx_put(ctx);
3313
3314         return rc;
3315
3316 put_ts_buff:
3317         hl_mmap_mem_buf_put(buf);
3318 put_cq_cb:
3319         hl_cb_put(cq_cb);
3320 put_ctx:
3321         hl_ctx_put(ctx);
3322
3323         return rc;
3324 }
3325
3326 static int _hl_interrupt_wait_ioctl_user_addr(struct hl_device *hdev, struct hl_ctx *ctx,
3327                                 u64 timeout_us, u64 user_address,
3328                                 u64 target_value, struct hl_user_interrupt *interrupt,
3329                                 u32 *status,
3330                                 u64 *timestamp)
3331 {
3332         struct hl_user_pending_interrupt *pend;
3333         unsigned long timeout, flags;
3334         u64 completion_value;
3335         long completion_rc;
3336         int rc = 0;
3337
3338         timeout = hl_usecs64_to_jiffies(timeout_us);
3339
3340         hl_ctx_get(ctx);
3341
3342         pend = kzalloc(sizeof(*pend), GFP_KERNEL);
3343         if (!pend) {
3344                 hl_ctx_put(ctx);
3345                 return -ENOMEM;
3346         }
3347
3348         hl_fence_init(&pend->fence, ULONG_MAX);
3349
3350         /* Add pending user interrupt to relevant list for the interrupt
3351          * handler to monitor
3352          */
3353         spin_lock_irqsave(&interrupt->wait_list_lock, flags);
3354         list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head);
3355         spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3356
3357         /* We check the completion value, as the interrupt could have been
3358          * received before we added the node to the wait list
3359          */
3360         if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) {
3361                 dev_err(hdev->dev, "Failed to copy completion value from user\n");
3362                 rc = -EFAULT;
3363                 goto remove_pending_user_interrupt;
3364         }
3365
3366         if (completion_value >= target_value) {
3367                 *status = HL_WAIT_CS_STATUS_COMPLETED;
3368                 /* There was no interrupt, we assume the completion is now. */
3369                 pend->fence.timestamp = ktime_get();
3370         } else {
3371                 *status = HL_WAIT_CS_STATUS_BUSY;
3372         }
3373
3374         if (!timeout_us || (*status == HL_WAIT_CS_STATUS_COMPLETED))
3375                 goto remove_pending_user_interrupt;
3376
3377 wait_again:
3378         /* Wait for interrupt handler to signal completion */
3379         completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion,
3380                                                                                 timeout);
3381
3382         /* If the timeout did not expire, we need to perform the comparison.
3383          * If the comparison fails, keep waiting until the timeout expires.
3384          */
3385         if (completion_rc > 0) {
3386                 spin_lock_irqsave(&interrupt->wait_list_lock, flags);
3387                 /* reinit_completion must be called before we check the user
3388                  * completion value; otherwise, if the interrupt is received after
3389                  * the comparison and before the next wait_for_completion,
3390                  * we will reach the timeout and fail.
3391                  */
3392                 reinit_completion(&pend->fence.completion);
3393                 spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3394
3395                 if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) {
3396                         dev_err(hdev->dev, "Failed to copy completion value from user\n");
3397                         rc = -EFAULT;
3398
3399                         goto remove_pending_user_interrupt;
3400                 }
3401
3402                 if (completion_value >= target_value) {
3403                         *status = HL_WAIT_CS_STATUS_COMPLETED;
3404                 } else if (pend->fence.error) {
3405                         dev_err_ratelimited(hdev->dev,
3406                                 "interrupt based wait ioctl aborted(error:%d) due to a reset cycle initiated\n",
3407                                 pend->fence.error);
3408                         /* set the command completion status as ABORTED */
3409                         *status = HL_WAIT_CS_STATUS_ABORTED;
3410                 } else {
3411                         timeout = completion_rc;
3412                         goto wait_again;
3413                 }
3414         } else if (completion_rc == -ERESTARTSYS) {
3415                 dev_err_ratelimited(hdev->dev,
3416                         "user process got signal while waiting for interrupt ID %d\n",
3417                         interrupt->interrupt_id);
3418                 rc = -EINTR;
3419         } else {
3420                 /* The wait has timed-out. We don't know anything beyond that
3421                  * because the workload wasn't submitted through the driver.
3422                  * Therefore, from driver's perspective, the workload is still
3423                  * executing.
3424                  */
3425                 rc = 0;
3426                 *status = HL_WAIT_CS_STATUS_BUSY;
3427         }
3428
3429 remove_pending_user_interrupt:
3430         spin_lock_irqsave(&interrupt->wait_list_lock, flags);
3431         list_del(&pend->wait_list_node);
3432         spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3433
3434         *timestamp = ktime_to_ns(pend->fence.timestamp);
3435
3436         kfree(pend);
3437         hl_ctx_put(ctx);
3438
3439         return rc;
3440 }
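
/*
 * Sketch (illustrative only) of the re-wait pattern used above: when the
 * completion fires but the user value has not yet reached the target, the
 * positive return value of wait_for_completion_interruptible_timeout() is the
 * time left in jiffies, so it is reused as the next timeout:
 *
 *	completion_rc = wait_for_completion_interruptible_timeout(&c, timeout);
 *	if (completion_rc > 0 && value < target) {
 *		timeout = completion_rc;	// wait only for the remainder
 *		goto wait_again;
 *	}
 */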
3441
3442 static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
3443 {
3444         u16 interrupt_id, first_interrupt, last_interrupt;
3445         struct hl_device *hdev = hpriv->hdev;
3446         struct asic_fixed_properties *prop;
3447         struct hl_user_interrupt *interrupt;
3448         union hl_wait_cs_args *args = data;
3449         u32 status = HL_WAIT_CS_STATUS_BUSY;
3450         u64 timestamp = 0;
3451         int rc, int_idx;
3452
3453         prop = &hdev->asic_prop;
3454
3455         if (!(prop->user_interrupt_count + prop->user_dec_intr_count)) {
3456                 dev_err(hdev->dev, "no user interrupts allowed");
3457                 return -EPERM;
3458         }
3459
3460         interrupt_id = FIELD_GET(HL_WAIT_CS_FLAGS_INTERRUPT_MASK, args->in.flags);
3461
3462         first_interrupt = prop->first_available_user_interrupt;
3463         last_interrupt = prop->first_available_user_interrupt + prop->user_interrupt_count - 1;
3464
3465         if (interrupt_id < prop->user_dec_intr_count) {
3466
3467                 /* Check if the requested core is enabled */
3468                 if (!(prop->decoder_enabled_mask & BIT(interrupt_id))) {
3469                         dev_err(hdev->dev, "interrupt on a disabled core(%u) not allowed",
3470                                 interrupt_id);
3471                         return -EINVAL;
3472                 }
3473
3474                 interrupt = &hdev->user_interrupt[interrupt_id];
3475
3476         } else if (interrupt_id >= first_interrupt && interrupt_id <= last_interrupt) {
3477
3478                 int_idx = interrupt_id - first_interrupt + prop->user_dec_intr_count;
3479                 interrupt = &hdev->user_interrupt[int_idx];
3480
3481         } else if (interrupt_id == HL_COMMON_USER_CQ_INTERRUPT_ID) {
3482                 interrupt = &hdev->common_user_cq_interrupt;
3483         } else if (interrupt_id == HL_COMMON_DEC_INTERRUPT_ID) {
3484                 interrupt = &hdev->common_decoder_interrupt;
3485         } else {
3486                 dev_err(hdev->dev, "invalid user interrupt %u", interrupt_id);
3487                 return -EINVAL;
3488         }
3489
3490         if (args->in.flags & HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ)
3491                 rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx, &hpriv->mem_mgr, &hpriv->mem_mgr,
3492                                 args->in.interrupt_timeout_us, args->in.cq_counters_handle,
3493                                 args->in.cq_counters_offset,
3494                                 args->in.target, interrupt,
3495                                 !!(args->in.flags & HL_WAIT_CS_FLAGS_REGISTER_INTERRUPT),
3496                                 args->in.timestamp_handle, args->in.timestamp_offset,
3497                                 &status, &timestamp);
3498         else
3499                 rc = _hl_interrupt_wait_ioctl_user_addr(hdev, hpriv->ctx,
3500                                 args->in.interrupt_timeout_us, args->in.addr,
3501                                 args->in.target, interrupt, &status,
3502                                 &timestamp);
3503         if (rc)
3504                 return rc;
3505
3506         memset(args, 0, sizeof(*args));
3507         args->out.status = status;
3508
3509         if (timestamp) {
3510                 args->out.timestamp_nsec = timestamp;
3511                 args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
3512         }
3513
3514         return 0;
3515 }
3516
3517 int hl_wait_ioctl(struct hl_fpriv *hpriv, void *data)
3518 {
3519         struct hl_device *hdev = hpriv->hdev;
3520         union hl_wait_cs_args *args = data;
3521         u32 flags = args->in.flags;
3522         int rc;
3523
3524         /* If the device is not operational, or if an error has happened and the user should release
3525          * device, there is no point in waiting for any command submission or user interrupt.
3526          */
3527         if (!hl_device_operational(hpriv->hdev, NULL) || hdev->reset_info.watchdog_active)
3528                 return -EBUSY;
3529
3530         if (flags & HL_WAIT_CS_FLAGS_INTERRUPT)
3531                 rc = hl_interrupt_wait_ioctl(hpriv, data);
3532         else if (flags & HL_WAIT_CS_FLAGS_MULTI_CS)
3533                 rc = hl_multi_cs_wait_ioctl(hpriv, data);
3534         else
3535                 rc = hl_cs_wait_ioctl(hpriv, data);
3536
3537         return rc;
3538 }