drm/i915/gt: Replace direct submit with direct call to tasklet
[linux-2.6-microblaze.git] drivers/gpu/drm/i915/selftests/i915_request.c
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24
25 #include <linux/prime_numbers.h>
26 #include <linux/pm_qos.h>
27 #include <linux/sort.h>
28
29 #include "gem/i915_gem_pm.h"
30 #include "gem/selftests/mock_context.h"
31
32 #include "gt/intel_engine_heartbeat.h"
33 #include "gt/intel_engine_pm.h"
34 #include "gt/intel_engine_user.h"
35 #include "gt/intel_gt.h"
36 #include "gt/intel_gt_clock_utils.h"
37 #include "gt/intel_gt_requests.h"
38 #include "gt/selftest_engine_heartbeat.h"
39
40 #include "i915_random.h"
41 #include "i915_selftest.h"
42 #include "igt_flush_test.h"
43 #include "igt_live_test.h"
44 #include "igt_spinner.h"
45 #include "lib_sw_fence.h"
46
47 #include "mock_drm.h"
48 #include "mock_gem_device.h"
49
50 static unsigned int num_uabi_engines(struct drm_i915_private *i915)
51 {
52         struct intel_engine_cs *engine;
53         unsigned int count;
54
55         count = 0;
56         for_each_uabi_engine(engine, i915)
57                 count++;
58
59         return count;
60 }
61
62 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
63 {
64         return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
65 }
66
67 static int igt_add_request(void *arg)
68 {
69         struct drm_i915_private *i915 = arg;
70         struct i915_request *request;
71
72         /* Basic preliminary test to create a request and let it loose! */
73
74         request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
75         if (!request)
76                 return -ENOMEM;
77
78         i915_request_add(request);
79
80         return 0;
81 }
82
83 static int igt_wait_request(void *arg)
84 {
85         const long T = HZ / 4;
86         struct drm_i915_private *i915 = arg;
87         struct i915_request *request;
88         int err = -EINVAL;
89
90         /* Submit a request, then wait upon it */
91
92         request = mock_request(rcs0(i915)->kernel_context, T);
93         if (!request)
94                 return -ENOMEM;
95
96         i915_request_get(request);
97
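        /*
         * A zero timeout turns i915_request_wait() into a non-blocking
         * busy query; both it and a short blocking wait must report
         * -ETIME while the request has not even been submitted yet.
         */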
98         if (i915_request_wait(request, 0, 0) != -ETIME) {
99                 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
100                 goto out_request;
101         }
102
103         if (i915_request_wait(request, 0, T) != -ETIME) {
104                 pr_err("request wait succeeded (expected timeout before submit!)\n");
105                 goto out_request;
106         }
107
108         if (i915_request_completed(request)) {
109                 pr_err("request completed before submit!!\n");
110                 goto out_request;
111         }
112
113         i915_request_add(request);
114
115         if (i915_request_wait(request, 0, 0) != -ETIME) {
116                 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
117                 goto out_request;
118         }
119
120         if (i915_request_completed(request)) {
121                 pr_err("request completed immediately!\n");
122                 goto out_request;
123         }
124
125         if (i915_request_wait(request, 0, T / 2) != -ETIME) {
126                 pr_err("request wait succeeded (expected timeout!)\n");
127                 goto out_request;
128         }
129
130         if (i915_request_wait(request, 0, T) == -ETIME) {
131                 pr_err("request wait timed out!\n");
132                 goto out_request;
133         }
134
135         if (!i915_request_completed(request)) {
136                 pr_err("request not complete after waiting!\n");
137                 goto out_request;
138         }
139
140         if (i915_request_wait(request, 0, T) == -ETIME) {
141                 pr_err("request wait timed out when already complete!\n");
142                 goto out_request;
143         }
144
145         err = 0;
146 out_request:
147         i915_request_put(request);
148         mock_device_flush(i915);
149         return err;
150 }
151
152 static int igt_fence_wait(void *arg)
153 {
154         const long T = HZ / 4;
155         struct drm_i915_private *i915 = arg;
156         struct i915_request *request;
157         int err = -EINVAL;
158
159         /* Submit a request, treat it as a fence and wait upon it */
160
161         request = mock_request(rcs0(i915)->kernel_context, T);
162         if (!request)
163                 return -ENOMEM;
164
165         if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
166                 pr_err("fence wait success before submit (expected timeout)!\n");
167                 goto out;
168         }
169
170         i915_request_add(request);
171
172         if (dma_fence_is_signaled(&request->fence)) {
173                 pr_err("fence signaled immediately!\n");
174                 goto out;
175         }
176
177         if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
178                 pr_err("fence wait success after submit (expected timeout)!\n");
179                 goto out;
180         }
181
182         if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
183                 pr_err("fence wait timed out (expected success)!\n");
184                 goto out;
185         }
186
187         if (!dma_fence_is_signaled(&request->fence)) {
188                 pr_err("fence unsignaled after waiting!\n");
189                 goto out;
190         }
191
192         if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
193                 pr_err("fence wait timed out when complete (expected success)!\n");
194                 goto out;
195         }
196
197         err = 0;
198 out:
199         mock_device_flush(i915);
200         return err;
201 }
202
203 static int igt_request_rewind(void *arg)
204 {
205         struct drm_i915_private *i915 = arg;
206         struct i915_request *request, *vip;
207         struct i915_gem_context *ctx[2];
208         struct intel_context *ce;
209         int err = -EINVAL;
210
211         ctx[0] = mock_context(i915, "A");
212
213         ce = i915_gem_context_get_engine(ctx[0], RCS0);
214         GEM_BUG_ON(IS_ERR(ce));
215         request = mock_request(ce, 2 * HZ);
216         intel_context_put(ce);
217         if (!request) {
218                 err = -ENOMEM;
219                 goto err_context_0;
220         }
221
222         i915_request_get(request);
223         i915_request_add(request);
224
225         ctx[1] = mock_context(i915, "B");
226
227         ce = i915_gem_context_get_engine(ctx[1], RCS0);
228         GEM_BUG_ON(IS_ERR(ce));
229         vip = mock_request(ce, 0);
230         intel_context_put(ce);
231         if (!vip) {
232                 err = -ENOMEM;
233                 goto err_context_1;
234         }
235
236         /* Simulate preemption by manual reordering */
237         if (!mock_cancel_request(request)) {
238                 pr_err("failed to cancel request (already executed)!\n");
239                 i915_request_add(vip);
240                 goto err_context_1;
241         }
242         i915_request_get(vip);
243         i915_request_add(vip);
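        /*
         * Resubmit the cancelled request directly to the backend so
         * that it now queues up behind the already submitted vip
         * request, mimicking a rewind after preemption.
         */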
244         rcu_read_lock();
245         request->engine->submit_request(request);
246         rcu_read_unlock();
247
248
249         if (i915_request_wait(vip, 0, HZ) == -ETIME) {
250                 pr_err("timed out waiting for high priority request\n");
251                 goto err;
252         }
253
254         if (i915_request_completed(request)) {
255                 pr_err("low priority request already completed\n");
256                 goto err;
257         }
258
259         err = 0;
260 err:
261         i915_request_put(vip);
262 err_context_1:
263         mock_context_close(ctx[1]);
264         i915_request_put(request);
265 err_context_0:
266         mock_context_close(ctx[0]);
267         mock_device_flush(i915);
268         return err;
269 }
270
271 struct smoketest {
272         struct intel_engine_cs *engine;
273         struct i915_gem_context **contexts;
274         atomic_long_t num_waits, num_fences;
275         int ncontexts, max_batch;
276         struct i915_request *(*request_alloc)(struct intel_context *ce);
277 };
278
279 static struct i915_request *
280 __mock_request_alloc(struct intel_context *ce)
281 {
282         return mock_request(ce, 0);
283 }
284
285 static struct i915_request *
286 __live_request_alloc(struct intel_context *ce)
287 {
288         return intel_context_create_request(ce);
289 }
290
291 static int __igt_breadcrumbs_smoketest(void *arg)
292 {
293         struct smoketest *t = arg;
294         const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
295         const unsigned int total = 4 * t->ncontexts + 1;
296         unsigned int num_waits = 0, num_fences = 0;
297         struct i915_request **requests;
298         I915_RND_STATE(prng);
299         unsigned int *order;
300         int err = 0;
301
302         /*
303          * A very simple test to catch the most egregious of list handling bugs.
304          *
305          * At its heart, we simply create oodles of requests running across
306          * multiple kthreads and enable signaling on them, for the sole purpose
307          * of stressing our breadcrumb handling. The only inspection we do is
308          * that the fences were marked as signaled.
309          */
310
311         requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
312         if (!requests)
313                 return -ENOMEM;
314
315         order = i915_random_order(total, &prng);
316         if (!order) {
317                 err = -ENOMEM;
318                 goto out_requests;
319         }
320
321         while (!kthread_should_stop()) {
322                 struct i915_sw_fence *submit, *wait;
323                 unsigned int n, count;
324
325                 submit = heap_fence_create(GFP_KERNEL);
326                 if (!submit) {
327                         err = -ENOMEM;
328                         break;
329                 }
330
331                 wait = heap_fence_create(GFP_KERNEL);
332                 if (!wait) {
333                         i915_sw_fence_commit(submit);
334                         heap_fence_put(submit);
335                         err = -ENOMEM;
336                         break;
337                 }
338
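                /*
                 * "submit" gates the submission of every request built
                 * in this batch until all of them are constructed, and
                 * "wait" only completes once every request's fence has
                 * signaled.
                 */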
339                 i915_random_reorder(order, total, &prng);
340                 count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
341
342                 for (n = 0; n < count; n++) {
343                         struct i915_gem_context *ctx =
344                                 t->contexts[order[n] % t->ncontexts];
345                         struct i915_request *rq;
346                         struct intel_context *ce;
347
348                         ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
349                         GEM_BUG_ON(IS_ERR(ce));
350                         rq = t->request_alloc(ce);
351                         intel_context_put(ce);
352                         if (IS_ERR(rq)) {
353                                 err = PTR_ERR(rq);
354                                 count = n;
355                                 break;
356                         }
357
358                         err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
359                                                                submit,
360                                                                GFP_KERNEL);
361
362                         requests[n] = i915_request_get(rq);
363                         i915_request_add(rq);
364
365                         if (err >= 0)
366                                 err = i915_sw_fence_await_dma_fence(wait,
367                                                                     &rq->fence,
368                                                                     0,
369                                                                     GFP_KERNEL);
370
371                         if (err < 0) {
372                                 i915_request_put(rq);
373                                 count = n;
374                                 break;
375                         }
376                 }
377
378                 i915_sw_fence_commit(submit);
379                 i915_sw_fence_commit(wait);
380
381                 if (!wait_event_timeout(wait->wait,
382                                         i915_sw_fence_done(wait),
383                                         5 * HZ)) {
384                         struct i915_request *rq = requests[count - 1];
385
386                         pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
387                                atomic_read(&wait->pending), count,
388                                rq->fence.context, rq->fence.seqno,
389                                t->engine->name);
390                         GEM_TRACE_DUMP();
391
392                         intel_gt_set_wedged(t->engine->gt);
393                         GEM_BUG_ON(!i915_request_completed(rq));
394                         i915_sw_fence_wait(wait);
395                         err = -EIO;
396                 }
397
398                 for (n = 0; n < count; n++) {
399                         struct i915_request *rq = requests[n];
400
401                         if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
402                                       &rq->fence.flags)) {
403                                 pr_err("%llu:%llu was not signaled!\n",
404                                        rq->fence.context, rq->fence.seqno);
405                                 err = -EINVAL;
406                         }
407
408                         i915_request_put(rq);
409                 }
410
411                 heap_fence_put(wait);
412                 heap_fence_put(submit);
413
414                 if (err < 0)
415                         break;
416
417                 num_fences += count;
418                 num_waits++;
419
420                 cond_resched();
421         }
422
423         atomic_long_add(num_fences, &t->num_fences);
424         atomic_long_add(num_waits, &t->num_waits);
425
426         kfree(order);
427 out_requests:
428         kfree(requests);
429         return err;
430 }
431
432 static int mock_breadcrumbs_smoketest(void *arg)
433 {
434         struct drm_i915_private *i915 = arg;
435         struct smoketest t = {
436                 .engine = rcs0(i915),
437                 .ncontexts = 1024,
438                 .max_batch = 1024,
439                 .request_alloc = __mock_request_alloc
440         };
441         unsigned int ncpus = num_online_cpus();
442         struct task_struct **threads;
443         unsigned int n;
444         int ret = 0;
445
446         /*
447          * Smoketest our breadcrumb/signal handling for requests across multiple
448          * threads. A very simple test to only catch the most egregious of bugs.
449          * See __igt_breadcrumbs_smoketest();
450          */
451
452         threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
453         if (!threads)
454                 return -ENOMEM;
455
456         t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
457         if (!t.contexts) {
458                 ret = -ENOMEM;
459                 goto out_threads;
460         }
461
462         for (n = 0; n < t.ncontexts; n++) {
463                 t.contexts[n] = mock_context(t.engine->i915, "mock");
464                 if (!t.contexts[n]) {
465                         ret = -ENOMEM;
466                         goto out_contexts;
467                 }
468         }
469
470         for (n = 0; n < ncpus; n++) {
471                 threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
472                                          &t, "igt/%d", n);
473                 if (IS_ERR(threads[n])) {
474                         ret = PTR_ERR(threads[n]);
475                         ncpus = n;
476                         break;
477                 }
478
479                 get_task_struct(threads[n]);
480         }
481
482         yield(); /* start all threads before we begin */
483         msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
484
485         for (n = 0; n < ncpus; n++) {
486                 int err;
487
488                 err = kthread_stop(threads[n]);
489                 if (err < 0 && !ret)
490                         ret = err;
491
492                 put_task_struct(threads[n]);
493         }
494         pr_info("Completed %lu waits for %lu fences across %d cpus\n",
495                 atomic_long_read(&t.num_waits),
496                 atomic_long_read(&t.num_fences),
497                 ncpus);
498
499 out_contexts:
500         for (n = 0; n < t.ncontexts; n++) {
501                 if (!t.contexts[n])
502                         break;
503                 mock_context_close(t.contexts[n]);
504         }
505         kfree(t.contexts);
506 out_threads:
507         kfree(threads);
508         return ret;
509 }
510
511 int i915_request_mock_selftests(void)
512 {
513         static const struct i915_subtest tests[] = {
514                 SUBTEST(igt_add_request),
515                 SUBTEST(igt_wait_request),
516                 SUBTEST(igt_fence_wait),
517                 SUBTEST(igt_request_rewind),
518                 SUBTEST(mock_breadcrumbs_smoketest),
519         };
520         struct drm_i915_private *i915;
521         intel_wakeref_t wakeref;
522         int err = 0;
523
524         i915 = mock_gem_device();
525         if (!i915)
526                 return -ENOMEM;
527
528         with_intel_runtime_pm(&i915->runtime_pm, wakeref)
529                 err = i915_subtests(tests, i915);
530
531         mock_destroy_device(i915);
532
533         return err;
534 }
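
/*
 * Note: when the driver is built with CONFIG_DRM_I915_SELFTEST, these
 * mock subtests are typically invoked via the i915.mock_selftests
 * module parameter, and the live subtests below via
 * i915.live_selftests.
 */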
535
536 static int live_nop_request(void *arg)
537 {
538         struct drm_i915_private *i915 = arg;
539         struct intel_engine_cs *engine;
540         struct igt_live_test t;
541         int err = -ENODEV;
542
543         /*
544          * Submit various sized batches of empty requests, to each engine
545          * (individually), and wait for the batch to complete. We can check
546          * the overhead of submitting requests to the hardware.
547          */
548
549         for_each_uabi_engine(engine, i915) {
550                 unsigned long n, prime;
551                 IGT_TIMEOUT(end_time);
552                 ktime_t times[2] = {};
553
554                 err = igt_live_test_begin(&t, i915, __func__, engine->name);
555                 if (err)
556                         return err;
557
558                 intel_engine_pm_get(engine);
559                 for_each_prime_number_from(prime, 1, 8192) {
560                         struct i915_request *request = NULL;
561
562                         times[1] = ktime_get_raw();
563
564                         for (n = 0; n < prime; n++) {
565                                 i915_request_put(request);
566                                 request = i915_request_create(engine->kernel_context);
567                                 if (IS_ERR(request))
568                                         return PTR_ERR(request);
569
570                                 /*
571                                  * This space is left intentionally blank.
572                                  *
573                                  * We do not actually want to perform any
574                                  * action with this request, we just want
575                                  * to measure the latency in allocation
576                                  * and submission of our breadcrumbs -
577                                  * ensuring that the bare request is sufficient
578                                  * for the system to work (i.e. proper HEAD
579                                  * tracking of the rings, interrupt handling,
580                                  * etc). It also gives us the lowest bounds
581                                  * for latency.
582                                  */
583
584                                 i915_request_get(request);
585                                 i915_request_add(request);
586                         }
587                         i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
588                         i915_request_put(request);
589
590                         times[1] = ktime_sub(ktime_get_raw(), times[1]);
591                         if (prime == 1)
592                                 times[0] = times[1];
593
594                         if (__igt_timeout(end_time, NULL))
595                                 break;
596                 }
597                 intel_engine_pm_put(engine);
598
599                 err = igt_live_test_end(&t);
600                 if (err)
601                         return err;
602
603                 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
604                         engine->name,
605                         ktime_to_ns(times[0]),
606                         prime, div64_u64(ktime_to_ns(times[1]), prime));
607         }
608
609         return err;
610 }
611
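/*
 * Build a single-page batch containing just MI_BATCH_BUFFER_END, pinned
 * into the GGTT so that empty_request() below can dispatch it as a
 * (secure) batch with effectively zero execution cost.
 */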
612 static struct i915_vma *empty_batch(struct drm_i915_private *i915)
613 {
614         struct drm_i915_gem_object *obj;
615         struct i915_vma *vma;
616         u32 *cmd;
617         int err;
618
619         obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
620         if (IS_ERR(obj))
621                 return ERR_CAST(obj);
622
623         cmd = i915_gem_object_pin_map(obj, I915_MAP_WB);
624         if (IS_ERR(cmd)) {
625                 err = PTR_ERR(cmd);
626                 goto err;
627         }
628
629         *cmd = MI_BATCH_BUFFER_END;
630
631         __i915_gem_object_flush_map(obj, 0, 64);
632         i915_gem_object_unpin_map(obj);
633
634         intel_gt_chipset_flush(&i915->gt);
635
636         vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
637         if (IS_ERR(vma)) {
638                 err = PTR_ERR(vma);
639                 goto err;
640         }
641
642         err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
643         if (err)
644                 goto err;
645
646         /* Force the wait now to avoid including it in the benchmark */
647         err = i915_vma_sync(vma);
648         if (err)
649                 goto err_pin;
650
651         return vma;
652
653 err_pin:
654         i915_vma_unpin(vma);
655 err:
656         i915_gem_object_put(obj);
657         return ERR_PTR(err);
658 }
659
660 static struct i915_request *
661 empty_request(struct intel_engine_cs *engine,
662               struct i915_vma *batch)
663 {
664         struct i915_request *request;
665         int err;
666
667         request = i915_request_create(engine->kernel_context);
668         if (IS_ERR(request))
669                 return request;
670
671         err = engine->emit_bb_start(request,
672                                     batch->node.start,
673                                     batch->node.size,
674                                     I915_DISPATCH_SECURE);
675         if (err)
676                 goto out_request;
677
678         i915_request_get(request);
679 out_request:
680         i915_request_add(request);
681         return err ? ERR_PTR(err) : request;
682 }
683
684 static int live_empty_request(void *arg)
685 {
686         struct drm_i915_private *i915 = arg;
687         struct intel_engine_cs *engine;
688         struct igt_live_test t;
689         struct i915_vma *batch;
690         int err = 0;
691
692         /*
693          * Submit various sized batches of empty requests, to each engine
694          * (individually), and wait for the batch to complete. We can check
695          * the overhead of submitting requests to the hardware.
696          */
697
698         batch = empty_batch(i915);
699         if (IS_ERR(batch))
700                 return PTR_ERR(batch);
701
702         for_each_uabi_engine(engine, i915) {
703                 IGT_TIMEOUT(end_time);
704                 struct i915_request *request;
705                 unsigned long n, prime;
706                 ktime_t times[2] = {};
707
708                 err = igt_live_test_begin(&t, i915, __func__, engine->name);
709                 if (err)
710                         goto out_batch;
711
712                 intel_engine_pm_get(engine);
713
714                 /* Warmup / preload */
715                 request = empty_request(engine, batch);
716                 if (IS_ERR(request)) {
717                         err = PTR_ERR(request);
718                         intel_engine_pm_put(engine);
719                         goto out_batch;
720                 }
721                 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
722
723                 for_each_prime_number_from(prime, 1, 8192) {
724                         times[1] = ktime_get_raw();
725
726                         for (n = 0; n < prime; n++) {
727                                 i915_request_put(request);
728                                 request = empty_request(engine, batch);
729                                 if (IS_ERR(request)) {
730                                         err = PTR_ERR(request);
731                                         intel_engine_pm_put(engine);
732                                         goto out_batch;
733                                 }
734                         }
735                         i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
736
737                         times[1] = ktime_sub(ktime_get_raw(), times[1]);
738                         if (prime == 1)
739                                 times[0] = times[1];
740
741                         if (__igt_timeout(end_time, NULL))
742                                 break;
743                 }
744                 i915_request_put(request);
745                 intel_engine_pm_put(engine);
746
747                 err = igt_live_test_end(&t);
748                 if (err)
749                         goto out_batch;
750
751                 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
752                         engine->name,
753                         ktime_to_ns(times[0]),
754                         prime, div64_u64(ktime_to_ns(times[1]), prime));
755         }
756
757 out_batch:
758         i915_vma_unpin(batch);
759         i915_vma_put(batch);
760         return err;
761 }
762
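/*
 * Build a batch that branches back to its own start (using the
 * gen-appropriate MI_BATCH_BUFFER_START encoding), so any request
 * executing it spins forever. The loop is only broken once
 * recursive_batch_resolve() rewrites the first dword with
 * MI_BATCH_BUFFER_END.
 */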
763 static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
764 {
765         struct drm_i915_gem_object *obj;
766         const int gen = INTEL_GEN(i915);
767         struct i915_vma *vma;
768         u32 *cmd;
769         int err;
770
771         obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
772         if (IS_ERR(obj))
773                 return ERR_CAST(obj);
774
775         vma = i915_vma_instance(obj, i915->gt.vm, NULL);
776         if (IS_ERR(vma)) {
777                 err = PTR_ERR(vma);
778                 goto err;
779         }
780
781         err = i915_vma_pin(vma, 0, 0, PIN_USER);
782         if (err)
783                 goto err;
784
785         cmd = i915_gem_object_pin_map(obj, I915_MAP_WC);
786         if (IS_ERR(cmd)) {
787                 err = PTR_ERR(cmd);
788                 goto err;
789         }
790
791         if (gen >= 8) {
792                 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
793                 *cmd++ = lower_32_bits(vma->node.start);
794                 *cmd++ = upper_32_bits(vma->node.start);
795         } else if (gen >= 6) {
796                 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
797                 *cmd++ = lower_32_bits(vma->node.start);
798         } else {
799                 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
800                 *cmd++ = lower_32_bits(vma->node.start);
801         }
802         *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
803
804         __i915_gem_object_flush_map(obj, 0, 64);
805         i915_gem_object_unpin_map(obj);
806
807         intel_gt_chipset_flush(&i915->gt);
808
809         return vma;
810
811 err:
812         i915_gem_object_put(obj);
813         return ERR_PTR(err);
814 }
815
816 static int recursive_batch_resolve(struct i915_vma *batch)
817 {
818         u32 *cmd;
819
820         cmd = i915_gem_object_pin_map(batch->obj, I915_MAP_WC);
821         if (IS_ERR(cmd))
822                 return PTR_ERR(cmd);
823
824         *cmd = MI_BATCH_BUFFER_END;
825
826         __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
827         i915_gem_object_unpin_map(batch->obj);
828
829         intel_gt_chipset_flush(batch->vm->gt);
830
831         return 0;
832 }
833
834 static int live_all_engines(void *arg)
835 {
836         struct drm_i915_private *i915 = arg;
837         const unsigned int nengines = num_uabi_engines(i915);
838         struct intel_engine_cs *engine;
839         struct i915_request **request;
840         struct igt_live_test t;
841         struct i915_vma *batch;
842         unsigned int idx;
843         int err;
844
845         /*
846          * Check we can submit requests to all engines simultaneously. We
847          * send a recursive batch to each engine - checking that we don't
848          * block doing so, and that they don't complete too soon.
849          */
850
851         request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
852         if (!request)
853                 return -ENOMEM;
854
855         err = igt_live_test_begin(&t, i915, __func__, "");
856         if (err)
857                 goto out_free;
858
859         batch = recursive_batch(i915);
860         if (IS_ERR(batch)) {
861                 err = PTR_ERR(batch);
862                 pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
863                 goto out_free;
864         }
865
866         i915_vma_lock(batch);
867
868         idx = 0;
869         for_each_uabi_engine(engine, i915) {
870                 request[idx] = intel_engine_create_kernel_request(engine);
871                 if (IS_ERR(request[idx])) {
872                         err = PTR_ERR(request[idx]);
873                         pr_err("%s: Request allocation failed with err=%d\n",
874                                __func__, err);
875                         goto out_request;
876                 }
877
878                 err = i915_request_await_object(request[idx], batch->obj, 0);
879                 if (err == 0)
880                         err = i915_vma_move_to_active(batch, request[idx], 0);
881                 GEM_BUG_ON(err);
882
883                 err = engine->emit_bb_start(request[idx],
884                                             batch->node.start,
885                                             batch->node.size,
886                                             0);
887                 GEM_BUG_ON(err);
888                 request[idx]->batch = batch;
889
890                 i915_request_get(request[idx]);
891                 i915_request_add(request[idx]);
892                 idx++;
893         }
894
895         i915_vma_unlock(batch);
896
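        /*
         * While the recursive batches are still looping, none of the
         * requests may report completion.
         */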
897         idx = 0;
898         for_each_uabi_engine(engine, i915) {
899                 if (i915_request_completed(request[idx])) {
900                         pr_err("%s(%s): request completed too early!\n",
901                                __func__, engine->name);
902                         err = -EINVAL;
903                         goto out_request;
904                 }
905                 idx++;
906         }
907
908         err = recursive_batch_resolve(batch);
909         if (err) {
910                 pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
911                 goto out_request;
912         }
913
914         idx = 0;
915         for_each_uabi_engine(engine, i915) {
916                 long timeout;
917
918                 timeout = i915_request_wait(request[idx], 0,
919                                             MAX_SCHEDULE_TIMEOUT);
920                 if (timeout < 0) {
921                         err = timeout;
922                         pr_err("%s: error waiting for request on %s, err=%d\n",
923                                __func__, engine->name, err);
924                         goto out_request;
925                 }
926
927                 GEM_BUG_ON(!i915_request_completed(request[idx]));
928                 i915_request_put(request[idx]);
929                 request[idx] = NULL;
930                 idx++;
931         }
932
933         err = igt_live_test_end(&t);
934
935 out_request:
936         idx = 0;
937         for_each_uabi_engine(engine, i915) {
938                 if (request[idx])
939                         i915_request_put(request[idx]);
940                 idx++;
941         }
942         i915_vma_unpin(batch);
943         i915_vma_put(batch);
944 out_free:
945         kfree(request);
946         return err;
947 }
948
949 static int live_sequential_engines(void *arg)
950 {
951         struct drm_i915_private *i915 = arg;
952         const unsigned int nengines = num_uabi_engines(i915);
953         struct i915_request **request;
954         struct i915_request *prev = NULL;
955         struct intel_engine_cs *engine;
956         struct igt_live_test t;
957         unsigned int idx;
958         int err;
959
960         /*
961          * Check we can submit requests to all engines sequentially, such
962          * that each successive request waits for the earlier ones. This
963          * tests that we don't execute requests out of order, even though
964          * they are running on independent engines.
965          */
966
967         request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
968         if (!request)
969                 return -ENOMEM;
970
971         err = igt_live_test_begin(&t, i915, __func__, "");
972         if (err)
973                 goto out_free;
974
975         idx = 0;
976         for_each_uabi_engine(engine, i915) {
977                 struct i915_vma *batch;
978
979                 batch = recursive_batch(i915);
980                 if (IS_ERR(batch)) {
981                         err = PTR_ERR(batch);
982                         pr_err("%s: Unable to create batch for %s, err=%d\n",
983                                __func__, engine->name, err);
984                         goto out_free;
985                 }
986
987                 i915_vma_lock(batch);
988                 request[idx] = intel_engine_create_kernel_request(engine);
989                 if (IS_ERR(request[idx])) {
990                         err = PTR_ERR(request[idx]);
991                         pr_err("%s: Request allocation failed for %s with err=%d\n",
992                                __func__, engine->name, err);
993                         goto out_unlock;
994                 }
995
996                 if (prev) {
997                         err = i915_request_await_dma_fence(request[idx],
998                                                            &prev->fence);
999                         if (err) {
1000                                 i915_request_add(request[idx]);
1001                                 pr_err("%s: Request await failed for %s with err=%d\n",
1002                                        __func__, engine->name, err);
1003                                 goto out_unlock;
1004                         }
1005                 }
1006
1007                 err = i915_request_await_object(request[idx],
1008                                                 batch->obj, false);
1009                 if (err == 0)
1010                         err = i915_vma_move_to_active(batch, request[idx], 0);
1011                 GEM_BUG_ON(err);
1012
1013                 err = engine->emit_bb_start(request[idx],
1014                                             batch->node.start,
1015                                             batch->node.size,
1016                                             0);
1017                 GEM_BUG_ON(err);
1018                 request[idx]->batch = batch;
1019
1020                 i915_request_get(request[idx]);
1021                 i915_request_add(request[idx]);
1022
1023                 prev = request[idx];
1024                 idx++;
1025
1026 out_unlock:
1027                 i915_vma_unlock(batch);
1028                 if (err)
1029                         goto out_request;
1030         }
1031
1032         idx = 0;
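        /*
         * Resolve and wait in submission order: request[n] awaits
         * request[n-1]'s fence, so it must not complete before its own
         * (still looping) batch has been resolved.
         */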
1033         for_each_uabi_engine(engine, i915) {
1034                 long timeout;
1035
1036                 if (i915_request_completed(request[idx])) {
1037                         pr_err("%s(%s): request completed too early!\n",
1038                                __func__, engine->name);
1039                         err = -EINVAL;
1040                         goto out_request;
1041                 }
1042
1043                 err = recursive_batch_resolve(request[idx]->batch);
1044                 if (err) {
1045                         pr_err("%s: failed to resolve batch, err=%d\n",
1046                                __func__, err);
1047                         goto out_request;
1048                 }
1049
1050                 timeout = i915_request_wait(request[idx], 0,
1051                                             MAX_SCHEDULE_TIMEOUT);
1052                 if (timeout < 0) {
1053                         err = timeout;
1054                         pr_err("%s: error waiting for request on %s, err=%d\n",
1055                                __func__, engine->name, err);
1056                         goto out_request;
1057                 }
1058
1059                 GEM_BUG_ON(!i915_request_completed(request[idx]));
1060                 idx++;
1061         }
1062
1063         err = igt_live_test_end(&t);
1064
1065 out_request:
1066         idx = 0;
1067         for_each_uabi_engine(engine, i915) {
1068                 u32 *cmd;
1069
1070                 if (!request[idx])
1071                         break;
1072
1073                 cmd = i915_gem_object_pin_map(request[idx]->batch->obj,
1074                                               I915_MAP_WC);
1075                 if (!IS_ERR(cmd)) {
1076                         *cmd = MI_BATCH_BUFFER_END;
1077
1078                         __i915_gem_object_flush_map(request[idx]->batch->obj,
1079                                                     0, sizeof(*cmd));
1080                         i915_gem_object_unpin_map(request[idx]->batch->obj);
1081
1082                         intel_gt_chipset_flush(engine->gt);
1083                 }
1084
1085                 i915_vma_put(request[idx]->batch);
1086                 i915_request_put(request[idx]);
1087                 idx++;
1088         }
1089 out_free:
1090         kfree(request);
1091         return err;
1092 }
1093
1094 static int __live_parallel_engine1(void *arg)
1095 {
1096         struct intel_engine_cs *engine = arg;
1097         IGT_TIMEOUT(end_time);
1098         unsigned long count;
1099         int err = 0;
1100
1101         count = 0;
1102         intel_engine_pm_get(engine);
1103         do {
1104                 struct i915_request *rq;
1105
1106                 rq = i915_request_create(engine->kernel_context);
1107                 if (IS_ERR(rq)) {
1108                         err = PTR_ERR(rq);
1109                         break;
1110                 }
1111
1112                 i915_request_get(rq);
1113                 i915_request_add(rq);
1114
1115                 err = 0;
1116                 if (i915_request_wait(rq, 0, HZ / 5) < 0)
1117                         err = -ETIME;
1118                 i915_request_put(rq);
1119                 if (err)
1120                         break;
1121
1122                 count++;
1123         } while (!__igt_timeout(end_time, NULL));
1124         intel_engine_pm_put(engine);
1125
1126         pr_info("%s: %lu requests + sync\n", engine->name, count);
1127         return err;
1128 }
1129
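/*
 * Unlike __live_parallel_engine1() above, do not wait for each request;
 * simply flood the engine to measure raw submission throughput rather
 * than request + sync latency.
 */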
1130 static int __live_parallel_engineN(void *arg)
1131 {
1132         struct intel_engine_cs *engine = arg;
1133         IGT_TIMEOUT(end_time);
1134         unsigned long count;
1135         int err = 0;
1136
1137         count = 0;
1138         intel_engine_pm_get(engine);
1139         do {
1140                 struct i915_request *rq;
1141
1142                 rq = i915_request_create(engine->kernel_context);
1143                 if (IS_ERR(rq)) {
1144                         err = PTR_ERR(rq);
1145                         break;
1146                 }
1147
1148                 i915_request_add(rq);
1149                 count++;
1150         } while (!__igt_timeout(end_time, NULL));
1151         intel_engine_pm_put(engine);
1152
1153         pr_info("%s: %lu requests\n", engine->name, count);
1154         return err;
1155 }
1156
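/*
 * i915->selftest.counter acts as a simple barrier for
 * __live_parallel_spin(): each per-engine thread drops its reference
 * once its spinner is running (or has failed to start) and then waits
 * for the count to reach zero, so every engine hosts a spinner at the
 * same time.
 */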
1157 static bool wake_all(struct drm_i915_private *i915)
1158 {
1159         if (atomic_dec_and_test(&i915->selftest.counter)) {
1160                 wake_up_var(&i915->selftest.counter);
1161                 return true;
1162         }
1163
1164         return false;
1165 }
1166
1167 static int wait_for_all(struct drm_i915_private *i915)
1168 {
1169         if (wake_all(i915))
1170                 return 0;
1171
1172         if (wait_var_event_timeout(&i915->selftest.counter,
1173                                    !atomic_read(&i915->selftest.counter),
1174                                    i915_selftest.timeout_jiffies))
1175                 return 0;
1176
1177         return -ETIME;
1178 }
1179
1180 static int __live_parallel_spin(void *arg)
1181 {
1182         struct intel_engine_cs *engine = arg;
1183         struct igt_spinner spin;
1184         struct i915_request *rq;
1185         int err = 0;
1186
1187         /*
1188          * Create a spinner running for eternity on each engine. If a second
1189          * spinner is incorrectly placed on the same engine, it will not be
1190          * able to start in time.
1191          */
1192
1193         if (igt_spinner_init(&spin, engine->gt)) {
1194                 wake_all(engine->i915);
1195                 return -ENOMEM;
1196         }
1197
1198         intel_engine_pm_get(engine);
1199         rq = igt_spinner_create_request(&spin,
1200                                         engine->kernel_context,
1201                                         MI_NOOP); /* no preemption */
1202         intel_engine_pm_put(engine);
1203         if (IS_ERR(rq)) {
1204                 err = PTR_ERR(rq);
1205                 if (err == -ENODEV)
1206                         err = 0;
1207                 wake_all(engine->i915);
1208                 goto out_spin;
1209         }
1210
1211         i915_request_get(rq);
1212         i915_request_add(rq);
1213         if (igt_wait_for_spinner(&spin, rq)) {
1214                 /* Occupy this engine for the whole test */
1215                 err = wait_for_all(engine->i915);
1216         } else {
1217                 pr_err("Failed to start spinner on %s\n", engine->name);
1218                 err = -EINVAL;
1219         }
1220         igt_spinner_end(&spin);
1221
1222         if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0)
1223                 err = -EIO;
1224         i915_request_put(rq);
1225
1226 out_spin:
1227         igt_spinner_fini(&spin);
1228         return err;
1229 }
1230
1231 static int live_parallel_engines(void *arg)
1232 {
1233         struct drm_i915_private *i915 = arg;
1234         static int (* const func[])(void *arg) = {
1235                 __live_parallel_engine1,
1236                 __live_parallel_engineN,
1237                 __live_parallel_spin,
1238                 NULL,
1239         };
1240         const unsigned int nengines = num_uabi_engines(i915);
1241         struct intel_engine_cs *engine;
1242         int (* const *fn)(void *arg);
1243         struct task_struct **tsk;
1244         int err = 0;
1245
1246         /*
1247          * Check we can submit requests to all engines concurrently. This
1248          * tests that we load up the system maximally.
1249          */
1250
1251         tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
1252         if (!tsk)
1253                 return -ENOMEM;
1254
1255         for (fn = func; !err && *fn; fn++) {
1256                 char name[KSYM_NAME_LEN];
1257                 struct igt_live_test t;
1258                 unsigned int idx;
1259
1260                 snprintf(name, sizeof(name), "%ps", *fn);
1261                 err = igt_live_test_begin(&t, i915, __func__, name);
1262                 if (err)
1263                         break;
1264
1265                 atomic_set(&i915->selftest.counter, nengines);
1266
1267                 idx = 0;
1268                 for_each_uabi_engine(engine, i915) {
1269                         tsk[idx] = kthread_run(*fn, engine,
1270                                                "igt/parallel:%s",
1271                                                engine->name);
1272                         if (IS_ERR(tsk[idx])) {
1273                                 err = PTR_ERR(tsk[idx]);
1274                                 break;
1275                         }
1276                         get_task_struct(tsk[idx++]);
1277                 }
1278
1279                 yield(); /* start all threads before we kthread_stop() */
1280
1281                 idx = 0;
1282                 for_each_uabi_engine(engine, i915) {
1283                         int status;
1284
1285                         if (IS_ERR(tsk[idx]))
1286                                 break;
1287
1288                         status = kthread_stop(tsk[idx]);
1289                         if (status && !err)
1290                                 err = status;
1291
1292                         put_task_struct(tsk[idx++]);
1293                 }
1294
1295                 if (igt_live_test_end(&t))
1296                         err = -EIO;
1297         }
1298
1299         kfree(tsk);
1300         return err;
1301 }
1302
1303 static int
1304 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1305 {
1306         struct i915_request *rq;
1307         int ret;
1308
1309         /*
1310          * Before execlists, all contexts share the same ringbuffer. With
1311          * execlists, each context/engine has a separate ringbuffer and
1312          * for the purposes of this test, inexhaustible.
1313          *
1314          * For the global ringbuffer though, we have to be very careful
1315          * that we do not wrap while preventing the execution of requests
1316          * with an unsignaled fence.
1317          */
1318         if (HAS_EXECLISTS(ctx->i915))
1319                 return INT_MAX;
1320
1321         rq = igt_request_alloc(ctx, engine);
1322         if (IS_ERR(rq)) {
1323                 ret = PTR_ERR(rq);
1324         } else {
1325                 int sz;
1326
1327                 ret = rq->ring->size - rq->reserved_space;
1328                 i915_request_add(rq);
1329
1330                 sz = rq->ring->emit - rq->head;
1331                 if (sz < 0)
1332                         sz += rq->ring->size;
1333                 ret /= sz;
1334                 ret /= 2; /* leave half spare, in case of emergency! */
1335         }
1336
1337         return ret;
1338 }
1339
1340 static int live_breadcrumbs_smoketest(void *arg)
1341 {
1342         struct drm_i915_private *i915 = arg;
1343         const unsigned int nengines = num_uabi_engines(i915);
1344         const unsigned int ncpus = num_online_cpus();
1345         unsigned long num_waits, num_fences;
1346         struct intel_engine_cs *engine;
1347         struct task_struct **threads;
1348         struct igt_live_test live;
1349         intel_wakeref_t wakeref;
1350         struct smoketest *smoke;
1351         unsigned int n, idx;
1352         struct file *file;
1353         int ret = 0;
1354
1355         /*
1356          * Smoketest our breadcrumb/signal handling for requests across multiple
1357          * threads. A very simple test to only catch the most egregious of bugs.
1358          * See __igt_breadcrumbs_smoketest();
1359          *
1360          * On real hardware this time.
1361          */
1362
1363         wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1364
1365         file = mock_file(i915);
1366         if (IS_ERR(file)) {
1367                 ret = PTR_ERR(file);
1368                 goto out_rpm;
1369         }
1370
1371         smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1372         if (!smoke) {
1373                 ret = -ENOMEM;
1374                 goto out_file;
1375         }
1376
1377         threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1378         if (!threads) {
1379                 ret = -ENOMEM;
1380                 goto out_smoke;
1381         }
1382
1383         smoke[0].request_alloc = __live_request_alloc;
1384         smoke[0].ncontexts = 64;
1385         smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1386                                     sizeof(*smoke[0].contexts),
1387                                     GFP_KERNEL);
1388         if (!smoke[0].contexts) {
1389                 ret = -ENOMEM;
1390                 goto out_threads;
1391         }
1392
1393         for (n = 0; n < smoke[0].ncontexts; n++) {
1394                 smoke[0].contexts[n] = live_context(i915, file);
1395                 if (!smoke[0].contexts[n]) {
1396                         ret = -ENOMEM;
1397                         goto out_contexts;
1398                 }
1399         }
1400
1401         ret = igt_live_test_begin(&live, i915, __func__, "");
1402         if (ret)
1403                 goto out_contexts;
1404
1405         idx = 0;
1406         for_each_uabi_engine(engine, i915) {
1407                 smoke[idx] = smoke[0];
1408                 smoke[idx].engine = engine;
1409                 smoke[idx].max_batch =
1410                         max_batches(smoke[0].contexts[0], engine);
1411                 if (smoke[idx].max_batch < 0) {
1412                         ret = smoke[idx].max_batch;
1413                         goto out_flush;
1414                 }
1415                 /* One ring interleaved between requests from all cpus */
1416                 smoke[idx].max_batch /= num_online_cpus() + 1;
1417                 pr_debug("Limiting batches to %d requests on %s\n",
1418                          smoke[idx].max_batch, engine->name);
1419
1420                 for (n = 0; n < ncpus; n++) {
1421                         struct task_struct *tsk;
1422
1423                         tsk = kthread_run(__igt_breadcrumbs_smoketest,
1424                                           &smoke[idx], "igt/%d.%d", idx, n);
1425                         if (IS_ERR(tsk)) {
1426                                 ret = PTR_ERR(tsk);
1427                                 goto out_flush;
1428                         }
1429
1430                         get_task_struct(tsk);
1431                         threads[idx * ncpus + n] = tsk;
1432                 }
1433
1434                 idx++;
1435         }
1436
1437         yield(); /* start all threads before we begin */
1438         msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1439
1440 out_flush:
1441         idx = 0;
1442         num_waits = 0;
1443         num_fences = 0;
1444         for_each_uabi_engine(engine, i915) {
1445                 for (n = 0; n < ncpus; n++) {
1446                         struct task_struct *tsk = threads[idx * ncpus + n];
1447                         int err;
1448
1449                         if (!tsk)
1450                                 continue;
1451
1452                         err = kthread_stop(tsk);
1453                         if (err < 0 && !ret)
1454                                 ret = err;
1455
1456                         put_task_struct(tsk);
1457                 }
1458
1459                 num_waits += atomic_long_read(&smoke[idx].num_waits);
1460                 num_fences += atomic_long_read(&smoke[idx].num_fences);
1461                 idx++;
1462         }
1463         pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1464                 num_waits, num_fences, idx, ncpus);
1465
1466         ret = igt_live_test_end(&live) ?: ret;
1467 out_contexts:
1468         kfree(smoke[0].contexts);
1469 out_threads:
1470         kfree(threads);
1471 out_smoke:
1472         kfree(smoke);
1473 out_file:
1474         fput(file);
1475 out_rpm:
1476         intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1477
1478         return ret;
1479 }
1480
1481 int i915_request_live_selftests(struct drm_i915_private *i915)
1482 {
1483         static const struct i915_subtest tests[] = {
1484                 SUBTEST(live_nop_request),
1485                 SUBTEST(live_all_engines),
1486                 SUBTEST(live_sequential_engines),
1487                 SUBTEST(live_parallel_engines),
1488                 SUBTEST(live_empty_request),
1489                 SUBTEST(live_breadcrumbs_smoketest),
1490         };
1491
1492         if (intel_gt_is_wedged(&i915->gt))
1493                 return 0;
1494
1495         return i915_subtests(tests, i915);
1496 }
1497
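/*
 * Queue a kernel-context request behind ce's most recent request and
 * synchronously wait for it, then flush submission until the engine is
 * idle: in effect a drain barrier for the context, preserving any
 * earlier error passed in via @err.
 */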
1498 static int switch_to_kernel_sync(struct intel_context *ce, int err)
1499 {
1500         struct i915_request *rq;
1501         struct dma_fence *fence;
1502
1503         rq = intel_engine_create_kernel_request(ce->engine);
1504         if (IS_ERR(rq))
1505                 return PTR_ERR(rq);
1506
1507         fence = i915_active_fence_get(&ce->timeline->last_request);
1508         if (fence) {
1509                 i915_request_await_dma_fence(rq, fence);
1510                 dma_fence_put(fence);
1511         }
1512
1513         rq = i915_request_get(rq);
1514         i915_request_add(rq);
1515         if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1516                 err = -ETIME;
1517         i915_request_put(rq);
1518
1519         while (!err && !intel_engine_is_idle(ce->engine))
1520                 intel_engine_flush_submission(ce->engine);
1521
1522         return err;
1523 }
1524
1525 struct perf_stats {
1526         struct intel_engine_cs *engine;
1527         unsigned long count;
1528         ktime_t time;
1529         ktime_t busy;
1530         u64 runtime;
1531 };
1532
1533 struct perf_series {
1534         struct drm_i915_private *i915;
1535         unsigned int nengines;
1536         struct intel_context *ce[];
1537 };
1538
1539 static int cmp_u32(const void *A, const void *B)
1540 {
1541         const u32 *a = A, *b = B;
1542
1543         return *a - *b;
1544 }
1545
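/*
 * Reduce TF_COUNT samples to a single value: sort them and return
 * 2 * median + the two neighbouring samples, i.e. a weighted sum that
 * is 1 << TF_BIAS (4x) larger than the filtered result. Consumers such
 * as cycles_to_ns() divide the shift back out.
 */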
1546 static u32 trifilter(u32 *a)
1547 {
1548         u64 sum;
1549
1550 #define TF_COUNT 5
1551         sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1552
1553         sum = mul_u32_u32(a[2], 2);
1554         sum += a[1];
1555         sum += a[3];
1556
1557         GEM_BUG_ON(sum > U32_MAX);
1558         return sum;
1559 #define TF_BIAS 2
1560 }
1561
1562 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1563 {
1564         u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
1565
1566         return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1567 }
1568
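/*
 * Small helpers for hand-rolled ring emission: store the engine's
 * RING_TIMESTAMP to a GGTT offset, store an immediate dword, or poll a
 * GGTT semaphore with a given comparison mode
 * (MI_SEMAPHORE_WAIT | MI_SEMAPHORE_POLL).
 */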
1569 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1570 {
1571         *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1572         *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1573         *cs++ = offset;
1574         *cs++ = 0;
1575
1576         return cs;
1577 }
1578
1579 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1580 {
1581         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1582         *cs++ = offset;
1583         *cs++ = 0;
1584         *cs++ = value;
1585
1586         return cs;
1587 }
1588
1589 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1590 {
1591         *cs++ = MI_SEMAPHORE_WAIT |
1592                 MI_SEMAPHORE_GLOBAL_GTT |
1593                 MI_SEMAPHORE_POLL |
1594                 mode;
1595         *cs++ = value;
1596         *cs++ = offset;
1597         *cs++ = 0;
1598
1599         return cs;
1600 }
1601
1602 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1603 {
1604         return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1605 }
1606
1607 static void semaphore_set(u32 *sema, u32 value)
1608 {
1609         WRITE_ONCE(*sema, value);
1610         wmb(); /* flush the update to the cache, and beyond */
1611 }
1612
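/*
 * Editor's note: the latency probes below borrow a small scratch area inside
 * the engine's status page (HWSP): 21 dwords, 1000 bytes into the page, zeroed
 * before each test. hwsp_offset() converts a CPU pointer into that area to the
 * GGTT offset the GPU uses for its semaphore polls and timestamp stores.
 */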
1613 static u32 *hwsp_scratch(const struct intel_context *ce)
1614 {
1615         return memset32(ce->engine->status_page.addr + 1000, 0, 21);
1616 }
1617
1618 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
1619 {
1620         return (i915_ggtt_offset(ce->engine->status_page.vma) +
1621                 offset_in_page(dw));
1622 }
1623
1624 static int measure_semaphore_response(struct intel_context *ce)
1625 {
1626         u32 *sema = hwsp_scratch(ce);
1627         const u32 offset = hwsp_offset(ce, sema);
1628         u32 elapsed[TF_COUNT], cycles;
1629         struct i915_request *rq;
1630         u32 *cs;
1631         int err;
1632         int i;
1633
1634         /*
1635          * Measure how many cycles it takes for the HW to detect the change
1636          * in a semaphore value.
1637          *
1638          *    A: read CS_TIMESTAMP from CPU
1639          *    poke semaphore
1640          *    B: read CS_TIMESTAMP on GPU
1641          *
1642          * Semaphore latency: B - A
1643          */
1644
1645         semaphore_set(sema, -1);
1646
1647         rq = i915_request_create(ce);
1648         if (IS_ERR(rq))
1649                 return PTR_ERR(rq);
1650
1651         cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
1652         if (IS_ERR(cs)) {
1653                 i915_request_add(rq);
1654                 err = PTR_ERR(cs);
1655                 goto err;
1656         }
1657
1658         cs = emit_store_dw(cs, offset, 0);
1659         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1660                 cs = emit_semaphore_poll_until(cs, offset, i);
1661                 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1662                 cs = emit_store_dw(cs, offset, 0);
1663         }
1664
1665         intel_ring_advance(rq, cs);
1666         i915_request_add(rq);
1667
1668         if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1669                 err = -EIO;
1670                 goto err;
1671         }
1672
1673         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1674                 preempt_disable();
1675                 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1676                 semaphore_set(sema, i);
1677                 preempt_enable();
1678
1679                 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1680                         err = -EIO;
1681                         goto err;
1682                 }
1683
1684                 elapsed[i - 1] = sema[i] - cycles;
1685         }
1686
1687         cycles = trifilter(elapsed);
1688         pr_info("%s: semaphore response %d cycles, %lluns\n",
1689                 ce->engine->name, cycles >> TF_BIAS,
1690                 cycles_to_ns(ce->engine, cycles));
1691
1692         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1693
1694 err:
1695         intel_gt_set_wedged(ce->engine->gt);
1696         return err;
1697 }
1698
1699 static int measure_idle_dispatch(struct intel_context *ce)
1700 {
1701         u32 *sema = hwsp_scratch(ce);
1702         const u32 offset = hwsp_offset(ce, sema);
1703         u32 elapsed[TF_COUNT], cycles;
1704         u32 *cs;
1705         int err;
1706         int i;
1707
1708         /*
1709          * Measure how long it takes for us to submit a request while the
1710          * engine is idle but resting in our context.
1711          *
1712          *    A: read CS_TIMESTAMP from CPU
1713          *    submit request
1714          *    B: read CS_TIMESTAMP on GPU
1715          *
1716          * Submission latency: B - A
1717          */
1718
1719         for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
1720                 struct i915_request *rq;
1721
1722                 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1723                 if (err)
1724                         return err;
1725
1726                 rq = i915_request_create(ce);
1727                 if (IS_ERR(rq)) {
1728                         err = PTR_ERR(rq);
1729                         goto err;
1730                 }
1731
1732                 cs = intel_ring_begin(rq, 4);
1733                 if (IS_ERR(cs)) {
1734                         i915_request_add(rq);
1735                         err = PTR_ERR(cs);
1736                         goto err;
1737                 }
1738
1739                 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1740
1741                 intel_ring_advance(rq, cs);
1742
1743                 preempt_disable();
1744                 local_bh_disable();
1745                 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1746                 i915_request_add(rq);
1747                 local_bh_enable();
1748                 preempt_enable();
1749         }
1750
1751         err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1752         if (err)
1753                 goto err;
1754
1755         for (i = 0; i < ARRAY_SIZE(elapsed); i++)
1756                 elapsed[i] = sema[i] - elapsed[i];
1757
1758         cycles = trifilter(elapsed);
1759         pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
1760                 ce->engine->name, cycles >> TF_BIAS,
1761                 cycles_to_ns(ce->engine, cycles));
1762
1763         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1764
1765 err:
1766         intel_gt_set_wedged(ce->engine->gt);
1767         return err;
1768 }
1769
1770 static int measure_busy_dispatch(struct intel_context *ce)
1771 {
1772         u32 *sema = hwsp_scratch(ce);
1773         const u32 offset = hwsp_offset(ce, sema);
1774         u32 elapsed[TF_COUNT + 1], cycles;
1775         u32 *cs;
1776         int err;
1777         int i;
1778
1779         /*
1780          * Measure how long it takes for us to submit a request while the
1781          * engine is busy, polling on a semaphore in our context. With
1782          * direct submission, this will include the cost of a lite restore.
1783          *
1784          *    A: read CS_TIMESTAMP from CPU
1785          *    submit request
1786          *    B: read CS_TIMESTAMP on GPU
1787          *
1788          * Submission latency: B - A
1789          */
1790
1791         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1792                 struct i915_request *rq;
1793
1794                 rq = i915_request_create(ce);
1795                 if (IS_ERR(rq)) {
1796                         err = PTR_ERR(rq);
1797                         goto err;
1798                 }
1799
1800                 cs = intel_ring_begin(rq, 12);
1801                 if (IS_ERR(cs)) {
1802                         i915_request_add(rq);
1803                         err = PTR_ERR(cs);
1804                         goto err;
1805                 }
1806
1807                 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
1808                 cs = emit_semaphore_poll_until(cs, offset, i);
1809                 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1810
1811                 intel_ring_advance(rq, cs);
1812
1813                 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
1814                         err = -EIO;
1815                         goto err;
1816                 }
1817
1818                 preempt_disable();
1819                 local_bh_disable();
1820                 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1821                 i915_request_add(rq);
1822                 local_bh_enable();
1823                 semaphore_set(sema, i - 1);
1824                 preempt_enable();
1825         }
1826
1827         wait_for(READ_ONCE(sema[i - 1]), 500);
1828         semaphore_set(sema, i - 1);
1829
1830         for (i = 1; i <= TF_COUNT; i++) {
1831                 GEM_BUG_ON(sema[i] == -1);
1832                 elapsed[i - 1] = sema[i] - elapsed[i];
1833         }
1834
1835         cycles = trifilter(elapsed);
1836         pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
1837                 ce->engine->name, cycles >> TF_BIAS,
1838                 cycles_to_ns(ce->engine, cycles));
1839
1840         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1841
1842 err:
1843         intel_gt_set_wedged(ce->engine->gt);
1844         return err;
1845 }
1846
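/*
 * Editor's note: plug() blocks the engine by submitting a kernel-context
 * request that polls a semaphore in the status page with the given mode and
 * value, keeping the work queued behind it on hold until the CPU releases
 * the semaphore with semaphore_set().
 */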
1847 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
1848 {
1849         const u32 offset =
1850                 i915_ggtt_offset(engine->status_page.vma) +
1851                 offset_in_page(sema);
1852         struct i915_request *rq;
1853         u32 *cs;
1854
1855         rq = i915_request_create(engine->kernel_context);
1856         if (IS_ERR(rq))
1857                 return PTR_ERR(rq);
1858
1859         cs = intel_ring_begin(rq, 4);
1860         if (IS_ERR(cs)) {
1861                 i915_request_add(rq);
1862                 return PTR_ERR(cs);
1863         }
1864
1865         cs = emit_semaphore_poll(cs, mode, value, offset);
1866
1867         intel_ring_advance(rq, cs);
1868         i915_request_add(rq);
1869
1870         return 0;
1871 }
1872
1873 static int measure_inter_request(struct intel_context *ce)
1874 {
1875         u32 *sema = hwsp_scratch(ce);
1876         const u32 offset = hwsp_offset(ce, sema);
1877         u32 elapsed[TF_COUNT + 1], cycles;
1878         struct i915_sw_fence *submit;
1879         int i, err;
1880
1881         /*
1882          * Measure how long it takes to advance from one request into the
1883          * next. Between each request we flush the GPU caches to memory,
1884          * update the breadcrumbs, and then invalidate those caches.
1885          * We queue up all the requests to be submitted in one batch so
1886          * it should be one set of contiguous measurements.
1887          *
1888          *    A: read CS_TIMESTAMP on GPU
1889          *    advance request
1890          *    B: read CS_TIMESTAMP on GPU
1891          *
1892          * Request latency: B - A
1893          */
1894
1895         err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
1896         if (err)
1897                 return err;
1898
1899         submit = heap_fence_create(GFP_KERNEL);
1900         if (!submit) {
1901                 semaphore_set(sema, 1);
1902                 return -ENOMEM;
1903         }
1904
1905         intel_engine_flush_submission(ce->engine);
1906         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1907                 struct i915_request *rq;
1908                 u32 *cs;
1909
1910                 rq = i915_request_create(ce);
1911                 if (IS_ERR(rq)) {
1912                         err = PTR_ERR(rq);
1913                         goto err_submit;
1914                 }
1915
1916                 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
1917                                                        submit,
1918                                                        GFP_KERNEL);
1919                 if (err < 0) {
1920                         i915_request_add(rq);
1921                         goto err_submit;
1922                 }
1923
1924                 cs = intel_ring_begin(rq, 4);
1925                 if (IS_ERR(cs)) {
1926                         i915_request_add(rq);
1927                         err = PTR_ERR(cs);
1928                         goto err_submit;
1929                 }
1930
1931                 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1932
1933                 intel_ring_advance(rq, cs);
1934                 i915_request_add(rq);
1935         }
1936         i915_sw_fence_commit(submit);
1937         intel_engine_flush_submission(ce->engine);
1938         heap_fence_put(submit);
1939
1940         semaphore_set(sema, 1);
1941         err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1942         if (err)
1943                 goto err;
1944
1945         for (i = 1; i <= TF_COUNT; i++)
1946                 elapsed[i - 1] = sema[i + 1] - sema[i];
1947
1948         cycles = trifilter(elapsed);
1949         pr_info("%s: inter-request latency %d cycles, %lluns\n",
1950                 ce->engine->name, cycles >> TF_BIAS,
1951                 cycles_to_ns(ce->engine, cycles));
1952
1953         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1954
1955 err_submit:
1956         i915_sw_fence_commit(submit);
1957         heap_fence_put(submit);
1958         semaphore_set(sema, 1);
1959 err:
1960         intel_gt_set_wedged(ce->engine->gt);
1961         return err;
1962 }
1963
1964 static int measure_context_switch(struct intel_context *ce)
1965 {
1966         u32 *sema = hwsp_scratch(ce);
1967         const u32 offset = hwsp_offset(ce, sema);
1968         struct i915_request *fence = NULL;
1969         u32 elapsed[TF_COUNT + 1], cycles;
1970         int i, j, err;
1971         u32 *cs;
1972
1973         /*
1974          * Measure how long it takes to advance from one request in one
1975          * context to a request in another context. This allows us to
1976          * measure how long the context save/restore take, along with all
1977          * the inter-context setup we require.
1978          *
1979          *    A: read CS_TIMESTAMP on GPU
1980          *    switch context
1981          *    B: read CS_TIMESTAMP on GPU
1982          *
1983          * Context switch latency: B - A
1984          */
1985
1986         err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
1987         if (err)
1988                 return err;
1989
1990         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1991                 struct intel_context *arr[] = {
1992                         ce, ce->engine->kernel_context
1993                 };
1994                 u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
1995
1996                 for (j = 0; j < ARRAY_SIZE(arr); j++) {
1997                         struct i915_request *rq;
1998
1999                         rq = i915_request_create(arr[j]);
2000                         if (IS_ERR(rq)) {
2001                                 err = PTR_ERR(rq);
2002                                 goto err_fence;
2003                         }
2004
2005                         if (fence) {
2006                                 err = i915_request_await_dma_fence(rq,
2007                                                                    &fence->fence);
2008                                 if (err) {
2009                                         i915_request_add(rq);
2010                                         goto err_fence;
2011                                 }
2012                         }
2013
2014                         cs = intel_ring_begin(rq, 4);
2015                         if (IS_ERR(cs)) {
2016                                 i915_request_add(rq);
2017                                 err = PTR_ERR(cs);
2018                                 goto err_fence;
2019                         }
2020
2021                         cs = emit_timestamp_store(cs, ce, addr);
2022                         addr += sizeof(u32);
2023
2024                         intel_ring_advance(rq, cs);
2025
2026                         i915_request_put(fence);
2027                         fence = i915_request_get(rq);
2028
2029                         i915_request_add(rq);
2030                 }
2031         }
2032         i915_request_put(fence);
2033         intel_engine_flush_submission(ce->engine);
2034
2035         semaphore_set(sema, 1);
2036         err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2037         if (err)
2038                 goto err;
2039
2040         for (i = 1; i <= TF_COUNT; i++)
2041                 elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2042
2043         cycles = trifilter(elapsed);
2044         pr_info("%s: context switch latency %d cycles, %lluns\n",
2045                 ce->engine->name, cycles >> TF_BIAS,
2046                 cycles_to_ns(ce->engine, cycles));
2047
2048         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2049
2050 err_fence:
2051         i915_request_put(fence);
2052         semaphore_set(sema, 1);
2053 err:
2054         intel_gt_set_wedged(ce->engine->gt);
2055         return err;
2056 }
2057
2058 static int measure_preemption(struct intel_context *ce)
2059 {
2060         u32 *sema = hwsp_scratch(ce);
2061         const u32 offset = hwsp_offset(ce, sema);
2062         u32 elapsed[TF_COUNT], cycles;
2063         u32 *cs;
2064         int err;
2065         int i;
2066
2067         /*
2068          * We measure two latencies while triggering preemption. The first
2069          * latency is how long it takes for us to submit a preempting request.
2070          * The second latency is how long it takes for us to return from the
2071          * preemption back to the original context.
2072          *
2073          *    A: read CS_TIMESTAMP from CPU
2074          *    submit preemption
2075          *    B: read CS_TIMESTAMP on GPU (in preempting context)
2076          *    context switch
2077          *    C: read CS_TIMESTAMP on GPU (in original context)
2078          *
2079          * Preemption dispatch latency: B - A
2080          * Preemption switch latency: C - B
2081          */
2082
2083         if (!intel_engine_has_preemption(ce->engine))
2084                 return 0;
2085
2086         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2087                 u32 addr = offset + 2 * i * sizeof(u32);
2088                 struct i915_request *rq;
2089
2090                 rq = i915_request_create(ce);
2091                 if (IS_ERR(rq)) {
2092                         err = PTR_ERR(rq);
2093                         goto err;
2094                 }
2095
2096                 cs = intel_ring_begin(rq, 12);
2097                 if (IS_ERR(cs)) {
2098                         i915_request_add(rq);
2099                         err = PTR_ERR(cs);
2100                         goto err;
2101                 }
2102
2103                 cs = emit_store_dw(cs, addr, -1);
2104                 cs = emit_semaphore_poll_until(cs, offset, i);
2105                 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2106
2107                 intel_ring_advance(rq, cs);
2108                 i915_request_add(rq);
2109
2110                 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2111                         err = -EIO;
2112                         goto err;
2113                 }
2114
2115                 rq = i915_request_create(ce->engine->kernel_context);
2116                 if (IS_ERR(rq)) {
2117                         err = PTR_ERR(rq);
2118                         goto err;
2119                 }
2120
2121                 cs = intel_ring_begin(rq, 8);
2122                 if (IS_ERR(cs)) {
2123                         i915_request_add(rq);
2124                         err = PTR_ERR(cs);
2125                         goto err;
2126                 }
2127
2128                 cs = emit_timestamp_store(cs, ce, addr);
2129                 cs = emit_store_dw(cs, offset, i);
2130
2131                 intel_ring_advance(rq, cs);
2132                 rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2133
2134                 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2135                 i915_request_add(rq);
2136         }
2137
2138         if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2139                 err = -EIO;
2140                 goto err;
2141         }
2142
2143         for (i = 1; i <= TF_COUNT; i++)
2144                 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2145
2146         cycles = trifilter(elapsed);
2147         pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2148                 ce->engine->name, cycles >> TF_BIAS,
2149                 cycles_to_ns(ce->engine, cycles));
2150
2151         for (i = 1; i <= TF_COUNT; i++)
2152                 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2153
2154         cycles = trifilter(elapsed);
2155         pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2156                 ce->engine->name, cycles >> TF_BIAS,
2157                 cycles_to_ns(ce->engine, cycles));
2158
2159         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2160
2161 err:
2162         intel_gt_set_wedged(ce->engine->gt);
2163         return err;
2164 }
2165
2166 struct signal_cb {
2167         struct dma_fence_cb base;
2168         bool seen;
2169 };
2170
2171 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2172 {
2173         struct signal_cb *s = container_of(cb, typeof(*s), base);
2174
2175         smp_store_mb(s->seen, true); /* be safe, be strong */
2176 }
2177
2178 static int measure_completion(struct intel_context *ce)
2179 {
2180         u32 *sema = hwsp_scratch(ce);
2181         const u32 offset = hwsp_offset(ce, sema);
2182         u32 elapsed[TF_COUNT], cycles;
2183         u32 *cs;
2184         int err;
2185         int i;
2186
2187         /*
2188          * Measure how long it takes for a signal (interrupt) raised by
2189          * the GPU to be processed by the CPU.
2190          *
2191          *    A: read CS_TIMESTAMP on GPU
2192          *    signal
2193          *    B: read CS_TIMESTAMP from CPU
2194          *
2195          * Completion latency: B - A
2196          */
2197
2198         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2199                 struct signal_cb cb = { .seen = false };
2200                 struct i915_request *rq;
2201
2202                 rq = i915_request_create(ce);
2203                 if (IS_ERR(rq)) {
2204                         err = PTR_ERR(rq);
2205                         goto err;
2206                 }
2207
2208                 cs = intel_ring_begin(rq, 12);
2209                 if (IS_ERR(cs)) {
2210                         i915_request_add(rq);
2211                         err = PTR_ERR(cs);
2212                         goto err;
2213                 }
2214
2215                 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2216                 cs = emit_semaphore_poll_until(cs, offset, i);
2217                 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2218
2219                 intel_ring_advance(rq, cs);
2220
2221                 dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2222                 i915_request_add(rq);
2223
2224                 intel_engine_flush_submission(ce->engine);
2225                 if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2226                         err = -EIO;
2227                         goto err;
2228                 }
2229
2230                 preempt_disable();
2231                 semaphore_set(sema, i);
2232                 while (!READ_ONCE(cb.seen))
2233                         cpu_relax();
2234
2235                 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2236                 preempt_enable();
2237         }
2238
2239         err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2240         if (err)
2241                 goto err;
2242
2243         for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2244                 GEM_BUG_ON(sema[i + 1] == -1);
2245                 elapsed[i] = elapsed[i] - sema[i + 1];
2246         }
2247
2248         cycles = trifilter(elapsed);
2249         pr_info("%s: completion latency %d cycles, %lluns\n",
2250                 ce->engine->name, cycles >> TF_BIAS,
2251                 cycles_to_ns(ce->engine, cycles));
2252
2253         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2254
2255 err:
2256         intel_gt_set_wedged(ce->engine->gt);
2257         return err;
2258 }
2259
2260 static void rps_pin(struct intel_gt *gt)
2261 {
2262         /* Pin the frequency to max */
2263         atomic_inc(&gt->rps.num_waiters);
2264         intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2265
2266         mutex_lock(&gt->rps.lock);
2267         intel_rps_set(&gt->rps, gt->rps.max_freq);
2268         mutex_unlock(&gt->rps.lock);
2269 }
2270
2271 static void rps_unpin(struct intel_gt *gt)
2272 {
2273         intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2274         atomic_dec(&gt->rps.num_waiters);
2275 }
2276
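/*
 * Editor's note: perf_request_latency() runs the latency probes above on
 * every uabi engine, with a cpu latency QoS request blocking C-states, the
 * engine heartbeat parked and RPS pinned to the maximum frequency for the
 * duration of each engine's measurements.
 */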
2277 static int perf_request_latency(void *arg)
2278 {
2279         struct drm_i915_private *i915 = arg;
2280         struct intel_engine_cs *engine;
2281         struct pm_qos_request qos;
2282         int err = 0;
2283
2284         if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
2285                 return 0;
2286
2287         cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2288
2289         for_each_uabi_engine(engine, i915) {
2290                 struct intel_context *ce;
2291
2292                 ce = intel_context_create(engine);
2293                 if (IS_ERR(ce)) {
2294                         err = PTR_ERR(ce);
2295                         goto out;
2296                 }
2297
2298                 err = intel_context_pin(ce);
2299                 if (err) {
2300                         intel_context_put(ce);
2301                         goto out;
2302                 }
2303
2304                 st_engine_heartbeat_disable(engine);
2305                 rps_pin(engine->gt);
2306
2307                 if (err == 0)
2308                         err = measure_semaphore_response(ce);
2309                 if (err == 0)
2310                         err = measure_idle_dispatch(ce);
2311                 if (err == 0)
2312                         err = measure_busy_dispatch(ce);
2313                 if (err == 0)
2314                         err = measure_inter_request(ce);
2315                 if (err == 0)
2316                         err = measure_context_switch(ce);
2317                 if (err == 0)
2318                         err = measure_preemption(ce);
2319                 if (err == 0)
2320                         err = measure_completion(ce);
2321
2322                 rps_unpin(engine->gt);
2323                 st_engine_heartbeat_enable(engine);
2324
2325                 intel_context_unpin(ce);
2326                 intel_context_put(ce);
2327                 if (err)
2328                         goto out;
2329         }
2330
2331 out:
2332         if (igt_flush_test(i915))
2333                 err = -EIO;
2334
2335         cpu_latency_qos_remove_request(&qos);
2336         return err;
2337 }
2338
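/*
 * Editor's note: series load s_sync0 submits one empty request at a time,
 * round-robin over all contexts, waiting for each to complete before
 * moving on.
 */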
2339 static int s_sync0(void *arg)
2340 {
2341         struct perf_series *ps = arg;
2342         IGT_TIMEOUT(end_time);
2343         unsigned int idx = 0;
2344         int err = 0;
2345
2346         GEM_BUG_ON(!ps->nengines);
2347         do {
2348                 struct i915_request *rq;
2349
2350                 rq = i915_request_create(ps->ce[idx]);
2351                 if (IS_ERR(rq)) {
2352                         err = PTR_ERR(rq);
2353                         break;
2354                 }
2355
2356                 i915_request_get(rq);
2357                 i915_request_add(rq);
2358
2359                 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2360                         err = -ETIME;
2361                 i915_request_put(rq);
2362                 if (err)
2363                         break;
2364
2365                 if (++idx == ps->nengines)
2366                         idx = 0;
2367         } while (!__igt_timeout(end_time, NULL));
2368
2369         return err;
2370 }
2371
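/*
 * Editor's note: series load s_sync1 pipelines by one request, waiting on
 * the previous submission only after the next has been queued, round-robin
 * as above.
 */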
2372 static int s_sync1(void *arg)
2373 {
2374         struct perf_series *ps = arg;
2375         struct i915_request *prev = NULL;
2376         IGT_TIMEOUT(end_time);
2377         unsigned int idx = 0;
2378         int err = 0;
2379
2380         GEM_BUG_ON(!ps->nengines);
2381         do {
2382                 struct i915_request *rq;
2383
2384                 rq = i915_request_create(ps->ce[idx]);
2385                 if (IS_ERR(rq)) {
2386                         err = PTR_ERR(rq);
2387                         break;
2388                 }
2389
2390                 i915_request_get(rq);
2391                 i915_request_add(rq);
2392
2393                 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2394                         err = -ETIME;
2395                 i915_request_put(prev);
2396                 prev = rq;
2397                 if (err)
2398                         break;
2399
2400                 if (++idx == ps->nengines)
2401                         idx = 0;
2402         } while (!__igt_timeout(end_time, NULL));
2403         i915_request_put(prev);
2404
2405         return err;
2406 }
2407
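/*
 * Editor's note: series load s_many submits empty requests round-robin as
 * fast as possible, never waiting for completion.
 */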
2408 static int s_many(void *arg)
2409 {
2410         struct perf_series *ps = arg;
2411         IGT_TIMEOUT(end_time);
2412         unsigned int idx = 0;
2413
2414         GEM_BUG_ON(!ps->nengines);
2415         do {
2416                 struct i915_request *rq;
2417
2418                 rq = i915_request_create(ps->ce[idx]);
2419                 if (IS_ERR(rq))
2420                         return PTR_ERR(rq);
2421
2422                 i915_request_add(rq);
2423
2424                 if (++idx == ps->nengines)
2425                         idx = 0;
2426         } while (!__igt_timeout(end_time, NULL));
2427
2428         return 0;
2429 }
2430
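/*
 * Editor's note: perf_series_engines() drives each series load above from a
 * single thread across all engines, then reports per-engine busyness,
 * context runtime and walltime.
 */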
2431 static int perf_series_engines(void *arg)
2432 {
2433         struct drm_i915_private *i915 = arg;
2434         static int (* const func[])(void *arg) = {
2435                 s_sync0,
2436                 s_sync1,
2437                 s_many,
2438                 NULL,
2439         };
2440         const unsigned int nengines = num_uabi_engines(i915);
2441         struct intel_engine_cs *engine;
2442         int (* const *fn)(void *arg);
2443         struct pm_qos_request qos;
2444         struct perf_stats *stats;
2445         struct perf_series *ps;
2446         unsigned int idx;
2447         int err = 0;
2448
2449         stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2450         if (!stats)
2451                 return -ENOMEM;
2452
2453         ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2454         if (!ps) {
2455                 kfree(stats);
2456                 return -ENOMEM;
2457         }
2458
2459         cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2460
2461         ps->i915 = i915;
2462         ps->nengines = nengines;
2463
2464         idx = 0;
2465         for_each_uabi_engine(engine, i915) {
2466                 struct intel_context *ce;
2467
2468                 ce = intel_context_create(engine);
2469                 if (IS_ERR(ce)) {
2470                         err = PTR_ERR(ce);
2471                         goto out;
2472                 }
2473
2474                 err = intel_context_pin(ce);
2475                 if (err) {
2476                         intel_context_put(ce);
2477                         goto out;
2478                 }
2479
2480                 ps->ce[idx++] = ce;
2481         }
2482         GEM_BUG_ON(idx != ps->nengines);
2483
2484         for (fn = func; *fn && !err; fn++) {
2485                 char name[KSYM_NAME_LEN];
2486                 struct igt_live_test t;
2487
2488                 snprintf(name, sizeof(name), "%ps", *fn);
2489                 err = igt_live_test_begin(&t, i915, __func__, name);
2490                 if (err)
2491                         break;
2492
2493                 for (idx = 0; idx < nengines; idx++) {
2494                         struct perf_stats *p =
2495                                 memset(&stats[idx], 0, sizeof(stats[idx]));
2496                         struct intel_context *ce = ps->ce[idx];
2497
2498                         p->engine = ps->ce[idx]->engine;
2499                         intel_engine_pm_get(p->engine);
2500
2501                         if (intel_engine_supports_stats(p->engine))
2502                                 p->busy = intel_engine_get_busy_time(p->engine,
2503                                                                      &p->time) + 1;
2504                         else
2505                                 p->time = ktime_get();
2506                         p->runtime = -intel_context_get_total_runtime_ns(ce);
2507                 }
2508
2509                 err = (*fn)(ps);
2510                 if (igt_live_test_end(&t))
2511                         err = -EIO;
2512
2513                 for (idx = 0; idx < nengines; idx++) {
2514                         struct perf_stats *p = &stats[idx];
2515                         struct intel_context *ce = ps->ce[idx];
2516                         int integer, decimal;
2517                         u64 busy, dt, now;
2518
2519                         if (p->busy)
2520                                 p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2521                                                                                &now),
2522                                                     p->busy - 1);
2523                         else
2524                                 now = ktime_get();
2525                         p->time = ktime_sub(now, p->time);
2526
2527                         err = switch_to_kernel_sync(ce, err);
2528                         p->runtime += intel_context_get_total_runtime_ns(ce);
2529                         intel_engine_pm_put(p->engine);
2530
2531                         busy = 100 * ktime_to_ns(p->busy);
2532                         dt = ktime_to_ns(p->time);
2533                         if (dt) {
2534                                 integer = div64_u64(busy, dt);
2535                                 busy -= integer * dt;
2536                                 decimal = div64_u64(100 * busy, dt);
2537                         } else {
2538                                 integer = 0;
2539                                 decimal = 0;
2540                         }
2541
2542                         pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2543                                 name, p->engine->name, ce->timeline->seqno,
2544                                 integer, decimal,
2545                                 div_u64(p->runtime, 1000 * 1000),
2546                                 div_u64(ktime_to_ns(p->time), 1000 * 1000));
2547                 }
2548         }
2549
2550 out:
2551         for (idx = 0; idx < nengines; idx++) {
2552                 if (IS_ERR_OR_NULL(ps->ce[idx]))
2553                         break;
2554
2555                 intel_context_unpin(ps->ce[idx]);
2556                 intel_context_put(ps->ce[idx]);
2557         }
2558         kfree(ps);
2559
2560         cpu_latency_qos_remove_request(&qos);
2561         kfree(stats);
2562         return err;
2563 }
2564
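/*
 * Editor's note: per-engine thread body p_sync0 submits and synchronously
 * waits on each request in turn, counting how many complete before the
 * timeout.
 */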
2565 static int p_sync0(void *arg)
2566 {
2567         struct perf_stats *p = arg;
2568         struct intel_engine_cs *engine = p->engine;
2569         struct intel_context *ce;
2570         IGT_TIMEOUT(end_time);
2571         unsigned long count;
2572         bool busy;
2573         int err = 0;
2574
2575         ce = intel_context_create(engine);
2576         if (IS_ERR(ce))
2577                 return PTR_ERR(ce);
2578
2579         err = intel_context_pin(ce);
2580         if (err) {
2581                 intel_context_put(ce);
2582                 return err;
2583         }
2584
2585         if (intel_engine_supports_stats(engine)) {
2586                 p->busy = intel_engine_get_busy_time(engine, &p->time);
2587                 busy = true;
2588         } else {
2589                 p->time = ktime_get();
2590                 busy = false;
2591         }
2592
2593         count = 0;
2594         do {
2595                 struct i915_request *rq;
2596
2597                 rq = i915_request_create(ce);
2598                 if (IS_ERR(rq)) {
2599                         err = PTR_ERR(rq);
2600                         break;
2601                 }
2602
2603                 i915_request_get(rq);
2604                 i915_request_add(rq);
2605
2606                 err = 0;
2607                 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2608                         err = -ETIME;
2609                 i915_request_put(rq);
2610                 if (err)
2611                         break;
2612
2613                 count++;
2614         } while (!__igt_timeout(end_time, NULL));
2615
2616         if (busy) {
2617                 ktime_t now;
2618
2619                 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2620                                     p->busy);
2621                 p->time = ktime_sub(now, p->time);
2622         } else {
2623                 p->time = ktime_sub(ktime_get(), p->time);
2624         }
2625
2626         err = switch_to_kernel_sync(ce, err);
2627         p->runtime = intel_context_get_total_runtime_ns(ce);
2628         p->count = count;
2629
2630         intel_context_unpin(ce);
2631         intel_context_put(ce);
2632         return err;
2633 }
2634
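/*
 * Editor's note: per-engine thread body p_sync1 pipelines by one request
 * (waiting on the previous submission after queueing the next), counting
 * how many requests were pipelined before the timeout.
 */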
2635 static int p_sync1(void *arg)
2636 {
2637         struct perf_stats *p = arg;
2638         struct intel_engine_cs *engine = p->engine;
2639         struct i915_request *prev = NULL;
2640         struct intel_context *ce;
2641         IGT_TIMEOUT(end_time);
2642         unsigned long count;
2643         bool busy;
2644         int err = 0;
2645
2646         ce = intel_context_create(engine);
2647         if (IS_ERR(ce))
2648                 return PTR_ERR(ce);
2649
2650         err = intel_context_pin(ce);
2651         if (err) {
2652                 intel_context_put(ce);
2653                 return err;
2654         }
2655
2656         if (intel_engine_supports_stats(engine)) {
2657                 p->busy = intel_engine_get_busy_time(engine, &p->time);
2658                 busy = true;
2659         } else {
2660                 p->time = ktime_get();
2661                 busy = false;
2662         }
2663
2664         count = 0;
2665         do {
2666                 struct i915_request *rq;
2667
2668                 rq = i915_request_create(ce);
2669                 if (IS_ERR(rq)) {
2670                         err = PTR_ERR(rq);
2671                         break;
2672                 }
2673
2674                 i915_request_get(rq);
2675                 i915_request_add(rq);
2676
2677                 err = 0;
2678                 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2679                         err = -ETIME;
2680                 i915_request_put(prev);
2681                 prev = rq;
2682                 if (err)
2683                         break;
2684
2685                 count++;
2686         } while (!__igt_timeout(end_time, NULL));
2687         i915_request_put(prev);
2688
2689         if (busy) {
2690                 ktime_t now;
2691
2692                 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2693                                     p->busy);
2694                 p->time = ktime_sub(now, p->time);
2695         } else {
2696                 p->time = ktime_sub(ktime_get(), p->time);
2697         }
2698
2699         err = switch_to_kernel_sync(ce, err);
2700         p->runtime = intel_context_get_total_runtime_ns(ce);
2701         p->count = count;
2702
2703         intel_context_unpin(ce);
2704         intel_context_put(ce);
2705         return err;
2706 }
2707
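/*
 * Editor's note: per-engine thread body p_many submits requests without
 * waiting, counting how many could be queued before the timeout.
 */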
2708 static int p_many(void *arg)
2709 {
2710         struct perf_stats *p = arg;
2711         struct intel_engine_cs *engine = p->engine;
2712         struct intel_context *ce;
2713         IGT_TIMEOUT(end_time);
2714         unsigned long count;
2715         int err = 0;
2716         bool busy;
2717
2718         ce = intel_context_create(engine);
2719         if (IS_ERR(ce))
2720                 return PTR_ERR(ce);
2721
2722         err = intel_context_pin(ce);
2723         if (err) {
2724                 intel_context_put(ce);
2725                 return err;
2726         }
2727
2728         if (intel_engine_supports_stats(engine)) {
2729                 p->busy = intel_engine_get_busy_time(engine, &p->time);
2730                 busy = true;
2731         } else {
2732                 p->time = ktime_get();
2733                 busy = false;
2734         }
2735
2736         count = 0;
2737         do {
2738                 struct i915_request *rq;
2739
2740                 rq = i915_request_create(ce);
2741                 if (IS_ERR(rq)) {
2742                         err = PTR_ERR(rq);
2743                         break;
2744                 }
2745
2746                 i915_request_add(rq);
2747                 count++;
2748         } while (!__igt_timeout(end_time, NULL));
2749
2750         if (busy) {
2751                 ktime_t now;
2752
2753                 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2754                                     p->busy);
2755                 p->time = ktime_sub(now, p->time);
2756         } else {
2757                 p->time = ktime_sub(ktime_get(), p->time);
2758         }
2759
2760         err = switch_to_kernel_sync(ce, err);
2761         p->runtime = intel_context_get_total_runtime_ns(ce);
2762         p->count = count;
2763
2764         intel_context_unpin(ce);
2765         intel_context_put(ce);
2766         return err;
2767 }
2768
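/*
 * Editor's note: perf_parallel_engines() runs each per-engine load above in
 * its own kthread, all engines in parallel, then reports per-engine
 * throughput, busyness and runtime.
 */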
2769 static int perf_parallel_engines(void *arg)
2770 {
2771         struct drm_i915_private *i915 = arg;
2772         static int (* const func[])(void *arg) = {
2773                 p_sync0,
2774                 p_sync1,
2775                 p_many,
2776                 NULL,
2777         };
2778         const unsigned int nengines = num_uabi_engines(i915);
2779         struct intel_engine_cs *engine;
2780         int (* const *fn)(void *arg);
2781         struct pm_qos_request qos;
2782         struct {
2783                 struct perf_stats p;
2784                 struct task_struct *tsk;
2785         } *engines;
2786         int err = 0;
2787
2788         engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
2789         if (!engines)
2790                 return -ENOMEM;
2791
2792         cpu_latency_qos_add_request(&qos, 0);
2793
2794         for (fn = func; *fn; fn++) {
2795                 char name[KSYM_NAME_LEN];
2796                 struct igt_live_test t;
2797                 unsigned int idx;
2798
2799                 snprintf(name, sizeof(name), "%ps", *fn);
2800                 err = igt_live_test_begin(&t, i915, __func__, name);
2801                 if (err)
2802                         break;
2803
2804                 atomic_set(&i915->selftest.counter, nengines);
2805
2806                 idx = 0;
2807                 for_each_uabi_engine(engine, i915) {
2808                         intel_engine_pm_get(engine);
2809
2810                         memset(&engines[idx].p, 0, sizeof(engines[idx].p));
2811                         engines[idx].p.engine = engine;
2812
2813                         engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
2814                                                        "igt:%s", engine->name);
2815                         if (IS_ERR(engines[idx].tsk)) {
2816                                 err = PTR_ERR(engines[idx].tsk);
2817                                 intel_engine_pm_put(engine);
2818                                 break;
2819                         }
2820                         get_task_struct(engines[idx++].tsk);
2821                 }
2822
2823                 yield(); /* start all threads before we kthread_stop() */
2824
2825                 idx = 0;
2826                 for_each_uabi_engine(engine, i915) {
2827                         int status;
2828
2829                         if (IS_ERR(engines[idx].tsk))
2830                                 break;
2831
2832                         status = kthread_stop(engines[idx].tsk);
2833                         if (status && !err)
2834                                 err = status;
2835
2836                         intel_engine_pm_put(engine);
2837                         put_task_struct(engines[idx++].tsk);
2838                 }
2839
2840                 if (igt_live_test_end(&t))
2841                         err = -EIO;
2842                 if (err)
2843                         break;
2844
2845                 idx = 0;
2846                 for_each_uabi_engine(engine, i915) {
2847                         struct perf_stats *p = &engines[idx].p;
2848                         u64 busy = 100 * ktime_to_ns(p->busy);
2849                         u64 dt = ktime_to_ns(p->time);
2850                         int integer, decimal;
2851
2852                         if (dt) {
2853                                 integer = div64_u64(busy, dt);
2854                                 busy -= integer * dt;
2855                                 decimal = div64_u64(100 * busy, dt);
2856                         } else {
2857                                 integer = 0;
2858                                 decimal = 0;
2859                         }
2860
2861                         GEM_BUG_ON(engine != p->engine);
2862                         pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2863                                 name, engine->name, p->count, integer, decimal,
2864                                 div_u64(p->runtime, 1000 * 1000),
2865                                 div_u64(ktime_to_ns(p->time), 1000 * 1000));
2866                         idx++;
2867                 }
2868         }
2869
2870         cpu_latency_qos_remove_request(&qos);
2871         kfree(engines);
2872         return err;
2873 }
2874
2875 int i915_request_perf_selftests(struct drm_i915_private *i915)
2876 {
2877         static const struct i915_subtest tests[] = {
2878                 SUBTEST(perf_request_latency),
2879                 SUBTEST(perf_series_engines),
2880                 SUBTEST(perf_parallel_engines),
2881         };
2882
2883         if (intel_gt_is_wedged(&i915->gt))
2884                 return 0;
2885
2886         return i915_subtests(tests, i915);
2887 }