drivers/gpu/drm/i915/selftests/i915_request.c
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24
25 #include <linux/prime_numbers.h>
26 #include <linux/pm_qos.h>
27 #include <linux/sort.h>
28
29 #include "gem/i915_gem_pm.h"
30 #include "gem/selftests/mock_context.h"
31
32 #include "gt/intel_engine_heartbeat.h"
33 #include "gt/intel_engine_pm.h"
34 #include "gt/intel_engine_user.h"
35 #include "gt/intel_gt.h"
36 #include "gt/intel_gt_clock_utils.h"
37 #include "gt/intel_gt_requests.h"
38 #include "gt/selftest_engine_heartbeat.h"
39
40 #include "i915_random.h"
41 #include "i915_selftest.h"
42 #include "igt_flush_test.h"
43 #include "igt_live_test.h"
44 #include "igt_spinner.h"
45 #include "lib_sw_fence.h"
46
47 #include "mock_drm.h"
48 #include "mock_gem_device.h"
49
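/* Count the engines exposed to userspace (for_each_uabi_engine) on this device. */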
50 static unsigned int num_uabi_engines(struct drm_i915_private *i915)
51 {
52         struct intel_engine_cs *engine;
53         unsigned int count;
54
55         count = 0;
56         for_each_uabi_engine(engine, i915)
57                 count++;
58
59         return count;
60 }
61
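/* Look up render engine instance 0 (rcs0) as exposed to userspace. */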
62 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
63 {
64         return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
65 }
66
67 static int igt_add_request(void *arg)
68 {
69         struct drm_i915_private *i915 = arg;
70         struct i915_request *request;
71
72         /* Basic preliminary test to create a request and let it loose! */
73
74         request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
75         if (!request)
76                 return -ENOMEM;
77
78         i915_request_add(request);
79
80         return 0;
81 }
82
83 static int igt_wait_request(void *arg)
84 {
85         const long T = HZ / 4;
86         struct drm_i915_private *i915 = arg;
87         struct i915_request *request;
88         int err = -EINVAL;
89
90         /* Submit a request, then wait upon it */
91
92         request = mock_request(rcs0(i915)->kernel_context, T);
93         if (!request)
94                 return -ENOMEM;
95
96         i915_request_get(request);
97
98         if (i915_request_wait(request, 0, 0) != -ETIME) {
99                 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
100                 goto out_request;
101         }
102
103         if (i915_request_wait(request, 0, T) != -ETIME) {
104                 pr_err("request wait succeeded (expected timeout before submit!)\n");
105                 goto out_request;
106         }
107
108         if (i915_request_completed(request)) {
109                 pr_err("request completed before submit!!\n");
110                 goto out_request;
111         }
112
113         i915_request_add(request);
114
115         if (i915_request_wait(request, 0, 0) != -ETIME) {
116                 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
117                 goto out_request;
118         }
119
120         if (i915_request_completed(request)) {
121                 pr_err("request completed immediately!\n");
122                 goto out_request;
123         }
124
125         if (i915_request_wait(request, 0, T / 2) != -ETIME) {
126                 pr_err("request wait succeeded (expected timeout!)\n");
127                 goto out_request;
128         }
129
130         if (i915_request_wait(request, 0, T) == -ETIME) {
131                 pr_err("request wait timed out!\n");
132                 goto out_request;
133         }
134
135         if (!i915_request_completed(request)) {
136                 pr_err("request not complete after waiting!\n");
137                 goto out_request;
138         }
139
140         if (i915_request_wait(request, 0, T) == -ETIME) {
141                 pr_err("request wait timed out when already complete!\n");
142                 goto out_request;
143         }
144
145         err = 0;
146 out_request:
147         i915_request_put(request);
148         mock_device_flush(i915);
149         return err;
150 }
151
152 static int igt_fence_wait(void *arg)
153 {
154         const long T = HZ / 4;
155         struct drm_i915_private *i915 = arg;
156         struct i915_request *request;
157         int err = -EINVAL;
158
159         /* Submit a request, treat it as a fence and wait upon it */
160
161         request = mock_request(rcs0(i915)->kernel_context, T);
162         if (!request)
163                 return -ENOMEM;
164
165         if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
166                 pr_err("fence wait success before submit (expected timeout)!\n");
167                 goto out;
168         }
169
170         i915_request_add(request);
171
172         if (dma_fence_is_signaled(&request->fence)) {
173                 pr_err("fence signaled immediately!\n");
174                 goto out;
175         }
176
177         if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
178                 pr_err("fence wait success after submit (expected timeout)!\n");
179                 goto out;
180         }
181
182         if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
183                 pr_err("fence wait timed out (expected success)!\n");
184                 goto out;
185         }
186
187         if (!dma_fence_is_signaled(&request->fence)) {
188                 pr_err("fence unsignaled after waiting!\n");
189                 goto out;
190         }
191
192         if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
193                 pr_err("fence wait timed out when complete (expected success)!\n");
194                 goto out;
195         }
196
197         err = 0;
198 out:
199         mock_device_flush(i915);
200         return err;
201 }
202
203 static int igt_request_rewind(void *arg)
204 {
205         struct drm_i915_private *i915 = arg;
206         struct i915_request *request, *vip;
207         struct i915_gem_context *ctx[2];
208         struct intel_context *ce;
209         int err = -EINVAL;
210
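        /*
         * Check that a "vip" request submitted later can be manually
         * reordered (simulating preemption) so that it completes ahead
         * of an earlier, slower request.
         */
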
211         ctx[0] = mock_context(i915, "A");
212
213         ce = i915_gem_context_get_engine(ctx[0], RCS0);
214         GEM_BUG_ON(IS_ERR(ce));
215         request = mock_request(ce, 2 * HZ);
216         intel_context_put(ce);
217         if (!request) {
218                 err = -ENOMEM;
219                 goto err_context_0;
220         }
221
222         i915_request_get(request);
223         i915_request_add(request);
224
225         ctx[1] = mock_context(i915, "B");
226
227         ce = i915_gem_context_get_engine(ctx[1], RCS0);
228         GEM_BUG_ON(IS_ERR(ce));
229         vip = mock_request(ce, 0);
230         intel_context_put(ce);
231         if (!vip) {
232                 err = -ENOMEM;
233                 goto err_context_1;
234         }
235
236         /* Simulate preemption by manual reordering */
237         if (!mock_cancel_request(request)) {
238                 pr_err("failed to cancel request (already executed)!\n");
239                 i915_request_add(vip);
240                 goto err_context_1;
241         }
242         i915_request_get(vip);
243         i915_request_add(vip);
244         rcu_read_lock();
245         request->engine->submit_request(request);
246         rcu_read_unlock();
247
249         if (i915_request_wait(vip, 0, HZ) == -ETIME) {
250                 pr_err("timed out waiting for high priority request\n");
251                 goto err;
252         }
253
254         if (i915_request_completed(request)) {
255                 pr_err("low priority request already completed\n");
256                 goto err;
257         }
258
259         err = 0;
260 err:
261         i915_request_put(vip);
262 err_context_1:
263         mock_context_close(ctx[1]);
264         i915_request_put(request);
265 err_context_0:
266         mock_context_close(ctx[0]);
267         mock_device_flush(i915);
268         return err;
269 }
270
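/*
 * Shared state for the breadcrumb smoketest threads: the target engine,
 * a pool of contexts to allocate requests from, counters for the work
 * performed, and a hook for allocating either mock or live requests.
 */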
271 struct smoketest {
272         struct intel_engine_cs *engine;
273         struct i915_gem_context **contexts;
274         atomic_long_t num_waits, num_fences;
275         int ncontexts, max_batch;
276         struct i915_request *(*request_alloc)(struct intel_context *ce);
277 };
278
279 static struct i915_request *
280 __mock_request_alloc(struct intel_context *ce)
281 {
282         return mock_request(ce, 0);
283 }
284
285 static struct i915_request *
286 __live_request_alloc(struct intel_context *ce)
287 {
288         return intel_context_create_request(ce);
289 }
290
291 static int __igt_breadcrumbs_smoketest(void *arg)
292 {
293         struct smoketest *t = arg;
294         const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
295         const unsigned int total = 4 * t->ncontexts + 1;
296         unsigned int num_waits = 0, num_fences = 0;
297         struct i915_request **requests;
298         I915_RND_STATE(prng);
299         unsigned int *order;
300         int err = 0;
301
302         /*
303          * A very simple test to catch the most egregious of list handling bugs.
304          *
305          * At its heart, we simply create oodles of requests running across
306          * multiple kthreads and enable signaling on them, for the sole purpose
307          * of stressing our breadcrumb handling. The only inspection we do is
308          * that the fences were marked as signaled.
309          */
310
311         requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
312         if (!requests)
313                 return -ENOMEM;
314
315         order = i915_random_order(total, &prng);
316         if (!order) {
317                 err = -ENOMEM;
318                 goto out_requests;
319         }
320
321         while (!kthread_should_stop()) {
322                 struct i915_sw_fence *submit, *wait;
323                 unsigned int n, count;
324
325                 submit = heap_fence_create(GFP_KERNEL);
326                 if (!submit) {
327                         err = -ENOMEM;
328                         break;
329                 }
330
331                 wait = heap_fence_create(GFP_KERNEL);
332                 if (!wait) {
333                         i915_sw_fence_commit(submit);
334                         heap_fence_put(submit);
335                         err = -ENOMEM;
336                         break;
337                 }
338
339                 i915_random_reorder(order, total, &prng);
340                 count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
341
342                 for (n = 0; n < count; n++) {
343                         struct i915_gem_context *ctx =
344                                 t->contexts[order[n] % t->ncontexts];
345                         struct i915_request *rq;
346                         struct intel_context *ce;
347
348                         ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
349                         GEM_BUG_ON(IS_ERR(ce));
350                         rq = t->request_alloc(ce);
351                         intel_context_put(ce);
352                         if (IS_ERR(rq)) {
353                                 err = PTR_ERR(rq);
354                                 count = n;
355                                 break;
356                         }
357
358                         err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
359                                                                submit,
360                                                                GFP_KERNEL);
361
362                         requests[n] = i915_request_get(rq);
363                         i915_request_add(rq);
364
365                         if (err >= 0)
366                                 err = i915_sw_fence_await_dma_fence(wait,
367                                                                     &rq->fence,
368                                                                     0,
369                                                                     GFP_KERNEL);
370
371                         if (err < 0) {
372                                 i915_request_put(rq);
373                                 count = n;
374                                 break;
375                         }
376                 }
377
378                 i915_sw_fence_commit(submit);
379                 i915_sw_fence_commit(wait);
380
381                 if (!wait_event_timeout(wait->wait,
382                                         i915_sw_fence_done(wait),
383                                         5 * HZ)) {
384                         struct i915_request *rq = requests[count - 1];
385
386                         pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
387                                atomic_read(&wait->pending), count,
388                                rq->fence.context, rq->fence.seqno,
389                                t->engine->name);
390                         GEM_TRACE_DUMP();
391
392                         intel_gt_set_wedged(t->engine->gt);
393                         GEM_BUG_ON(!i915_request_completed(rq));
394                         i915_sw_fence_wait(wait);
395                         err = -EIO;
396                 }
397
398                 for (n = 0; n < count; n++) {
399                         struct i915_request *rq = requests[n];
400
401                         if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
402                                       &rq->fence.flags)) {
403                                 pr_err("%llu:%llu was not signaled!\n",
404                                        rq->fence.context, rq->fence.seqno);
405                                 err = -EINVAL;
406                         }
407
408                         i915_request_put(rq);
409                 }
410
411                 heap_fence_put(wait);
412                 heap_fence_put(submit);
413
414                 if (err < 0)
415                         break;
416
417                 num_fences += count;
418                 num_waits++;
419
420                 cond_resched();
421         }
422
423         atomic_long_add(num_fences, &t->num_fences);
424         atomic_long_add(num_waits, &t->num_waits);
425
426         kfree(order);
427 out_requests:
428         kfree(requests);
429         return err;
430 }
431
432 static int mock_breadcrumbs_smoketest(void *arg)
433 {
434         struct drm_i915_private *i915 = arg;
435         struct smoketest t = {
436                 .engine = rcs0(i915),
437                 .ncontexts = 1024,
438                 .max_batch = 1024,
439                 .request_alloc = __mock_request_alloc
440         };
441         unsigned int ncpus = num_online_cpus();
442         struct task_struct **threads;
443         unsigned int n;
444         int ret = 0;
445
446         /*
447          * Smoketest our breadcrumb/signal handling for requests across multiple
448          * threads. A very simple test to only catch the most egregious of bugs.
449          * See __igt_breadcrumbs_smoketest();
450          */
451
452         threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
453         if (!threads)
454                 return -ENOMEM;
455
456         t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
457         if (!t.contexts) {
458                 ret = -ENOMEM;
459                 goto out_threads;
460         }
461
462         for (n = 0; n < t.ncontexts; n++) {
463                 t.contexts[n] = mock_context(t.engine->i915, "mock");
464                 if (!t.contexts[n]) {
465                         ret = -ENOMEM;
466                         goto out_contexts;
467                 }
468         }
469
470         for (n = 0; n < ncpus; n++) {
471                 threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
472                                          &t, "igt/%d", n);
473                 if (IS_ERR(threads[n])) {
474                         ret = PTR_ERR(threads[n]);
475                         ncpus = n;
476                         break;
477                 }
478
479                 get_task_struct(threads[n]);
480         }
481
482         yield(); /* start all threads before we begin */
483         msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
484
485         for (n = 0; n < ncpus; n++) {
486                 int err;
487
488                 err = kthread_stop(threads[n]);
489                 if (err < 0 && !ret)
490                         ret = err;
491
492                 put_task_struct(threads[n]);
493         }
494         pr_info("Completed %lu waits for %lu fences across %d cpus\n",
495                 atomic_long_read(&t.num_waits),
496                 atomic_long_read(&t.num_fences),
497                 ncpus);
498
499 out_contexts:
500         for (n = 0; n < t.ncontexts; n++) {
501                 if (!t.contexts[n])
502                         break;
503                 mock_context_close(t.contexts[n]);
504         }
505         kfree(t.contexts);
506 out_threads:
507         kfree(threads);
508         return ret;
509 }
510
511 int i915_request_mock_selftests(void)
512 {
513         static const struct i915_subtest tests[] = {
514                 SUBTEST(igt_add_request),
515                 SUBTEST(igt_wait_request),
516                 SUBTEST(igt_fence_wait),
517                 SUBTEST(igt_request_rewind),
518                 SUBTEST(mock_breadcrumbs_smoketest),
519         };
520         struct drm_i915_private *i915;
521         intel_wakeref_t wakeref;
522         int err = 0;
523
524         i915 = mock_gem_device();
525         if (!i915)
526                 return -ENOMEM;
527
528         with_intel_runtime_pm(&i915->runtime_pm, wakeref)
529                 err = i915_subtests(tests, i915);
530
531         mock_destroy_device(i915);
532
533         return err;
534 }
535
536 static int live_nop_request(void *arg)
537 {
538         struct drm_i915_private *i915 = arg;
539         struct intel_engine_cs *engine;
540         struct igt_live_test t;
541         int err = -ENODEV;
542
543         /*
544          * Submit various sized batches of empty requests, to each engine
545          * (individually), and wait for the batch to complete. We can check
546          * the overhead of submitting requests to the hardware.
547          */
548
549         for_each_uabi_engine(engine, i915) {
550                 unsigned long n, prime;
551                 IGT_TIMEOUT(end_time);
552                 ktime_t times[2] = {};
553
554                 err = igt_live_test_begin(&t, i915, __func__, engine->name);
555                 if (err)
556                         return err;
557
558                 intel_engine_pm_get(engine);
559                 for_each_prime_number_from(prime, 1, 8192) {
560                         struct i915_request *request = NULL;
561
562                         times[1] = ktime_get_raw();
563
564                         for (n = 0; n < prime; n++) {
565                                 i915_request_put(request);
566                                 request = i915_request_create(engine->kernel_context);
567                                 if (IS_ERR(request))
568                                         return PTR_ERR(request);
569
570                                 /*
571                                  * This space is left intentionally blank.
572                                  *
573                                  * We do not actually want to perform any
574                                  * action with this request, we just want
575                                  * to measure the latency in allocation
576                                  * and submission of our breadcrumbs -
577                                  * ensuring that the bare request is sufficient
578                                  * for the system to work (i.e. proper HEAD
579                                  * tracking of the rings, interrupt handling,
580                                  * etc). It also gives us the lowest bounds
581                                  * for latency.
582                                  */
583
584                                 i915_request_get(request);
585                                 i915_request_add(request);
586                         }
587                         i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
588                         i915_request_put(request);
589
590                         times[1] = ktime_sub(ktime_get_raw(), times[1]);
591                         if (prime == 1)
592                                 times[0] = times[1];
593
594                         if (__igt_timeout(end_time, NULL))
595                                 break;
596                 }
597                 intel_engine_pm_put(engine);
598
599                 err = igt_live_test_end(&t);
600                 if (err)
601                         return err;
602
603                 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
604                         engine->name,
605                         ktime_to_ns(times[0]),
606                         prime, div64_u64(ktime_to_ns(times[1]), prime));
607         }
608
609         return err;
610 }
611
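/*
 * Cancel a request before it has been submitted to the hardware and
 * check that it still retires promptly with -EINTR as its fence error.
 */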
612 static int __cancel_inactive(struct intel_engine_cs *engine)
613 {
614         struct intel_context *ce;
615         struct igt_spinner spin;
616         struct i915_request *rq;
617         int err = 0;
618
619         if (igt_spinner_init(&spin, engine->gt))
620                 return -ENOMEM;
621
622         ce = intel_context_create(engine);
623         if (IS_ERR(ce)) {
624                 err = PTR_ERR(ce);
625                 goto out_spin;
626         }
627
628         rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
629         if (IS_ERR(rq)) {
630                 err = PTR_ERR(rq);
631                 goto out_ce;
632         }
633
634         pr_debug("%s: Cancelling inactive request\n", engine->name);
635         i915_request_cancel(rq, -EINTR);
636         i915_request_get(rq);
637         i915_request_add(rq);
638
639         if (i915_request_wait(rq, 0, HZ / 5) < 0) {
640                 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
641
642                 pr_err("%s: Failed to cancel inactive request\n", engine->name);
643                 intel_engine_dump(engine, &p, "%s\n", engine->name);
644                 err = -ETIME;
645                 goto out_rq;
646         }
647
648         if (rq->fence.error != -EINTR) {
649                 pr_err("%s: fence not cancelled (%u)\n",
650                        engine->name, rq->fence.error);
651                 err = -EINVAL;
652         }
653
654 out_rq:
655         i915_request_put(rq);
656 out_ce:
657         intel_context_put(ce);
658 out_spin:
659         igt_spinner_fini(&spin);
660         if (err)
661                 pr_err("%s: %s error %d\n", __func__, engine->name, err);
662         return err;
663 }
664
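/*
 * Cancel a spinning request while it is running on the GPU and check
 * that it is terminated with -EINTR as its fence error.
 */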
665 static int __cancel_active(struct intel_engine_cs *engine)
666 {
667         struct intel_context *ce;
668         struct igt_spinner spin;
669         struct i915_request *rq;
670         int err = 0;
671
672         if (igt_spinner_init(&spin, engine->gt))
673                 return -ENOMEM;
674
675         ce = intel_context_create(engine);
676         if (IS_ERR(ce)) {
677                 err = PTR_ERR(ce);
678                 goto out_spin;
679         }
680
681         rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
682         if (IS_ERR(rq)) {
683                 err = PTR_ERR(rq);
684                 goto out_ce;
685         }
686
687         pr_debug("%s: Cancelling active request\n", engine->name);
688         i915_request_get(rq);
689         i915_request_add(rq);
690         if (!igt_wait_for_spinner(&spin, rq)) {
691                 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
692
693                 pr_err("Failed to start spinner on %s\n", engine->name);
694                 intel_engine_dump(engine, &p, "%s\n", engine->name);
695                 err = -ETIME;
696                 goto out_rq;
697         }
698         i915_request_cancel(rq, -EINTR);
699
700         if (i915_request_wait(rq, 0, HZ / 5) < 0) {
701                 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
702
703                 pr_err("%s: Failed to cancel active request\n", engine->name);
704                 intel_engine_dump(engine, &p, "%s\n", engine->name);
705                 err = -ETIME;
706                 goto out_rq;
707         }
708
709         if (rq->fence.error != -EINTR) {
710                 pr_err("%s: fence not cancelled (%u)\n",
711                        engine->name, rq->fence.error);
712                 err = -EINVAL;
713         }
714
715 out_rq:
716         i915_request_put(rq);
717 out_ce:
718         intel_context_put(ce);
719 out_spin:
720         igt_spinner_fini(&spin);
721         if (err)
722                 pr_err("%s: %s error %d\n", __func__, engine->name, err);
723         return err;
724 }
725
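/*
 * Cancel a request only after it has already completed; the cancellation
 * should be a no-op and leave the fence error untouched.
 */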
726 static int __cancel_completed(struct intel_engine_cs *engine)
727 {
728         struct intel_context *ce;
729         struct igt_spinner spin;
730         struct i915_request *rq;
731         int err = 0;
732
733         if (igt_spinner_init(&spin, engine->gt))
734                 return -ENOMEM;
735
736         ce = intel_context_create(engine);
737         if (IS_ERR(ce)) {
738                 err = PTR_ERR(ce);
739                 goto out_spin;
740         }
741
742         rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
743         if (IS_ERR(rq)) {
744                 err = PTR_ERR(rq);
745                 goto out_ce;
746         }
747         igt_spinner_end(&spin);
748         i915_request_get(rq);
749         i915_request_add(rq);
750
751         if (i915_request_wait(rq, 0, HZ / 5) < 0) {
752                 err = -ETIME;
753                 goto out_rq;
754         }
755
756         pr_debug("%s: Cancelling completed request\n", engine->name);
757         i915_request_cancel(rq, -EINTR);
758         if (rq->fence.error) {
759                 pr_err("%s: fence error set by cancelling a completed request (%u)\n",
760                        engine->name, rq->fence.error);
761                 err = -EINVAL;
762         }
763
764 out_rq:
765         i915_request_put(rq);
766 out_ce:
767         intel_context_put(ce);
768 out_spin:
769         igt_spinner_fini(&spin);
770         if (err)
771                 pr_err("%s: %s error %d\n", __func__, engine->name, err);
772         return err;
773 }
774
775 static int live_cancel_request(void *arg)
776 {
777         struct drm_i915_private *i915 = arg;
778         struct intel_engine_cs *engine;
779
780         /*
781          * Check cancellation of requests. We expect to be able to immediately
782          * cancel active requests, even if they are currently on the GPU.
783          */
784
785         for_each_uabi_engine(engine, i915) {
786                 struct igt_live_test t;
787                 int err, err2;
788
789                 if (!intel_engine_has_preemption(engine))
790                         continue;
791
792                 err = igt_live_test_begin(&t, i915, __func__, engine->name);
793                 if (err)
794                         return err;
795
796                 err = __cancel_inactive(engine);
797                 if (err == 0)
798                         err = __cancel_active(engine);
799                 if (err == 0)
800                         err = __cancel_completed(engine);
801
802                 err2 = igt_live_test_end(&t);
803                 if (err)
804                         return err;
805                 if (err2)
806                         return err2;
807         }
808
809         return 0;
810 }
811
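/*
 * Create a minimal batch buffer containing only MI_BATCH_BUFFER_END,
 * pinned into the global GTT, for measuring raw submission overhead.
 */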
812 static struct i915_vma *empty_batch(struct drm_i915_private *i915)
813 {
814         struct drm_i915_gem_object *obj;
815         struct i915_vma *vma;
816         u32 *cmd;
817         int err;
818
819         obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
820         if (IS_ERR(obj))
821                 return ERR_CAST(obj);
822
823         cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB);
824         if (IS_ERR(cmd)) {
825                 err = PTR_ERR(cmd);
826                 goto err;
827         }
828
829         *cmd = MI_BATCH_BUFFER_END;
830
831         __i915_gem_object_flush_map(obj, 0, 64);
832         i915_gem_object_unpin_map(obj);
833
834         intel_gt_chipset_flush(&i915->gt);
835
836         vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
837         if (IS_ERR(vma)) {
838                 err = PTR_ERR(vma);
839                 goto err;
840         }
841
842         err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
843         if (err)
844                 goto err;
845
846         /* Force the wait now to avoid including it in the benchmark */
847         err = i915_vma_sync(vma);
848         if (err)
849                 goto err_pin;
850
851         return vma;
852
853 err_pin:
854         i915_vma_unpin(vma);
855 err:
856         i915_gem_object_put(obj);
857         return ERR_PTR(err);
858 }
859
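/* Submit the empty batch on the given engine and return the request. */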
860 static struct i915_request *
861 empty_request(struct intel_engine_cs *engine,
862               struct i915_vma *batch)
863 {
864         struct i915_request *request;
865         int err;
866
867         request = i915_request_create(engine->kernel_context);
868         if (IS_ERR(request))
869                 return request;
870
871         err = engine->emit_bb_start(request,
872                                     batch->node.start,
873                                     batch->node.size,
874                                     I915_DISPATCH_SECURE);
875         if (err)
876                 goto out_request;
877
878         i915_request_get(request);
879 out_request:
880         i915_request_add(request);
881         return err ? ERR_PTR(err) : request;
882 }
883
884 static int live_empty_request(void *arg)
885 {
886         struct drm_i915_private *i915 = arg;
887         struct intel_engine_cs *engine;
888         struct igt_live_test t;
889         struct i915_vma *batch;
890         int err = 0;
891
892         /*
893          * Submit various sized batches of empty requests, to each engine
894          * (individually), and wait for the batch to complete. We can check
895          * the overhead of submitting requests to the hardware.
896          */
897
898         batch = empty_batch(i915);
899         if (IS_ERR(batch))
900                 return PTR_ERR(batch);
901
902         for_each_uabi_engine(engine, i915) {
903                 IGT_TIMEOUT(end_time);
904                 struct i915_request *request;
905                 unsigned long n, prime;
906                 ktime_t times[2] = {};
907
908                 err = igt_live_test_begin(&t, i915, __func__, engine->name);
909                 if (err)
910                         goto out_batch;
911
912                 intel_engine_pm_get(engine);
913
914                 /* Warmup / preload */
915                 request = empty_request(engine, batch);
916                 if (IS_ERR(request)) {
917                         err = PTR_ERR(request);
918                         intel_engine_pm_put(engine);
919                         goto out_batch;
920                 }
921                 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
922
923                 for_each_prime_number_from(prime, 1, 8192) {
924                         times[1] = ktime_get_raw();
925
926                         for (n = 0; n < prime; n++) {
927                                 i915_request_put(request);
928                                 request = empty_request(engine, batch);
929                                 if (IS_ERR(request)) {
930                                         err = PTR_ERR(request);
931                                         intel_engine_pm_put(engine);
932                                         goto out_batch;
933                                 }
934                         }
935                         i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
936
937                         times[1] = ktime_sub(ktime_get_raw(), times[1]);
938                         if (prime == 1)
939                                 times[0] = times[1];
940
941                         if (__igt_timeout(end_time, NULL))
942                                 break;
943                 }
944                 i915_request_put(request);
945                 intel_engine_pm_put(engine);
946
947                 err = igt_live_test_end(&t);
948                 if (err)
949                         goto out_batch;
950
951                 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
952                         engine->name,
953                         ktime_to_ns(times[0]),
954                         prime, div64_u64(ktime_to_ns(times[1]), prime));
955         }
956
957 out_batch:
958         i915_vma_unpin(batch);
959         i915_vma_put(batch);
960         return err;
961 }
962
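/*
 * Create a batch that branches back to its own start and so spins
 * forever, until recursive_batch_resolve() rewrites the first dword
 * to MI_BATCH_BUFFER_END.
 */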
963 static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
964 {
965         struct drm_i915_gem_object *obj;
966         const int gen = INTEL_GEN(i915);
967         struct i915_vma *vma;
968         u32 *cmd;
969         int err;
970
971         obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
972         if (IS_ERR(obj))
973                 return ERR_CAST(obj);
974
975         vma = i915_vma_instance(obj, i915->gt.vm, NULL);
976         if (IS_ERR(vma)) {
977                 err = PTR_ERR(vma);
978                 goto err;
979         }
980
981         err = i915_vma_pin(vma, 0, 0, PIN_USER);
982         if (err)
983                 goto err;
984
985         cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
986         if (IS_ERR(cmd)) {
987                 err = PTR_ERR(cmd);
988                 goto err;
989         }
990
991         if (gen >= 8) {
992                 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
993                 *cmd++ = lower_32_bits(vma->node.start);
994                 *cmd++ = upper_32_bits(vma->node.start);
995         } else if (gen >= 6) {
996                 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
997                 *cmd++ = lower_32_bits(vma->node.start);
998         } else {
999                 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1000                 *cmd++ = lower_32_bits(vma->node.start);
1001         }
1002         *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
1003
1004         __i915_gem_object_flush_map(obj, 0, 64);
1005         i915_gem_object_unpin_map(obj);
1006
1007         intel_gt_chipset_flush(&i915->gt);
1008
1009         return vma;
1010
1011 err:
1012         i915_gem_object_put(obj);
1013         return ERR_PTR(err);
1014 }
1015
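/*
 * Replace the self-referencing branch with MI_BATCH_BUFFER_END so that
 * the spinning batch terminates.
 */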
1016 static int recursive_batch_resolve(struct i915_vma *batch)
1017 {
1018         u32 *cmd;
1019
1020         cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC);
1021         if (IS_ERR(cmd))
1022                 return PTR_ERR(cmd);
1023
1024         *cmd = MI_BATCH_BUFFER_END;
1025
1026         __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
1027         i915_gem_object_unpin_map(batch->obj);
1028
1029         intel_gt_chipset_flush(batch->vm->gt);
1030
1031         return 0;
1032 }
1033
1034 static int live_all_engines(void *arg)
1035 {
1036         struct drm_i915_private *i915 = arg;
1037         const unsigned int nengines = num_uabi_engines(i915);
1038         struct intel_engine_cs *engine;
1039         struct i915_request **request;
1040         struct igt_live_test t;
1041         struct i915_vma *batch;
1042         unsigned int idx;
1043         int err;
1044
1045         /*
1046          * Check we can submit requests to all engines simultaneously. We
1047          * send a recursive batch to each engine - checking that we don't
1048          * block doing so, and that they don't complete too soon.
1049          */
1050
1051         request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1052         if (!request)
1053                 return -ENOMEM;
1054
1055         err = igt_live_test_begin(&t, i915, __func__, "");
1056         if (err)
1057                 goto out_free;
1058
1059         batch = recursive_batch(i915);
1060         if (IS_ERR(batch)) {
1061                 err = PTR_ERR(batch);
1062                 pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
1063                 goto out_free;
1064         }
1065
1066         i915_vma_lock(batch);
1067
1068         idx = 0;
1069         for_each_uabi_engine(engine, i915) {
1070                 request[idx] = intel_engine_create_kernel_request(engine);
1071                 if (IS_ERR(request[idx])) {
1072                         err = PTR_ERR(request[idx]);
1073                         pr_err("%s: Request allocation failed with err=%d\n",
1074                                __func__, err);
1075                         goto out_request;
1076                 }
1077
1078                 err = i915_request_await_object(request[idx], batch->obj, 0);
1079                 if (err == 0)
1080                         err = i915_vma_move_to_active(batch, request[idx], 0);
1081                 GEM_BUG_ON(err);
1082
1083                 err = engine->emit_bb_start(request[idx],
1084                                             batch->node.start,
1085                                             batch->node.size,
1086                                             0);
1087                 GEM_BUG_ON(err);
1088                 request[idx]->batch = batch;
1089
1090                 i915_request_get(request[idx]);
1091                 i915_request_add(request[idx]);
1092                 idx++;
1093         }
1094
1095         i915_vma_unlock(batch);
1096
1097         idx = 0;
1098         for_each_uabi_engine(engine, i915) {
1099                 if (i915_request_completed(request[idx])) {
1100                         pr_err("%s(%s): request completed too early!\n",
1101                                __func__, engine->name);
1102                         err = -EINVAL;
1103                         goto out_request;
1104                 }
1105                 idx++;
1106         }
1107
1108         err = recursive_batch_resolve(batch);
1109         if (err) {
1110                 pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
1111                 goto out_request;
1112         }
1113
1114         idx = 0;
1115         for_each_uabi_engine(engine, i915) {
1116                 long timeout;
1117
1118                 timeout = i915_request_wait(request[idx], 0,
1119                                             MAX_SCHEDULE_TIMEOUT);
1120                 if (timeout < 0) {
1121                         err = timeout;
1122                         pr_err("%s: error waiting for request on %s, err=%d\n",
1123                                __func__, engine->name, err);
1124                         goto out_request;
1125                 }
1126
1127                 GEM_BUG_ON(!i915_request_completed(request[idx]));
1128                 i915_request_put(request[idx]);
1129                 request[idx] = NULL;
1130                 idx++;
1131         }
1132
1133         err = igt_live_test_end(&t);
1134
1135 out_request:
1136         idx = 0;
1137         for_each_uabi_engine(engine, i915) {
1138                 if (request[idx])
1139                         i915_request_put(request[idx]);
1140                 idx++;
1141         }
1142         i915_vma_unpin(batch);
1143         i915_vma_put(batch);
1144 out_free:
1145         kfree(request);
1146         return err;
1147 }
1148
1149 static int live_sequential_engines(void *arg)
1150 {
1151         struct drm_i915_private *i915 = arg;
1152         const unsigned int nengines = num_uabi_engines(i915);
1153         struct i915_request **request;
1154         struct i915_request *prev = NULL;
1155         struct intel_engine_cs *engine;
1156         struct igt_live_test t;
1157         unsigned int idx;
1158         int err;
1159
1160         /*
1161          * Check we can submit requests to all engines sequentially, such
1162          * that each successive request waits for the earlier ones. This
1163          * tests that we don't execute requests out of order, even though
1164          * they are running on independent engines.
1165          */
1166
1167         request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1168         if (!request)
1169                 return -ENOMEM;
1170
1171         err = igt_live_test_begin(&t, i915, __func__, "");
1172         if (err)
1173                 goto out_free;
1174
1175         idx = 0;
1176         for_each_uabi_engine(engine, i915) {
1177                 struct i915_vma *batch;
1178
1179                 batch = recursive_batch(i915);
1180                 if (IS_ERR(batch)) {
1181                         err = PTR_ERR(batch);
1182                         pr_err("%s: Unable to create batch for %s, err=%d\n",
1183                                __func__, engine->name, err);
1184                         goto out_free;
1185                 }
1186
1187                 i915_vma_lock(batch);
1188                 request[idx] = intel_engine_create_kernel_request(engine);
1189                 if (IS_ERR(request[idx])) {
1190                         err = PTR_ERR(request[idx]);
1191                         pr_err("%s: Request allocation failed for %s with err=%d\n",
1192                                __func__, engine->name, err);
1193                         goto out_unlock;
1194                 }
1195
1196                 if (prev) {
1197                         err = i915_request_await_dma_fence(request[idx],
1198                                                            &prev->fence);
1199                         if (err) {
1200                                 i915_request_add(request[idx]);
1201                                 pr_err("%s: Request await failed for %s with err=%d\n",
1202                                        __func__, engine->name, err);
1203                                 goto out_unlock;
1204                         }
1205                 }
1206
1207                 err = i915_request_await_object(request[idx],
1208                                                 batch->obj, false);
1209                 if (err == 0)
1210                         err = i915_vma_move_to_active(batch, request[idx], 0);
1211                 GEM_BUG_ON(err);
1212
1213                 err = engine->emit_bb_start(request[idx],
1214                                             batch->node.start,
1215                                             batch->node.size,
1216                                             0);
1217                 GEM_BUG_ON(err);
1218                 request[idx]->batch = batch;
1219
1220                 i915_request_get(request[idx]);
1221                 i915_request_add(request[idx]);
1222
1223                 prev = request[idx];
1224                 idx++;
1225
1226 out_unlock:
1227                 i915_vma_unlock(batch);
1228                 if (err)
1229                         goto out_request;
1230         }
1231
1232         idx = 0;
1233         for_each_uabi_engine(engine, i915) {
1234                 long timeout;
1235
1236                 if (i915_request_completed(request[idx])) {
1237                         pr_err("%s(%s): request completed too early!\n",
1238                                __func__, engine->name);
1239                         err = -EINVAL;
1240                         goto out_request;
1241                 }
1242
1243                 err = recursive_batch_resolve(request[idx]->batch);
1244                 if (err) {
1245                         pr_err("%s: failed to resolve batch, err=%d\n",
1246                                __func__, err);
1247                         goto out_request;
1248                 }
1249
1250                 timeout = i915_request_wait(request[idx], 0,
1251                                             MAX_SCHEDULE_TIMEOUT);
1252                 if (timeout < 0) {
1253                         err = timeout;
1254                         pr_err("%s: error waiting for request on %s, err=%d\n",
1255                                __func__, engine->name, err);
1256                         goto out_request;
1257                 }
1258
1259                 GEM_BUG_ON(!i915_request_completed(request[idx]));
1260                 idx++;
1261         }
1262
1263         err = igt_live_test_end(&t);
1264
1265 out_request:
1266         idx = 0;
1267         for_each_uabi_engine(engine, i915) {
1268                 u32 *cmd;
1269
1270                 if (!request[idx])
1271                         break;
1272
1273                 cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj,
1274                                                        I915_MAP_WC);
1275                 if (!IS_ERR(cmd)) {
1276                         *cmd = MI_BATCH_BUFFER_END;
1277
1278                         __i915_gem_object_flush_map(request[idx]->batch->obj,
1279                                                     0, sizeof(*cmd));
1280                         i915_gem_object_unpin_map(request[idx]->batch->obj);
1281
1282                         intel_gt_chipset_flush(engine->gt);
1283                 }
1284
1285                 i915_vma_put(request[idx]->batch);
1286                 i915_request_put(request[idx]);
1287                 idx++;
1288         }
1289 out_free:
1290         kfree(request);
1291         return err;
1292 }
1293
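/* Submit requests one at a time, synchronously waiting for each to complete. */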
1294 static int __live_parallel_engine1(void *arg)
1295 {
1296         struct intel_engine_cs *engine = arg;
1297         IGT_TIMEOUT(end_time);
1298         unsigned long count;
1299         int err = 0;
1300
1301         count = 0;
1302         intel_engine_pm_get(engine);
1303         do {
1304                 struct i915_request *rq;
1305
1306                 rq = i915_request_create(engine->kernel_context);
1307                 if (IS_ERR(rq)) {
1308                         err = PTR_ERR(rq);
1309                         break;
1310                 }
1311
1312                 i915_request_get(rq);
1313                 i915_request_add(rq);
1314
1315                 err = 0;
1316                 if (i915_request_wait(rq, 0, HZ / 5) < 0)
1317                         err = -ETIME;
1318                 i915_request_put(rq);
1319                 if (err)
1320                         break;
1321
1322                 count++;
1323         } while (!__igt_timeout(end_time, NULL));
1324         intel_engine_pm_put(engine);
1325
1326         pr_info("%s: %lu request + sync\n", engine->name, count);
1327         return err;
1328 }
1329
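/* Submit requests back to back without waiting, to keep the engine saturated. */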
1330 static int __live_parallel_engineN(void *arg)
1331 {
1332         struct intel_engine_cs *engine = arg;
1333         IGT_TIMEOUT(end_time);
1334         unsigned long count;
1335         int err = 0;
1336
1337         count = 0;
1338         intel_engine_pm_get(engine);
1339         do {
1340                 struct i915_request *rq;
1341
1342                 rq = i915_request_create(engine->kernel_context);
1343                 if (IS_ERR(rq)) {
1344                         err = PTR_ERR(rq);
1345                         break;
1346                 }
1347
1348                 i915_request_add(rq);
1349                 count++;
1350         } while (!__igt_timeout(end_time, NULL));
1351         intel_engine_pm_put(engine);
1352
1353         pr_info("%s: %lu requests\n", engine->name, count);
1354         return err;
1355 }
1356
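/*
 * Simple barrier built on i915->selftest.counter: each thread decrements
 * the counter, and the last one to arrive wakes everyone blocked in
 * wait_for_all().
 */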
1357 static bool wake_all(struct drm_i915_private *i915)
1358 {
1359         if (atomic_dec_and_test(&i915->selftest.counter)) {
1360                 wake_up_var(&i915->selftest.counter);
1361                 return true;
1362         }
1363
1364         return false;
1365 }
1366
1367 static int wait_for_all(struct drm_i915_private *i915)
1368 {
1369         if (wake_all(i915))
1370                 return 0;
1371
1372         if (wait_var_event_timeout(&i915->selftest.counter,
1373                                    !atomic_read(&i915->selftest.counter),
1374                                    i915_selftest.timeout_jiffies))
1375                 return 0;
1376
1377         return -ETIME;
1378 }
1379
1380 static int __live_parallel_spin(void *arg)
1381 {
1382         struct intel_engine_cs *engine = arg;
1383         struct igt_spinner spin;
1384         struct i915_request *rq;
1385         int err = 0;
1386
1387         /*
1388          * Create a spinner running for eternity on each engine. If a second
1389          * spinner is incorrectly placed on the same engine, it will not be
1390          * able to start in time.
1391          */
1392
1393         if (igt_spinner_init(&spin, engine->gt)) {
1394                 wake_all(engine->i915);
1395                 return -ENOMEM;
1396         }
1397
1398         intel_engine_pm_get(engine);
1399         rq = igt_spinner_create_request(&spin,
1400                                         engine->kernel_context,
1401                                         MI_NOOP); /* no preemption */
1402         intel_engine_pm_put(engine);
1403         if (IS_ERR(rq)) {
1404                 err = PTR_ERR(rq);
1405                 if (err == -ENODEV)
1406                         err = 0;
1407                 wake_all(engine->i915);
1408                 goto out_spin;
1409         }
1410
1411         i915_request_get(rq);
1412         i915_request_add(rq);
1413         if (igt_wait_for_spinner(&spin, rq)) {
1414                 /* Occupy this engine for the whole test */
1415                 err = wait_for_all(engine->i915);
1416         } else {
1417                 pr_err("Failed to start spinner on %s\n", engine->name);
1418                 err = -EINVAL;
1419         }
1420         igt_spinner_end(&spin);
1421
1422         if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0)
1423                 err = -EIO;
1424         i915_request_put(rq);
1425
1426 out_spin:
1427         igt_spinner_fini(&spin);
1428         return err;
1429 }
1430
1431 static int live_parallel_engines(void *arg)
1432 {
1433         struct drm_i915_private *i915 = arg;
1434         static int (* const func[])(void *arg) = {
1435                 __live_parallel_engine1,
1436                 __live_parallel_engineN,
1437                 __live_parallel_spin,
1438                 NULL,
1439         };
1440         const unsigned int nengines = num_uabi_engines(i915);
1441         struct intel_engine_cs *engine;
1442         int (* const *fn)(void *arg);
1443         struct task_struct **tsk;
1444         int err = 0;
1445
1446         /*
1447          * Check we can submit requests to all engines concurrently. This
1448          * tests that we load up the system maximally.
1449          */
1450
1451         tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
1452         if (!tsk)
1453                 return -ENOMEM;
1454
1455         for (fn = func; !err && *fn; fn++) {
1456                 char name[KSYM_NAME_LEN];
1457                 struct igt_live_test t;
1458                 unsigned int idx;
1459
1460                 snprintf(name, sizeof(name), "%ps", *fn);
1461                 err = igt_live_test_begin(&t, i915, __func__, name);
1462                 if (err)
1463                         break;
1464
1465                 atomic_set(&i915->selftest.counter, nengines);
1466
1467                 idx = 0;
1468                 for_each_uabi_engine(engine, i915) {
1469                         tsk[idx] = kthread_run(*fn, engine,
1470                                                "igt/parallel:%s",
1471                                                engine->name);
1472                         if (IS_ERR(tsk[idx])) {
1473                                 err = PTR_ERR(tsk[idx]);
1474                                 break;
1475                         }
1476                         get_task_struct(tsk[idx++]);
1477                 }
1478
1479                 yield(); /* start all threads before we kthread_stop() */
1480
1481                 idx = 0;
1482                 for_each_uabi_engine(engine, i915) {
1483                         int status;
1484
1485                         if (IS_ERR(tsk[idx]))
1486                                 break;
1487
1488                         status = kthread_stop(tsk[idx]);
1489                         if (status && !err)
1490                                 err = status;
1491
1492                         put_task_struct(tsk[idx++]);
1493                 }
1494
1495                 if (igt_live_test_end(&t))
1496                         err = -EIO;
1497         }
1498
1499         kfree(tsk);
1500         return err;
1501 }
1502
1503 static int
1504 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1505 {
1506         struct i915_request *rq;
1507         int ret;
1508
1509         /*
1510          * Before execlists, all contexts share the same ringbuffer. With
1511          * execlists, each context/engine has a separate ringbuffer and
1512          * for the purposes of this test, inexhaustible.
1513          *
1514          * For the global ringbuffer though, we have to be very careful
1515          * that we do not wrap while preventing the execution of requests
1516          * with an unsignaled fence.
1517          */
1518         if (HAS_EXECLISTS(ctx->i915))
1519                 return INT_MAX;
1520
1521         rq = igt_request_alloc(ctx, engine);
1522         if (IS_ERR(rq)) {
1523                 ret = PTR_ERR(rq);
1524         } else {
1525                 int sz;
1526
1527                 ret = rq->ring->size - rq->reserved_space;
1528                 i915_request_add(rq);
1529
1530                 sz = rq->ring->emit - rq->head;
1531                 if (sz < 0)
1532                         sz += rq->ring->size;
1533                 ret /= sz;
1534                 ret /= 2; /* leave half spare, in case of emergency! */
1535         }
1536
1537         return ret;
1538 }
1539
1540 static int live_breadcrumbs_smoketest(void *arg)
1541 {
1542         struct drm_i915_private *i915 = arg;
1543         const unsigned int nengines = num_uabi_engines(i915);
1544         const unsigned int ncpus = num_online_cpus();
1545         unsigned long num_waits, num_fences;
1546         struct intel_engine_cs *engine;
1547         struct task_struct **threads;
1548         struct igt_live_test live;
1549         intel_wakeref_t wakeref;
1550         struct smoketest *smoke;
1551         unsigned int n, idx;
1552         struct file *file;
1553         int ret = 0;
1554
1555         /*
1556          * Smoketest our breadcrumb/signal handling for requests across multiple
1557          * threads. A very simple test to only catch the most egregious of bugs.
1558          * See __igt_breadcrumbs_smoketest();
1559          *
1560          * On real hardware this time.
1561          */
1562
1563         wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1564
1565         file = mock_file(i915);
1566         if (IS_ERR(file)) {
1567                 ret = PTR_ERR(file);
1568                 goto out_rpm;
1569         }
1570
1571         smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1572         if (!smoke) {
1573                 ret = -ENOMEM;
1574                 goto out_file;
1575         }
1576
1577         threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1578         if (!threads) {
1579                 ret = -ENOMEM;
1580                 goto out_smoke;
1581         }
1582
1583         smoke[0].request_alloc = __live_request_alloc;
1584         smoke[0].ncontexts = 64;
1585         smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1586                                     sizeof(*smoke[0].contexts),
1587                                     GFP_KERNEL);
1588         if (!smoke[0].contexts) {
1589                 ret = -ENOMEM;
1590                 goto out_threads;
1591         }
1592
1593         for (n = 0; n < smoke[0].ncontexts; n++) {
1594                 smoke[0].contexts[n] = live_context(i915, file);
1595                 if (IS_ERR(smoke[0].contexts[n])) {
1596                         ret = PTR_ERR(smoke[0].contexts[n]);
1597                         goto out_contexts;
1598                 }
1599         }
1600
1601         ret = igt_live_test_begin(&live, i915, __func__, "");
1602         if (ret)
1603                 goto out_contexts;
1604
1605         idx = 0;
1606         for_each_uabi_engine(engine, i915) {
1607                 smoke[idx] = smoke[0];
1608                 smoke[idx].engine = engine;
1609                 smoke[idx].max_batch =
1610                         max_batches(smoke[0].contexts[0], engine);
1611                 if (smoke[idx].max_batch < 0) {
1612                         ret = smoke[idx].max_batch;
1613                         goto out_flush;
1614                 }
1615                 /* One ring interleaved between requests from all cpus */
1616                 smoke[idx].max_batch /= num_online_cpus() + 1;
1617                 pr_debug("Limiting batches to %d requests on %s\n",
1618                          smoke[idx].max_batch, engine->name);
1619
1620                 for (n = 0; n < ncpus; n++) {
1621                         struct task_struct *tsk;
1622
1623                         tsk = kthread_run(__igt_breadcrumbs_smoketest,
1624                                           &smoke[idx], "igt/%d.%d", idx, n);
1625                         if (IS_ERR(tsk)) {
1626                                 ret = PTR_ERR(tsk);
1627                                 goto out_flush;
1628                         }
1629
1630                         get_task_struct(tsk);
1631                         threads[idx * ncpus + n] = tsk;
1632                 }
1633
1634                 idx++;
1635         }
1636
1637         yield(); /* start all threads before we begin */
1638         msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1639
1640 out_flush:
1641         idx = 0;
1642         num_waits = 0;
1643         num_fences = 0;
1644         for_each_uabi_engine(engine, i915) {
1645                 for (n = 0; n < ncpus; n++) {
1646                         struct task_struct *tsk = threads[idx * ncpus + n];
1647                         int err;
1648
1649                         if (!tsk)
1650                                 continue;
1651
1652                         err = kthread_stop(tsk);
1653                         if (err < 0 && !ret)
1654                                 ret = err;
1655
1656                         put_task_struct(tsk);
1657                 }
1658
1659                 num_waits += atomic_long_read(&smoke[idx].num_waits);
1660                 num_fences += atomic_long_read(&smoke[idx].num_fences);
1661                 idx++;
1662         }
1663         pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1664                 num_waits, num_fences, idx, ncpus);
1665
1666         ret = igt_live_test_end(&live) ?: ret;
1667 out_contexts:
1668         kfree(smoke[0].contexts);
1669 out_threads:
1670         kfree(threads);
1671 out_smoke:
1672         kfree(smoke);
1673 out_file:
1674         fput(file);
1675 out_rpm:
1676         intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1677
1678         return ret;
1679 }
1680
1681 int i915_request_live_selftests(struct drm_i915_private *i915)
1682 {
1683         static const struct i915_subtest tests[] = {
1684                 SUBTEST(live_nop_request),
1685                 SUBTEST(live_all_engines),
1686                 SUBTEST(live_sequential_engines),
1687                 SUBTEST(live_parallel_engines),
1688                 SUBTEST(live_empty_request),
1689                 SUBTEST(live_cancel_request),
1690                 SUBTEST(live_breadcrumbs_smoketest),
1691         };
1692
1693         if (intel_gt_is_wedged(&i915->gt))
1694                 return 0;
1695
1696         return i915_subtests(tests, i915);
1697 }
1698
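/*
 * Helper used by the perf measurements below: queue a request on the
 * engine's kernel context that waits upon ce's most recent request, wait
 * for it synchronously (flagging -ETIME if it does not complete within
 * half a second and no earlier error is pending), then flush submission
 * until the engine reports idle, presumably so that the busy/runtime
 * accounting sampled around each phase has settled.
 */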
1699 static int switch_to_kernel_sync(struct intel_context *ce, int err)
1700 {
1701         struct i915_request *rq;
1702         struct dma_fence *fence;
1703
1704         rq = intel_engine_create_kernel_request(ce->engine);
1705         if (IS_ERR(rq))
1706                 return PTR_ERR(rq);
1707
1708         fence = i915_active_fence_get(&ce->timeline->last_request);
1709         if (fence) {
1710                 i915_request_await_dma_fence(rq, fence);
1711                 dma_fence_put(fence);
1712         }
1713
1714         rq = i915_request_get(rq);
1715         i915_request_add(rq);
1716         if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1717                 err = -ETIME;
1718         i915_request_put(rq);
1719
1720         while (!err && !intel_engine_is_idle(ce->engine))
1721                 intel_engine_flush_submission(ce->engine);
1722
1723         return err;
1724 }
1725
1726 struct perf_stats {
1727         struct intel_engine_cs *engine;
1728         unsigned long count;
1729         ktime_t time;
1730         ktime_t busy;
1731         u64 runtime;
1732 };
1733
1734 struct perf_series {
1735         struct drm_i915_private *i915;
1736         unsigned int nengines;
1737         struct intel_context *ce[];
1738 };
1739
1740 static int cmp_u32(const void *A, const void *B)
1741 {
1742         const u32 *a = A, *b = B;
1743
1744         return *a - *b;
1745 }
1746
1747 static u32 trifilter(u32 *a)
1748 {
1749         u64 sum;
1750
1751 #define TF_COUNT 5
1752         sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1753
1754         sum = mul_u32_u32(a[2], 2);
1755         sum += a[1];
1756         sum += a[3];
1757
1758         GEM_BUG_ON(sum > U32_MAX);
1759         return sum;
1760 #define TF_BIAS 2
1761 }
1762
1763 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1764 {
1765         u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
1766
1767         return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1768 }
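
/*
 * Worked example with made-up samples, for illustration only: given
 * elapsed[] = { 5, 3, 9, 4, 100 }, trifilter() sorts to { 3, 4, 5, 9, 100 }
 * and returns 4 + 2 * 5 + 9 = 23, i.e. four times a median-weighted
 * estimate with the two outliers discarded. The >> TF_BIAS applied when
 * reporting cycles, and the DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS) above,
 * undo that factor of four.
 */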
1769
1770 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1771 {
1772         *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1773         *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1774         *cs++ = offset;
1775         *cs++ = 0;
1776
1777         return cs;
1778 }
1779
1780 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1781 {
1782         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1783         *cs++ = offset;
1784         *cs++ = 0;
1785         *cs++ = value;
1786
1787         return cs;
1788 }
1789
1790 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1791 {
1792         *cs++ = MI_SEMAPHORE_WAIT |
1793                 MI_SEMAPHORE_GLOBAL_GTT |
1794                 MI_SEMAPHORE_POLL |
1795                 mode;
1796         *cs++ = value;
1797         *cs++ = offset;
1798         *cs++ = 0;
1799
1800         return cs;
1801 }
1802
1803 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1804 {
1805         return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1806 }
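
/*
 * With MI_SEMAPHORE_POLL set, the command streamer repeatedly re-reads the
 * dword at the given global-GTT address and compares it with the inline
 * value using the requested mode; MI_SEMAPHORE_SAD_EQ_SDD releases the
 * wait once *offset == value. The CPU side of these handshakes is
 * semaphore_set() below.
 */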
1807
1808 static void semaphore_set(u32 *sema, u32 value)
1809 {
1810         WRITE_ONCE(*sema, value);
1811         wmb(); /* flush the update to the cache, and beyond */
1812 }
1813
1814 static u32 *hwsp_scratch(const struct intel_context *ce)
1815 {
1816         return memset32(ce->engine->status_page.addr + 1000, 0, 21);
1817 }
1818
1819 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
1820 {
1821         return (i915_ggtt_offset(ce->engine->status_page.vma) +
1822                 offset_in_page(dw));
1823 }
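
/*
 * hwsp_scratch() hands back 21 zeroed dwords at a fixed offset within the
 * engine's status page (HWSP), and hwsp_offset() converts a CPU pointer
 * into that page to the global-GTT address used by the emitted commands.
 * 21 dwords leaves room for TF_COUNT + 1 samples even in the double-width
 * (2 * i) layouts used by the context-switch and preemption tests below.
 */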
1824
1825 static int measure_semaphore_response(struct intel_context *ce)
1826 {
1827         u32 *sema = hwsp_scratch(ce);
1828         const u32 offset = hwsp_offset(ce, sema);
1829         u32 elapsed[TF_COUNT], cycles;
1830         struct i915_request *rq;
1831         u32 *cs;
1832         int err;
1833         int i;
1834
1835         /*
1836          * Measure how many cycles it takes for the HW to detect the change
1837          * in a semaphore value.
1838          *
1839          *    A: read CS_TIMESTAMP from CPU
1840          *    poke semaphore
1841          *    B: read CS_TIMESTAMP on GPU
1842          *
1843          * Semaphore latency: B - A
1844          */
1845
1846         semaphore_set(sema, -1);
1847
1848         rq = i915_request_create(ce);
1849         if (IS_ERR(rq))
1850                 return PTR_ERR(rq);
1851
1852         cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
1853         if (IS_ERR(cs)) {
1854                 i915_request_add(rq);
1855                 err = PTR_ERR(cs);
1856                 goto err;
1857         }
1858
1859         cs = emit_store_dw(cs, offset, 0);
1860         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1861                 cs = emit_semaphore_poll_until(cs, offset, i);
1862                 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1863                 cs = emit_store_dw(cs, offset, 0);
1864         }
1865
1866         intel_ring_advance(rq, cs);
1867         i915_request_add(rq);
1868
1869         if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1870                 err = -EIO;
1871                 goto err;
1872         }
1873
1874         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1875                 preempt_disable();
1876                 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1877                 semaphore_set(sema, i);
1878                 preempt_enable();
1879
1880                 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1881                         err = -EIO;
1882                         goto err;
1883                 }
1884
1885                 elapsed[i - 1] = sema[i] - cycles;
1886         }
1887
1888         cycles = trifilter(elapsed);
1889         pr_info("%s: semaphore response %d cycles, %lluns\n",
1890                 ce->engine->name, cycles >> TF_BIAS,
1891                 cycles_to_ns(ce->engine, cycles));
1892
1893         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1894
1895 err:
1896         intel_gt_set_wedged(ce->engine->gt);
1897         return err;
1898 }
1899
1900 static int measure_idle_dispatch(struct intel_context *ce)
1901 {
1902         u32 *sema = hwsp_scratch(ce);
1903         const u32 offset = hwsp_offset(ce, sema);
1904         u32 elapsed[TF_COUNT], cycles;
1905         u32 *cs;
1906         int err;
1907         int i;
1908
1909         /*
1910          * Measure how long it takes for us to submit a request while the
1911          * engine is idle but still resting in our context.
1912          *
1913          *    A: read CS_TIMESTAMP from CPU
1914          *    submit request
1915          *    B: read CS_TIMESTAMP on GPU
1916          *
1917          * Submission latency: B - A
1918          */
1919
1920         for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
1921                 struct i915_request *rq;
1922
1923                 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1924                 if (err)
1925                         return err;
1926
1927                 rq = i915_request_create(ce);
1928                 if (IS_ERR(rq)) {
1929                         err = PTR_ERR(rq);
1930                         goto err;
1931                 }
1932
1933                 cs = intel_ring_begin(rq, 4);
1934                 if (IS_ERR(cs)) {
1935                         i915_request_add(rq);
1936                         err = PTR_ERR(cs);
1937                         goto err;
1938                 }
1939
1940                 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1941
1942                 intel_ring_advance(rq, cs);
1943
1944                 preempt_disable();
1945                 local_bh_disable();
1946                 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1947                 i915_request_add(rq);
1948                 local_bh_enable();
1949                 preempt_enable();
1950         }
1951
1952         err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1953         if (err)
1954                 goto err;
1955
1956         for (i = 0; i < ARRAY_SIZE(elapsed); i++)
1957                 elapsed[i] = sema[i] - elapsed[i];
1958
1959         cycles = trifilter(elapsed);
1960         pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
1961                 ce->engine->name, cycles >> TF_BIAS,
1962                 cycles_to_ns(ce->engine, cycles));
1963
1964         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1965
1966 err:
1967         intel_gt_set_wedged(ce->engine->gt);
1968         return err;
1969 }
1970
1971 static int measure_busy_dispatch(struct intel_context *ce)
1972 {
1973         u32 *sema = hwsp_scratch(ce);
1974         const u32 offset = hwsp_offset(ce, sema);
1975         u32 elapsed[TF_COUNT + 1], cycles;
1976         u32 *cs;
1977         int err;
1978         int i;
1979
1980         /*
1981          * Measure how long it takes for us to submit a request while the
1982          * engine is busy, polling on a semaphore in our context. With
1983          * direct submission, this will include the cost of a lite restore.
1984          *
1985          *    A: read CS_TIMESTAMP from CPU
1986          *    submit request
1987          *    B: read CS_TIMESTAMP on GPU
1988          *
1989          * Submission latency: B - A
1990          */
1991
1992         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1993                 struct i915_request *rq;
1994
1995                 rq = i915_request_create(ce);
1996                 if (IS_ERR(rq)) {
1997                         err = PTR_ERR(rq);
1998                         goto err;
1999                 }
2000
2001                 cs = intel_ring_begin(rq, 12);
2002                 if (IS_ERR(cs)) {
2003                         i915_request_add(rq);
2004                         err = PTR_ERR(cs);
2005                         goto err;
2006                 }
2007
2008                 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2009                 cs = emit_semaphore_poll_until(cs, offset, i);
2010                 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2011
2012                 intel_ring_advance(rq, cs);
2013
2014                 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
2015                         err = -EIO;
2016                         goto err;
2017                 }
2018
2019                 preempt_disable();
2020                 local_bh_disable();
2021                 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2022                 i915_request_add(rq);
2023                 local_bh_enable();
2024                 semaphore_set(sema, i - 1);
2025                 preempt_enable();
2026         }
2027
2028         wait_for(READ_ONCE(sema[i - 1]), 500);
2029         semaphore_set(sema, i - 1);
2030
2031         for (i = 1; i <= TF_COUNT; i++) {
2032                 GEM_BUG_ON(sema[i] == -1);
2033                 elapsed[i - 1] = sema[i] - elapsed[i];
2034         }
2035
2036         cycles = trifilter(elapsed);
2037         pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
2038                 ce->engine->name, cycles >> TF_BIAS,
2039                 cycles_to_ns(ce->engine, cycles));
2040
2041         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2042
2043 err:
2044         intel_gt_set_wedged(ce->engine->gt);
2045         return err;
2046 }
2047
2048 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
2049 {
2050         const u32 offset =
2051                 i915_ggtt_offset(engine->status_page.vma) +
2052                 offset_in_page(sema);
2053         struct i915_request *rq;
2054         u32 *cs;
2055
2056         rq = i915_request_create(engine->kernel_context);
2057         if (IS_ERR(rq))
2058                 return PTR_ERR(rq);
2059
2060         cs = intel_ring_begin(rq, 4);
2061         if (IS_ERR(cs)) {
2062                 i915_request_add(rq);
2063                 return PTR_ERR(cs);
2064         }
2065
2066         cs = emit_semaphore_poll(cs, mode, value, offset);
2067
2068         intel_ring_advance(rq, cs);
2069         i915_request_add(rq);
2070
2071         return 0;
2072 }
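
/*
 * plug() occupies the engine with a kernel-context request spinning on the
 * scratch semaphore, so that the measurement requests submitted afterwards
 * queue up behind it and then run back-to-back once the CPU releases the
 * semaphore with semaphore_set().
 */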
2073
2074 static int measure_inter_request(struct intel_context *ce)
2075 {
2076         u32 *sema = hwsp_scratch(ce);
2077         const u32 offset = hwsp_offset(ce, sema);
2078         u32 elapsed[TF_COUNT + 1], cycles;
2079         struct i915_sw_fence *submit;
2080         int i, err;
2081
2082         /*
2083          * Measure how long it takes to advance from one request into the
2084          * next. Between each request we flush the GPU caches to memory,
2085          * update the breadcrumbs, and then invalidate those caches.
2086          * We queue up all the requests to be submitted in one batch so
2087          * it should be one set of contiguous measurements.
2088          *
2089          *    A: read CS_TIMESTAMP on GPU
2090          *    advance request
2091          *    B: read CS_TIMESTAMP on GPU
2092          *
2093          * Request latency: B - A
2094          */
2095
2096         err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2097         if (err)
2098                 return err;
2099
2100         submit = heap_fence_create(GFP_KERNEL);
2101         if (!submit) {
2102                 semaphore_set(sema, 1);
2103                 return -ENOMEM;
2104         }
2105
2106         intel_engine_flush_submission(ce->engine);
2107         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2108                 struct i915_request *rq;
2109                 u32 *cs;
2110
2111                 rq = i915_request_create(ce);
2112                 if (IS_ERR(rq)) {
2113                         err = PTR_ERR(rq);
2114                         goto err_submit;
2115                 }
2116
2117                 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
2118                                                        submit,
2119                                                        GFP_KERNEL);
2120                 if (err < 0) {
2121                         i915_request_add(rq);
2122                         goto err_submit;
2123                 }
2124
2125                 cs = intel_ring_begin(rq, 4);
2126                 if (IS_ERR(cs)) {
2127                         i915_request_add(rq);
2128                         err = PTR_ERR(cs);
2129                         goto err_submit;
2130                 }
2131
2132                 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2133
2134                 intel_ring_advance(rq, cs);
2135                 i915_request_add(rq);
2136         }
2137         i915_sw_fence_commit(submit);
2138         intel_engine_flush_submission(ce->engine);
2139         heap_fence_put(submit);
2140
2141         semaphore_set(sema, 1);
2142         err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2143         if (err)
2144                 goto err;
2145
2146         for (i = 1; i <= TF_COUNT; i++)
2147                 elapsed[i - 1] = sema[i + 1] - sema[i];
2148
2149         cycles = trifilter(elapsed);
2150         pr_info("%s: inter-request latency %d cycles, %lluns\n",
2151                 ce->engine->name, cycles >> TF_BIAS,
2152                 cycles_to_ns(ce->engine, cycles));
2153
2154         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2155
2156 err_submit:
2157         i915_sw_fence_commit(submit);
2158         heap_fence_put(submit);
2159         semaphore_set(sema, 1);
2160 err:
2161         intel_gt_set_wedged(ce->engine->gt);
2162         return err;
2163 }
2164
2165 static int measure_context_switch(struct intel_context *ce)
2166 {
2167         u32 *sema = hwsp_scratch(ce);
2168         const u32 offset = hwsp_offset(ce, sema);
2169         struct i915_request *fence = NULL;
2170         u32 elapsed[TF_COUNT + 1], cycles;
2171         int i, j, err;
2172         u32 *cs;
2173
2174         /*
2175          * Measure how long it takes to advance from one request in one
2176          * context to a request in another context. This allows us to
2177          * measure how long the context save/restore takes, along with all
2178          * the inter-context setup we require.
2179          *
2180          *    A: read CS_TIMESTAMP on GPU
2181          *    switch context
2182          *    B: read CS_TIMESTAMP on GPU
2183          *
2184          * Context switch latency: B - A
2185          */
2186
2187         err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2188         if (err)
2189                 return err;
2190
2191         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2192                 struct intel_context *arr[] = {
2193                         ce, ce->engine->kernel_context
2194                 };
2195                 u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
2196
2197                 for (j = 0; j < ARRAY_SIZE(arr); j++) {
2198                         struct i915_request *rq;
2199
2200                         rq = i915_request_create(arr[j]);
2201                         if (IS_ERR(rq)) {
2202                                 err = PTR_ERR(rq);
2203                                 goto err_fence;
2204                         }
2205
2206                         if (fence) {
2207                                 err = i915_request_await_dma_fence(rq,
2208                                                                    &fence->fence);
2209                                 if (err) {
2210                                         i915_request_add(rq);
2211                                         goto err_fence;
2212                                 }
2213                         }
2214
2215                         cs = intel_ring_begin(rq, 4);
2216                         if (IS_ERR(cs)) {
2217                                 i915_request_add(rq);
2218                                 err = PTR_ERR(cs);
2219                                 goto err_fence;
2220                         }
2221
2222                         cs = emit_timestamp_store(cs, ce, addr);
2223                         addr += sizeof(u32);
2224
2225                         intel_ring_advance(rq, cs);
2226
2227                         i915_request_put(fence);
2228                         fence = i915_request_get(rq);
2229
2230                         i915_request_add(rq);
2231                 }
2232         }
2233         i915_request_put(fence);
2234         intel_engine_flush_submission(ce->engine);
2235
2236         semaphore_set(sema, 1);
2237         err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2238         if (err)
2239                 goto err;
2240
2241         for (i = 1; i <= TF_COUNT; i++)
2242                 elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2243
2244         cycles = trifilter(elapsed);
2245         pr_info("%s: context switch latency %d cycles, %lluns\n",
2246                 ce->engine->name, cycles >> TF_BIAS,
2247                 cycles_to_ns(ce->engine, cycles));
2248
2249         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2250
2251 err_fence:
2252         i915_request_put(fence);
2253         semaphore_set(sema, 1);
2254 err:
2255         intel_gt_set_wedged(ce->engine->gt);
2256         return err;
2257 }
2258
2259 static int measure_preemption(struct intel_context *ce)
2260 {
2261         u32 *sema = hwsp_scratch(ce);
2262         const u32 offset = hwsp_offset(ce, sema);
2263         u32 elapsed[TF_COUNT], cycles;
2264         u32 *cs;
2265         int err;
2266         int i;
2267
2268         /*
2269          * We measure two latencies while triggering preemption. The first
2270          * latency is how long it takes for us to submit a preempting request.
2271          * The second latency is how long it takes for us to return from the
2272          * preemption back to the original context.
2273          *
2274          *    A: read CS_TIMESTAMP from CPU
2275          *    submit preemption
2276          *    B: read CS_TIMESTAMP on GPU (in preempting context)
2277          *    context switch
2278          *    C: read CS_TIMESTAMP on GPU (in original context)
2279          *
2280          * Preemption dispatch latency: B - A
2281          * Preemption switch latency: C - B
2282          */
2283
2284         if (!intel_engine_has_preemption(ce->engine))
2285                 return 0;
2286
2287         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2288                 u32 addr = offset + 2 * i * sizeof(u32);
2289                 struct i915_request *rq;
2290
2291                 rq = i915_request_create(ce);
2292                 if (IS_ERR(rq)) {
2293                         err = PTR_ERR(rq);
2294                         goto err;
2295                 }
2296
2297                 cs = intel_ring_begin(rq, 12);
2298                 if (IS_ERR(cs)) {
2299                         i915_request_add(rq);
2300                         err = PTR_ERR(cs);
2301                         goto err;
2302                 }
2303
2304                 cs = emit_store_dw(cs, addr, -1);
2305                 cs = emit_semaphore_poll_until(cs, offset, i);
2306                 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2307
2308                 intel_ring_advance(rq, cs);
2309                 i915_request_add(rq);
2310
2311                 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2312                         err = -EIO;
2313                         goto err;
2314                 }
2315
2316                 rq = i915_request_create(ce->engine->kernel_context);
2317                 if (IS_ERR(rq)) {
2318                         err = PTR_ERR(rq);
2319                         goto err;
2320                 }
2321
2322                 cs = intel_ring_begin(rq, 8);
2323                 if (IS_ERR(cs)) {
2324                         i915_request_add(rq);
2325                         err = PTR_ERR(cs);
2326                         goto err;
2327                 }
2328
2329                 cs = emit_timestamp_store(cs, ce, addr);
2330                 cs = emit_store_dw(cs, offset, i);
2331
2332                 intel_ring_advance(rq, cs);
2333                 rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2334
2335                 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2336                 i915_request_add(rq);
2337         }
2338
2339         if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2340                 err = -EIO;
2341                 goto err;
2342         }
2343
2344         for (i = 1; i <= TF_COUNT; i++)
2345                 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2346
2347         cycles = trifilter(elapsed);
2348         pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2349                 ce->engine->name, cycles >> TF_BIAS,
2350                 cycles_to_ns(ce->engine, cycles));
2351
2352         for (i = 1; i <= TF_COUNT; i++)
2353                 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2354
2355         cycles = trifilter(elapsed);
2356         pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2357                 ce->engine->name, cycles >> TF_BIAS,
2358                 cycles_to_ns(ce->engine, cycles));
2359
2360         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2361
2362 err:
2363         intel_gt_set_wedged(ce->engine->gt);
2364         return err;
2365 }
2366
2367 struct signal_cb {
2368         struct dma_fence_cb base;
2369         bool seen;
2370 };
2371
2372 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2373 {
2374         struct signal_cb *s = container_of(cb, typeof(*s), base);
2375
2376         smp_store_mb(s->seen, true); /* be safe, be strong */
2377 }
2378
2379 static int measure_completion(struct intel_context *ce)
2380 {
2381         u32 *sema = hwsp_scratch(ce);
2382         const u32 offset = hwsp_offset(ce, sema);
2383         u32 elapsed[TF_COUNT], cycles;
2384         u32 *cs;
2385         int err;
2386         int i;
2387
2388         /*
2389          * Measure how long it takes for the signal (interrupt) to be
2390          * sent from the GPU and then processed by the CPU.
2391          *
2392          *    A: read CS_TIMESTAMP on GPU
2393          *    signal
2394          *    B: read CS_TIMESTAMP from CPU
2395          *
2396          * Completion latency: B - A
2397          */
2398
2399         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2400                 struct signal_cb cb = { .seen = false };
2401                 struct i915_request *rq;
2402
2403                 rq = i915_request_create(ce);
2404                 if (IS_ERR(rq)) {
2405                         err = PTR_ERR(rq);
2406                         goto err;
2407                 }
2408
2409                 cs = intel_ring_begin(rq, 12);
2410                 if (IS_ERR(cs)) {
2411                         i915_request_add(rq);
2412                         err = PTR_ERR(cs);
2413                         goto err;
2414                 }
2415
2416                 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2417                 cs = emit_semaphore_poll_until(cs, offset, i);
2418                 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2419
2420                 intel_ring_advance(rq, cs);
2421
2422                 dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2423                 i915_request_add(rq);
2424
2425                 intel_engine_flush_submission(ce->engine);
2426                 if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2427                         err = -EIO;
2428                         goto err;
2429                 }
2430
2431                 preempt_disable();
2432                 semaphore_set(sema, i);
2433                 while (!READ_ONCE(cb.seen))
2434                         cpu_relax();
2435
2436                 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2437                 preempt_enable();
2438         }
2439
2440         err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2441         if (err)
2442                 goto err;
2443
2444         for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2445                 GEM_BUG_ON(sema[i + 1] == -1);
2446                 elapsed[i] = elapsed[i] - sema[i + 1];
2447         }
2448
2449         cycles = trifilter(elapsed);
2450         pr_info("%s: completion latency %d cycles, %lluns\n",
2451                 ce->engine->name, cycles >> TF_BIAS,
2452                 cycles_to_ns(ce->engine, cycles));
2453
2454         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2455
2456 err:
2457         intel_gt_set_wedged(ce->engine->gt);
2458         return err;
2459 }
2460
2461 static void rps_pin(struct intel_gt *gt)
2462 {
2463         /* Pin the frequency to max */
2464         atomic_inc(&gt->rps.num_waiters);
2465         intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2466
2467         mutex_lock(&gt->rps.lock);
2468         intel_rps_set(&gt->rps, gt->rps.max_freq);
2469         mutex_unlock(&gt->rps.lock);
2470 }
2471
2472 static void rps_unpin(struct intel_gt *gt)
2473 {
2474         intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2475         atomic_dec(&gt->rps.num_waiters);
2476 }
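
/*
 * rps_pin()/rps_unpin() hold forcewake and an rps waiter reference while
 * requesting the maximum frequency, presumably so that the latencies
 * measured by perf_request_latency() below are not inflated by the GPU
 * ramping its clocks mid-test.
 */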
2477
2478 static int perf_request_latency(void *arg)
2479 {
2480         struct drm_i915_private *i915 = arg;
2481         struct intel_engine_cs *engine;
2482         struct pm_qos_request qos;
2483         int err = 0;
2484
2485         if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
2486                 return 0;
2487
2488         cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2489
2490         for_each_uabi_engine(engine, i915) {
2491                 struct intel_context *ce;
2492
2493                 ce = intel_context_create(engine);
2494                 if (IS_ERR(ce)) {
2495                         err = PTR_ERR(ce);
2496                         goto out;
2497                 }
2498
2499                 err = intel_context_pin(ce);
2500                 if (err) {
2501                         intel_context_put(ce);
2502                         goto out;
2503                 }
2504
2505                 st_engine_heartbeat_disable(engine);
2506                 rps_pin(engine->gt);
2507
2508                 if (err == 0)
2509                         err = measure_semaphore_response(ce);
2510                 if (err == 0)
2511                         err = measure_idle_dispatch(ce);
2512                 if (err == 0)
2513                         err = measure_busy_dispatch(ce);
2514                 if (err == 0)
2515                         err = measure_inter_request(ce);
2516                 if (err == 0)
2517                         err = measure_context_switch(ce);
2518                 if (err == 0)
2519                         err = measure_preemption(ce);
2520                 if (err == 0)
2521                         err = measure_completion(ce);
2522
2523                 rps_unpin(engine->gt);
2524                 st_engine_heartbeat_enable(engine);
2525
2526                 intel_context_unpin(ce);
2527                 intel_context_put(ce);
2528                 if (err)
2529                         goto out;
2530         }
2531
2532 out:
2533         if (igt_flush_test(i915))
2534                 err = -EIO;
2535
2536         cpu_latency_qos_remove_request(&qos);
2537         return err;
2538 }
2539
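/*
 * The s_*() runners below are driven from a single thread across every
 * engine by perf_series_engines(); the later p_*() runners are driven as
 * one kthread per engine by perf_parallel_engines(). In both families,
 * sync0 waits for each request before emitting the next, sync1 waits for
 * the previous request (keeping one in flight), and many never waits,
 * submitting as fast as possible until the timeout expires.
 */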
2540 static int s_sync0(void *arg)
2541 {
2542         struct perf_series *ps = arg;
2543         IGT_TIMEOUT(end_time);
2544         unsigned int idx = 0;
2545         int err = 0;
2546
2547         GEM_BUG_ON(!ps->nengines);
2548         do {
2549                 struct i915_request *rq;
2550
2551                 rq = i915_request_create(ps->ce[idx]);
2552                 if (IS_ERR(rq)) {
2553                         err = PTR_ERR(rq);
2554                         break;
2555                 }
2556
2557                 i915_request_get(rq);
2558                 i915_request_add(rq);
2559
2560                 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2561                         err = -ETIME;
2562                 i915_request_put(rq);
2563                 if (err)
2564                         break;
2565
2566                 if (++idx == ps->nengines)
2567                         idx = 0;
2568         } while (!__igt_timeout(end_time, NULL));
2569
2570         return err;
2571 }
2572
2573 static int s_sync1(void *arg)
2574 {
2575         struct perf_series *ps = arg;
2576         struct i915_request *prev = NULL;
2577         IGT_TIMEOUT(end_time);
2578         unsigned int idx = 0;
2579         int err = 0;
2580
2581         GEM_BUG_ON(!ps->nengines);
2582         do {
2583                 struct i915_request *rq;
2584
2585                 rq = i915_request_create(ps->ce[idx]);
2586                 if (IS_ERR(rq)) {
2587                         err = PTR_ERR(rq);
2588                         break;
2589                 }
2590
2591                 i915_request_get(rq);
2592                 i915_request_add(rq);
2593
2594                 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2595                         err = -ETIME;
2596                 i915_request_put(prev);
2597                 prev = rq;
2598                 if (err)
2599                         break;
2600
2601                 if (++idx == ps->nengines)
2602                         idx = 0;
2603         } while (!__igt_timeout(end_time, NULL));
2604         i915_request_put(prev);
2605
2606         return err;
2607 }
2608
2609 static int s_many(void *arg)
2610 {
2611         struct perf_series *ps = arg;
2612         IGT_TIMEOUT(end_time);
2613         unsigned int idx = 0;
2614
2615         GEM_BUG_ON(!ps->nengines);
2616         do {
2617                 struct i915_request *rq;
2618
2619                 rq = i915_request_create(ps->ce[idx]);
2620                 if (IS_ERR(rq))
2621                         return PTR_ERR(rq);
2622
2623                 i915_request_add(rq);
2624
2625                 if (++idx == ps->nengines)
2626                         idx = 0;
2627         } while (!__igt_timeout(end_time, NULL));
2628
2629         return 0;
2630 }
2631
2632 static int perf_series_engines(void *arg)
2633 {
2634         struct drm_i915_private *i915 = arg;
2635         static int (* const func[])(void *arg) = {
2636                 s_sync0,
2637                 s_sync1,
2638                 s_many,
2639                 NULL,
2640         };
2641         const unsigned int nengines = num_uabi_engines(i915);
2642         struct intel_engine_cs *engine;
2643         int (* const *fn)(void *arg);
2644         struct pm_qos_request qos;
2645         struct perf_stats *stats;
2646         struct perf_series *ps;
2647         unsigned int idx;
2648         int err = 0;
2649
2650         stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2651         if (!stats)
2652                 return -ENOMEM;
2653
2654         ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2655         if (!ps) {
2656                 kfree(stats);
2657                 return -ENOMEM;
2658         }
2659
2660         cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2661
2662         ps->i915 = i915;
2663         ps->nengines = nengines;
2664
2665         idx = 0;
2666         for_each_uabi_engine(engine, i915) {
2667                 struct intel_context *ce;
2668
2669                 ce = intel_context_create(engine);
2670                 if (IS_ERR(ce)) {
2671                         err = PTR_ERR(ce);
2672                         goto out;
2673                 }
2674
2675                 err = intel_context_pin(ce);
2676                 if (err) {
2677                         intel_context_put(ce);
2678                         goto out;
2679                 }
2680
2681                 ps->ce[idx++] = ce;
2682         }
2683         GEM_BUG_ON(idx != ps->nengines);
2684
2685         for (fn = func; *fn && !err; fn++) {
2686                 char name[KSYM_NAME_LEN];
2687                 struct igt_live_test t;
2688
2689                 snprintf(name, sizeof(name), "%ps", *fn);
2690                 err = igt_live_test_begin(&t, i915, __func__, name);
2691                 if (err)
2692                         break;
2693
2694                 for (idx = 0; idx < nengines; idx++) {
2695                         struct perf_stats *p =
2696                                 memset(&stats[idx], 0, sizeof(stats[idx]));
2697                         struct intel_context *ce = ps->ce[idx];
2698
2699                         p->engine = ps->ce[idx]->engine;
2700                         intel_engine_pm_get(p->engine);
2701
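                        /*
                         * Bias the sampled busy time by +1 so that a later
                         * check of p->busy can tell "engine stats not
                         * supported" (0) apart from a legitimate zero
                         * reading; the bias is subtracted again when the
                         * delta is taken after the run.
                         */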
2702                         if (intel_engine_supports_stats(p->engine))
2703                                 p->busy = intel_engine_get_busy_time(p->engine,
2704                                                                      &p->time) + 1;
2705                         else
2706                                 p->time = ktime_get();
2707                         p->runtime = -intel_context_get_total_runtime_ns(ce);
2708                 }
2709
2710                 err = (*fn)(ps);
2711                 if (igt_live_test_end(&t))
2712                         err = -EIO;
2713
2714                 for (idx = 0; idx < nengines; idx++) {
2715                         struct perf_stats *p = &stats[idx];
2716                         struct intel_context *ce = ps->ce[idx];
2717                         int integer, decimal;
2718                         u64 busy, dt, now;
2719
2720                         if (p->busy)
2721                                 p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2722                                                                                &now),
2723                                                     p->busy - 1);
2724                         else
2725                                 now = ktime_get();
2726                         p->time = ktime_sub(now, p->time);
2727
2728                         err = switch_to_kernel_sync(ce, err);
2729                         p->runtime += intel_context_get_total_runtime_ns(ce);
2730                         intel_engine_pm_put(p->engine);
2731
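                        /*
                         * Report busyness as a fixed-point percentage, e.g.
                         * (made-up numbers) 333ms busy over a 1000ms wall
                         * clock: busy = 100 * 333e6, integer = 33,
                         * remainder 3e8, decimal = 30, printed as "33.30%".
                         */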
2732                         busy = 100 * ktime_to_ns(p->busy);
2733                         dt = ktime_to_ns(p->time);
2734                         if (dt) {
2735                                 integer = div64_u64(busy, dt);
2736                                 busy -= integer * dt;
2737                                 decimal = div64_u64(100 * busy, dt);
2738                         } else {
2739                                 integer = 0;
2740                                 decimal = 0;
2741                         }
2742
2743                         pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2744                                 name, p->engine->name, ce->timeline->seqno,
2745                                 integer, decimal,
2746                                 div_u64(p->runtime, 1000 * 1000),
2747                                 div_u64(ktime_to_ns(p->time), 1000 * 1000));
2748                 }
2749         }
2750
2751 out:
2752         for (idx = 0; idx < nengines; idx++) {
2753                 if (IS_ERR_OR_NULL(ps->ce[idx]))
2754                         break;
2755
2756                 intel_context_unpin(ps->ce[idx]);
2757                 intel_context_put(ps->ce[idx]);
2758         }
2759         kfree(ps);
2760
2761         cpu_latency_qos_remove_request(&qos);
2762         kfree(stats);
2763         return err;
2764 }
2765
2766 static int p_sync0(void *arg)
2767 {
2768         struct perf_stats *p = arg;
2769         struct intel_engine_cs *engine = p->engine;
2770         struct intel_context *ce;
2771         IGT_TIMEOUT(end_time);
2772         unsigned long count;
2773         bool busy;
2774         int err = 0;
2775
2776         ce = intel_context_create(engine);
2777         if (IS_ERR(ce))
2778                 return PTR_ERR(ce);
2779
2780         err = intel_context_pin(ce);
2781         if (err) {
2782                 intel_context_put(ce);
2783                 return err;
2784         }
2785
2786         if (intel_engine_supports_stats(engine)) {
2787                 p->busy = intel_engine_get_busy_time(engine, &p->time);
2788                 busy = true;
2789         } else {
2790                 p->time = ktime_get();
2791                 busy = false;
2792         }
2793
2794         count = 0;
2795         do {
2796                 struct i915_request *rq;
2797
2798                 rq = i915_request_create(ce);
2799                 if (IS_ERR(rq)) {
2800                         err = PTR_ERR(rq);
2801                         break;
2802                 }
2803
2804                 i915_request_get(rq);
2805                 i915_request_add(rq);
2806
2807                 err = 0;
2808                 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2809                         err = -ETIME;
2810                 i915_request_put(rq);
2811                 if (err)
2812                         break;
2813
2814                 count++;
2815         } while (!__igt_timeout(end_time, NULL));
2816
2817         if (busy) {
2818                 ktime_t now;
2819
2820                 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2821                                     p->busy);
2822                 p->time = ktime_sub(now, p->time);
2823         } else {
2824                 p->time = ktime_sub(ktime_get(), p->time);
2825         }
2826
2827         err = switch_to_kernel_sync(ce, err);
2828         p->runtime = intel_context_get_total_runtime_ns(ce);
2829         p->count = count;
2830
2831         intel_context_unpin(ce);
2832         intel_context_put(ce);
2833         return err;
2834 }
2835
2836 static int p_sync1(void *arg)
2837 {
2838         struct perf_stats *p = arg;
2839         struct intel_engine_cs *engine = p->engine;
2840         struct i915_request *prev = NULL;
2841         struct intel_context *ce;
2842         IGT_TIMEOUT(end_time);
2843         unsigned long count;
2844         bool busy;
2845         int err = 0;
2846
2847         ce = intel_context_create(engine);
2848         if (IS_ERR(ce))
2849                 return PTR_ERR(ce);
2850
2851         err = intel_context_pin(ce);
2852         if (err) {
2853                 intel_context_put(ce);
2854                 return err;
2855         }
2856
2857         if (intel_engine_supports_stats(engine)) {
2858                 p->busy = intel_engine_get_busy_time(engine, &p->time);
2859                 busy = true;
2860         } else {
2861                 p->time = ktime_get();
2862                 busy = false;
2863         }
2864
2865         count = 0;
2866         do {
2867                 struct i915_request *rq;
2868
2869                 rq = i915_request_create(ce);
2870                 if (IS_ERR(rq)) {
2871                         err = PTR_ERR(rq);
2872                         break;
2873                 }
2874
2875                 i915_request_get(rq);
2876                 i915_request_add(rq);
2877
2878                 err = 0;
2879                 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2880                         err = -ETIME;
2881                 i915_request_put(prev);
2882                 prev = rq;
2883                 if (err)
2884                         break;
2885
2886                 count++;
2887         } while (!__igt_timeout(end_time, NULL));
2888         i915_request_put(prev);
2889
2890         if (busy) {
2891                 ktime_t now;
2892
2893                 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2894                                     p->busy);
2895                 p->time = ktime_sub(now, p->time);
2896         } else {
2897                 p->time = ktime_sub(ktime_get(), p->time);
2898         }
2899
2900         err = switch_to_kernel_sync(ce, err);
2901         p->runtime = intel_context_get_total_runtime_ns(ce);
2902         p->count = count;
2903
2904         intel_context_unpin(ce);
2905         intel_context_put(ce);
2906         return err;
2907 }
2908
2909 static int p_many(void *arg)
2910 {
2911         struct perf_stats *p = arg;
2912         struct intel_engine_cs *engine = p->engine;
2913         struct intel_context *ce;
2914         IGT_TIMEOUT(end_time);
2915         unsigned long count;
2916         int err = 0;
2917         bool busy;
2918
2919         ce = intel_context_create(engine);
2920         if (IS_ERR(ce))
2921                 return PTR_ERR(ce);
2922
2923         err = intel_context_pin(ce);
2924         if (err) {
2925                 intel_context_put(ce);
2926                 return err;
2927         }
2928
2929         if (intel_engine_supports_stats(engine)) {
2930                 p->busy = intel_engine_get_busy_time(engine, &p->time);
2931                 busy = true;
2932         } else {
2933                 p->time = ktime_get();
2934                 busy = false;
2935         }
2936
2937         count = 0;
2938         do {
2939                 struct i915_request *rq;
2940
2941                 rq = i915_request_create(ce);
2942                 if (IS_ERR(rq)) {
2943                         err = PTR_ERR(rq);
2944                         break;
2945                 }
2946
2947                 i915_request_add(rq);
2948                 count++;
2949         } while (!__igt_timeout(end_time, NULL));
2950
2951         if (busy) {
2952                 ktime_t now;
2953
2954                 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2955                                     p->busy);
2956                 p->time = ktime_sub(now, p->time);
2957         } else {
2958                 p->time = ktime_sub(ktime_get(), p->time);
2959         }
2960
2961         err = switch_to_kernel_sync(ce, err);
2962         p->runtime = intel_context_get_total_runtime_ns(ce);
2963         p->count = count;
2964
2965         intel_context_unpin(ce);
2966         intel_context_put(ce);
2967         return err;
2968 }
2969
2970 static int perf_parallel_engines(void *arg)
2971 {
2972         struct drm_i915_private *i915 = arg;
2973         static int (* const func[])(void *arg) = {
2974                 p_sync0,
2975                 p_sync1,
2976                 p_many,
2977                 NULL,
2978         };
2979         const unsigned int nengines = num_uabi_engines(i915);
2980         struct intel_engine_cs *engine;
2981         int (* const *fn)(void *arg);
2982         struct pm_qos_request qos;
2983         struct {
2984                 struct perf_stats p;
2985                 struct task_struct *tsk;
2986         } *engines;
2987         int err = 0;
2988
2989         engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
2990         if (!engines)
2991                 return -ENOMEM;
2992
2993         cpu_latency_qos_add_request(&qos, 0);
2994
2995         for (fn = func; *fn; fn++) {
2996                 char name[KSYM_NAME_LEN];
2997                 struct igt_live_test t;
2998                 unsigned int idx;
2999
3000                 snprintf(name, sizeof(name), "%ps", *fn);
3001                 err = igt_live_test_begin(&t, i915, __func__, name);
3002                 if (err)
3003                         break;
3004
3005                 atomic_set(&i915->selftest.counter, nengines);
3006
3007                 idx = 0;
3008                 for_each_uabi_engine(engine, i915) {
3009                         intel_engine_pm_get(engine);
3010
3011                         memset(&engines[idx].p, 0, sizeof(engines[idx].p));
3012                         engines[idx].p.engine = engine;
3013
3014                         engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
3015                                                        "igt:%s", engine->name);
3016                         if (IS_ERR(engines[idx].tsk)) {
3017                                 err = PTR_ERR(engines[idx].tsk);
3018                                 intel_engine_pm_put(engine);
3019                                 break;
3020                         }
3021                         get_task_struct(engines[idx++].tsk);
3022                 }
3023
3024                 yield(); /* start all threads before we kthread_stop() */
3025
3026                 idx = 0;
3027                 for_each_uabi_engine(engine, i915) {
3028                         int status;
3029
3030                         if (IS_ERR(engines[idx].tsk))
3031                                 break;
3032
3033                         status = kthread_stop(engines[idx].tsk);
3034                         if (status && !err)
3035                                 err = status;
3036
3037                         intel_engine_pm_put(engine);
3038                         put_task_struct(engines[idx++].tsk);
3039                 }
3040
3041                 if (igt_live_test_end(&t))
3042                         err = -EIO;
3043                 if (err)
3044                         break;
3045
3046                 idx = 0;
3047                 for_each_uabi_engine(engine, i915) {
3048                         struct perf_stats *p = &engines[idx].p;
3049                         u64 busy = 100 * ktime_to_ns(p->busy);
3050                         u64 dt = ktime_to_ns(p->time);
3051                         int integer, decimal;
3052
3053                         if (dt) {
3054                                 integer = div64_u64(busy, dt);
3055                                 busy -= integer * dt;
3056                                 decimal = div64_u64(100 * busy, dt);
3057                         } else {
3058                                 integer = 0;
3059                                 decimal = 0;
3060                         }
3061
3062                         GEM_BUG_ON(engine != p->engine);
3063                         pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
3064                                 name, engine->name, p->count, integer, decimal,
3065                                 div_u64(p->runtime, 1000 * 1000),
3066                                 div_u64(ktime_to_ns(p->time), 1000 * 1000));
3067                         idx++;
3068                 }
3069         }
3070
3071         cpu_latency_qos_remove_request(&qos);
3072         kfree(engines);
3073         return err;
3074 }
3075
3076 int i915_request_perf_selftests(struct drm_i915_private *i915)
3077 {
3078         static const struct i915_subtest tests[] = {
3079                 SUBTEST(perf_request_latency),
3080                 SUBTEST(perf_series_engines),
3081                 SUBTEST(perf_parallel_engines),
3082         };
3083
3084         if (intel_gt_is_wedged(&i915->gt))
3085                 return 0;
3086
3087         return i915_subtests(tests, i915);
3088 }