2 * Copyright © 2016 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 #include <linux/prime_numbers.h>
26 #include <linux/pm_qos.h>
27 #include <linux/sort.h>
29 #include "gem/i915_gem_pm.h"
30 #include "gem/selftests/mock_context.h"
32 #include "gt/intel_engine_heartbeat.h"
33 #include "gt/intel_engine_pm.h"
34 #include "gt/intel_engine_user.h"
35 #include "gt/intel_gt.h"
36 #include "gt/intel_gt_clock_utils.h"
37 #include "gt/intel_gt_requests.h"
38 #include "gt/selftest_engine_heartbeat.h"
40 #include "i915_random.h"
41 #include "i915_selftest.h"
42 #include "igt_flush_test.h"
43 #include "igt_live_test.h"
44 #include "igt_spinner.h"
45 #include "lib_sw_fence.h"
48 #include "mock_gem_device.h"
50 static unsigned int num_uabi_engines(struct drm_i915_private *i915)
52 struct intel_engine_cs *engine;
56 for_each_uabi_engine(engine, i915)
62 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
64 return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
67 static int igt_add_request(void *arg)
69 struct drm_i915_private *i915 = arg;
70 struct i915_request *request;
72 /* Basic preliminary test to create a request and let it loose! */
74 request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
78 i915_request_add(request);
83 static int igt_wait_request(void *arg)
85 const long T = HZ / 4;
86 struct drm_i915_private *i915 = arg;
87 struct i915_request *request;
90 /* Submit a request, then wait upon it */
92 request = mock_request(rcs0(i915)->kernel_context, T);
96 i915_request_get(request);
98 if (i915_request_wait(request, 0, 0) != -ETIME) {
99 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
103 if (i915_request_wait(request, 0, T) != -ETIME) {
104 pr_err("request wait succeeded (expected timeout before submit!)\n");
108 if (i915_request_completed(request)) {
109 pr_err("request completed before submit!!\n");
113 i915_request_add(request);
115 if (i915_request_wait(request, 0, 0) != -ETIME) {
116 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
120 if (i915_request_completed(request)) {
121 pr_err("request completed immediately!\n");
125 if (i915_request_wait(request, 0, T / 2) != -ETIME) {
126 pr_err("request wait succeeded (expected timeout!)\n");
130 if (i915_request_wait(request, 0, T) == -ETIME) {
131 pr_err("request wait timed out!\n");
135 if (!i915_request_completed(request)) {
136 pr_err("request not complete after waiting!\n");
140 if (i915_request_wait(request, 0, T) == -ETIME) {
141 pr_err("request wait timed out when already complete!\n");
147 i915_request_put(request);
148 mock_device_flush(i915);
152 static int igt_fence_wait(void *arg)
154 const long T = HZ / 4;
155 struct drm_i915_private *i915 = arg;
156 struct i915_request *request;
159 /* Submit a request, treat it as a fence and wait upon it */
161 request = mock_request(rcs0(i915)->kernel_context, T);
165 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
166 pr_err("fence wait success before submit (expected timeout)!\n");
170 i915_request_add(request);
172 if (dma_fence_is_signaled(&request->fence)) {
173 pr_err("fence signaled immediately!\n");
177 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
178 pr_err("fence wait success after submit (expected timeout)!\n");
182 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
183 pr_err("fence wait timed out (expected success)!\n");
187 if (!dma_fence_is_signaled(&request->fence)) {
188 pr_err("fence unsignaled after waiting!\n");
192 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
193 pr_err("fence wait timed out when complete (expected success)!\n");
199 mock_device_flush(i915);
203 static int igt_request_rewind(void *arg)
205 struct drm_i915_private *i915 = arg;
206 struct i915_request *request, *vip;
207 struct i915_gem_context *ctx[2];
208 struct intel_context *ce;
211 ctx[0] = mock_context(i915, "A");
213 ce = i915_gem_context_get_engine(ctx[0], RCS0);
214 GEM_BUG_ON(IS_ERR(ce));
215 request = mock_request(ce, 2 * HZ);
216 intel_context_put(ce);
222 i915_request_get(request);
223 i915_request_add(request);
225 ctx[1] = mock_context(i915, "B");
227 ce = i915_gem_context_get_engine(ctx[1], RCS0);
228 GEM_BUG_ON(IS_ERR(ce));
229 vip = mock_request(ce, 0);
230 intel_context_put(ce);
236 /* Simulate preemption by manual reordering */
237 if (!mock_cancel_request(request)) {
238 pr_err("failed to cancel request (already executed)!\n");
239 i915_request_add(vip);
242 i915_request_get(vip);
243 i915_request_add(vip);
245 request->engine->submit_request(request);
249 if (i915_request_wait(vip, 0, HZ) == -ETIME) {
250 pr_err("timed out waiting for high priority request\n");
254 if (i915_request_completed(request)) {
255 pr_err("low priority request already completed\n");
261 i915_request_put(vip);
263 mock_context_close(ctx[1]);
264 i915_request_put(request);
266 mock_context_close(ctx[0]);
267 mock_device_flush(i915);
272 struct intel_engine_cs *engine;
273 struct i915_gem_context **contexts;
274 atomic_long_t num_waits, num_fences;
275 int ncontexts, max_batch;
276 struct i915_request *(*request_alloc)(struct intel_context *ce);
279 static struct i915_request *
280 __mock_request_alloc(struct intel_context *ce)
282 return mock_request(ce, 0);
285 static struct i915_request *
286 __live_request_alloc(struct intel_context *ce)
288 return intel_context_create_request(ce);
291 static int __igt_breadcrumbs_smoketest(void *arg)
293 struct smoketest *t = arg;
294 const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
295 const unsigned int total = 4 * t->ncontexts + 1;
296 unsigned int num_waits = 0, num_fences = 0;
297 struct i915_request **requests;
298 I915_RND_STATE(prng);
303 * A very simple test to catch the most egregious of list handling bugs.
305 * At its heart, we simply create oodles of requests running across
306 * multiple kthreads and enable signaling on them, for the sole purpose
307 * of stressing our breadcrumb handling. The only inspection we do is
308 * that the fences were marked as signaled.
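 *
 * Each loop iteration builds a "submit" fence that gates a randomly sized
 * batch of requests and a "wait" fence that awaits all of their completions;
 * once the submit fence is committed the whole batch is released, and after
 * the wait fence fires we verify that every request's fence has
 * DMA_FENCE_FLAG_SIGNALED_BIT set.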
311 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
315 order = i915_random_order(total, &prng);
321 while (!kthread_should_stop()) {
322 struct i915_sw_fence *submit, *wait;
323 unsigned int n, count;
325 submit = heap_fence_create(GFP_KERNEL);
331 wait = heap_fence_create(GFP_KERNEL);
333 i915_sw_fence_commit(submit);
334 heap_fence_put(submit);
339 i915_random_reorder(order, total, &prng);
340 count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
342 for (n = 0; n < count; n++) {
343 struct i915_gem_context *ctx =
344 t->contexts[order[n] % t->ncontexts];
345 struct i915_request *rq;
346 struct intel_context *ce;
348 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
349 GEM_BUG_ON(IS_ERR(ce));
350 rq = t->request_alloc(ce);
351 intel_context_put(ce);
358 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
362 requests[n] = i915_request_get(rq);
363 i915_request_add(rq);
366 err = i915_sw_fence_await_dma_fence(wait,
372 i915_request_put(rq);
378 i915_sw_fence_commit(submit);
379 i915_sw_fence_commit(wait);
381 if (!wait_event_timeout(wait->wait,
382 i915_sw_fence_done(wait),
384 struct i915_request *rq = requests[count - 1];
386 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
387 atomic_read(&wait->pending), count,
388 rq->fence.context, rq->fence.seqno,
392 intel_gt_set_wedged(t->engine->gt);
393 GEM_BUG_ON(!i915_request_completed(rq));
394 i915_sw_fence_wait(wait);
398 for (n = 0; n < count; n++) {
399 struct i915_request *rq = requests[n];
401 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
403 pr_err("%llu:%llu was not signaled!\n",
404 rq->fence.context, rq->fence.seqno);
408 i915_request_put(rq);
411 heap_fence_put(wait);
412 heap_fence_put(submit);
423 atomic_long_add(num_fences, &t->num_fences);
424 atomic_long_add(num_waits, &t->num_waits);
432 static int mock_breadcrumbs_smoketest(void *arg)
434 struct drm_i915_private *i915 = arg;
435 struct smoketest t = {
436 .engine = rcs0(i915),
439 .request_alloc = __mock_request_alloc
441 unsigned int ncpus = num_online_cpus();
442 struct task_struct **threads;
447 * Smoketest our breadcrumb/signal handling for requests across multiple
448 * threads. A very simple test to catch only the most egregious of bugs.
449 * See __igt_breadcrumbs_smoketest();
452 threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
456 t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
462 for (n = 0; n < t.ncontexts; n++) {
463 t.contexts[n] = mock_context(t.engine->i915, "mock");
464 if (!t.contexts[n]) {
470 for (n = 0; n < ncpus; n++) {
471 threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
473 if (IS_ERR(threads[n])) {
474 ret = PTR_ERR(threads[n]);
479 get_task_struct(threads[n]);
482 yield(); /* start all threads before we begin */
483 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
485 for (n = 0; n < ncpus; n++) {
488 err = kthread_stop(threads[n]);
492 put_task_struct(threads[n]);
494 pr_info("Completed %lu waits for %lu fence across %d cpus\n",
495 atomic_long_read(&t.num_waits),
496 atomic_long_read(&t.num_fences),
500 for (n = 0; n < t.ncontexts; n++) {
503 mock_context_close(t.contexts[n]);
511 int i915_request_mock_selftests(void)
513 static const struct i915_subtest tests[] = {
514 SUBTEST(igt_add_request),
515 SUBTEST(igt_wait_request),
516 SUBTEST(igt_fence_wait),
517 SUBTEST(igt_request_rewind),
518 SUBTEST(mock_breadcrumbs_smoketest),
520 struct drm_i915_private *i915;
521 intel_wakeref_t wakeref;
524 i915 = mock_gem_device();
528 with_intel_runtime_pm(&i915->runtime_pm, wakeref)
529 err = i915_subtests(tests, i915);
531 mock_destroy_device(i915);
536 static int live_nop_request(void *arg)
538 struct drm_i915_private *i915 = arg;
539 struct intel_engine_cs *engine;
540 struct igt_live_test t;
544 * Submit various sized batches of empty requests, to each engine
545 * (individually), and wait for the batch to complete. We can check
546 * the overhead of submitting requests to the hardware.
549 for_each_uabi_engine(engine, i915) {
550 unsigned long n, prime;
551 IGT_TIMEOUT(end_time);
552 ktime_t times[2] = {};
554 err = igt_live_test_begin(&t, i915, __func__, engine->name);
558 intel_engine_pm_get(engine);
559 for_each_prime_number_from(prime, 1, 8192) {
560 struct i915_request *request = NULL;
562 times[1] = ktime_get_raw();
564 for (n = 0; n < prime; n++) {
565 i915_request_put(request);
566 request = i915_request_create(engine->kernel_context);
568 return PTR_ERR(request);
571 * This space is left intentionally blank.
573 * We do not actually want to perform any
574 * action with this request, we just want
575 * to measure the latency in allocation
576 * and submission of our breadcrumbs -
577 * ensuring that the bare request is sufficient
578 * for the system to work (i.e. proper HEAD
579 * tracking of the rings, interrupt handling,
580 * etc). It also gives us the lowest bounds
584 i915_request_get(request);
585 i915_request_add(request);
587 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
588 i915_request_put(request);
590 times[1] = ktime_sub(ktime_get_raw(), times[1]);
594 if (__igt_timeout(end_time, NULL))
597 intel_engine_pm_put(engine);
599 err = igt_live_test_end(&t);
603 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
605 ktime_to_ns(times[0]),
606 prime, div64_u64(ktime_to_ns(times[1]), prime));
612 static struct i915_vma *empty_batch(struct drm_i915_private *i915)
614 struct drm_i915_gem_object *obj;
615 struct i915_vma *vma;
619 obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
621 return ERR_CAST(obj);
623 cmd = i915_gem_object_pin_map(obj, I915_MAP_WB);
629 *cmd = MI_BATCH_BUFFER_END;
631 __i915_gem_object_flush_map(obj, 0, 64);
632 i915_gem_object_unpin_map(obj);
634 intel_gt_chipset_flush(&i915->gt);
636 vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
642 err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
646 /* Force the wait now to avoid including it in the benchmark */
647 err = i915_vma_sync(vma);
656 i915_gem_object_put(obj);
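/*
 * empty_request() submits a kernel-context request whose only payload is a
 * jump into the empty batch created above (a lone MI_BATCH_BUFFER_END), and
 * hands back a reference that the caller drops after waiting on it.
 */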
660 static struct i915_request *
661 empty_request(struct intel_engine_cs *engine,
662 struct i915_vma *batch)
664 struct i915_request *request;
667 request = i915_request_create(engine->kernel_context);
671 err = engine->emit_bb_start(request,
674 I915_DISPATCH_SECURE);
678 i915_request_get(request);
680 i915_request_add(request);
681 return err ? ERR_PTR(err) : request;
684 static int live_empty_request(void *arg)
686 struct drm_i915_private *i915 = arg;
687 struct intel_engine_cs *engine;
688 struct igt_live_test t;
689 struct i915_vma *batch;
693 * Submit various sized batches of empty requests, to each engine
694 * (individually), and wait for the batch to complete. We can check
695 * the overhead of submitting requests to the hardware.
698 batch = empty_batch(i915);
700 return PTR_ERR(batch);
702 for_each_uabi_engine(engine, i915) {
703 IGT_TIMEOUT(end_time);
704 struct i915_request *request;
705 unsigned long n, prime;
706 ktime_t times[2] = {};
708 err = igt_live_test_begin(&t, i915, __func__, engine->name);
712 intel_engine_pm_get(engine);
714 /* Warmup / preload */
715 request = empty_request(engine, batch);
716 if (IS_ERR(request)) {
717 err = PTR_ERR(request);
718 intel_engine_pm_put(engine);
721 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
723 for_each_prime_number_from(prime, 1, 8192) {
724 times[1] = ktime_get_raw();
726 for (n = 0; n < prime; n++) {
727 i915_request_put(request);
728 request = empty_request(engine, batch);
729 if (IS_ERR(request)) {
730 err = PTR_ERR(request);
731 intel_engine_pm_put(engine);
735 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
737 times[1] = ktime_sub(ktime_get_raw(), times[1]);
741 if (__igt_timeout(end_time, NULL))
744 i915_request_put(request);
745 intel_engine_pm_put(engine);
747 err = igt_live_test_end(&t);
751 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
753 ktime_to_ns(times[0]),
754 prime, div64_u64(ktime_to_ns(times[1]), prime));
758 i915_vma_unpin(batch);
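/*
 * recursive_batch() builds a batch buffer whose MI_BATCH_BUFFER_START jumps
 * back to its own start, so once submitted it spins on the GPU indefinitely.
 * recursive_batch_resolve() later overwrites that first dword with
 * MI_BATCH_BUFFER_END so the hanging requests can complete.
 */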
763 static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
765 struct drm_i915_gem_object *obj;
766 const int gen = INTEL_GEN(i915);
767 struct i915_vma *vma;
771 obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
773 return ERR_CAST(obj);
775 vma = i915_vma_instance(obj, i915->gt.vm, NULL);
781 err = i915_vma_pin(vma, 0, 0, PIN_USER);
785 cmd = i915_gem_object_pin_map(obj, I915_MAP_WC);
792 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
793 *cmd++ = lower_32_bits(vma->node.start);
794 *cmd++ = upper_32_bits(vma->node.start);
795 } else if (gen >= 6) {
796 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
797 *cmd++ = lower_32_bits(vma->node.start);
799 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
800 *cmd++ = lower_32_bits(vma->node.start);
802 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
804 __i915_gem_object_flush_map(obj, 0, 64);
805 i915_gem_object_unpin_map(obj);
807 intel_gt_chipset_flush(&i915->gt);
812 i915_gem_object_put(obj);
816 static int recursive_batch_resolve(struct i915_vma *batch)
820 cmd = i915_gem_object_pin_map(batch->obj, I915_MAP_WC);
824 *cmd = MI_BATCH_BUFFER_END;
826 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
827 i915_gem_object_unpin_map(batch->obj);
829 intel_gt_chipset_flush(batch->vm->gt);
834 static int live_all_engines(void *arg)
836 struct drm_i915_private *i915 = arg;
837 const unsigned int nengines = num_uabi_engines(i915);
838 struct intel_engine_cs *engine;
839 struct i915_request **request;
840 struct igt_live_test t;
841 struct i915_vma *batch;
846 * Check we can submit requests to all engines simultaneously. We
847 * send a recursive batch to each engine - checking that we don't
848 * block doing so, and that they don't complete too soon.
851 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
855 err = igt_live_test_begin(&t, i915, __func__, "");
859 batch = recursive_batch(i915);
861 err = PTR_ERR(batch);
862 pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
866 i915_vma_lock(batch);
869 for_each_uabi_engine(engine, i915) {
870 request[idx] = intel_engine_create_kernel_request(engine);
871 if (IS_ERR(request[idx])) {
872 err = PTR_ERR(request[idx]);
873 pr_err("%s: Request allocation failed with err=%d\n",
878 err = i915_request_await_object(request[idx], batch->obj, 0);
880 err = i915_vma_move_to_active(batch, request[idx], 0);
883 err = engine->emit_bb_start(request[idx],
888 request[idx]->batch = batch;
890 i915_request_get(request[idx]);
891 i915_request_add(request[idx]);
895 i915_vma_unlock(batch);
898 for_each_uabi_engine(engine, i915) {
899 if (i915_request_completed(request[idx])) {
900 pr_err("%s(%s): request completed too early!\n",
901 __func__, engine->name);
908 err = recursive_batch_resolve(batch);
910 pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
915 for_each_uabi_engine(engine, i915) {
918 timeout = i915_request_wait(request[idx], 0,
919 MAX_SCHEDULE_TIMEOUT);
922 pr_err("%s: error waiting for request on %s, err=%d\n",
923 __func__, engine->name, err);
927 GEM_BUG_ON(!i915_request_completed(request[idx]));
928 i915_request_put(request[idx]);
933 err = igt_live_test_end(&t);
937 for_each_uabi_engine(engine, i915) {
939 i915_request_put(request[idx]);
942 i915_vma_unpin(batch);
949 static int live_sequential_engines(void *arg)
951 struct drm_i915_private *i915 = arg;
952 const unsigned int nengines = num_uabi_engines(i915);
953 struct i915_request **request;
954 struct i915_request *prev = NULL;
955 struct intel_engine_cs *engine;
956 struct igt_live_test t;
961 * Check we can submit requests to all engines sequentially, such
962 * that each successive request waits for the earlier ones. This
963 * tests that we don't execute requests out of order, even though
964 * they are running on independent engines.
967 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
971 err = igt_live_test_begin(&t, i915, __func__, "");
976 for_each_uabi_engine(engine, i915) {
977 struct i915_vma *batch;
979 batch = recursive_batch(i915);
981 err = PTR_ERR(batch);
982 pr_err("%s: Unable to create batch for %s, err=%d\n",
983 __func__, engine->name, err);
987 i915_vma_lock(batch);
988 request[idx] = intel_engine_create_kernel_request(engine);
989 if (IS_ERR(request[idx])) {
990 err = PTR_ERR(request[idx]);
991 pr_err("%s: Request allocation failed for %s with err=%d\n",
992 __func__, engine->name, err);
997 err = i915_request_await_dma_fence(request[idx],
1000 i915_request_add(request[idx]);
1001 pr_err("%s: Request await failed for %s with err=%d\n",
1002 __func__, engine->name, err);
1007 err = i915_request_await_object(request[idx],
1010 err = i915_vma_move_to_active(batch, request[idx], 0);
1013 err = engine->emit_bb_start(request[idx],
1018 request[idx]->batch = batch;
1020 i915_request_get(request[idx]);
1021 i915_request_add(request[idx]);
1023 prev = request[idx];
1027 i915_vma_unlock(batch);
1033 for_each_uabi_engine(engine, i915) {
1036 if (i915_request_completed(request[idx])) {
1037 pr_err("%s(%s): request completed too early!\n",
1038 __func__, engine->name);
1043 err = recursive_batch_resolve(request[idx]->batch);
1045 pr_err("%s: failed to resolve batch, err=%d\n",
1050 timeout = i915_request_wait(request[idx], 0,
1051 MAX_SCHEDULE_TIMEOUT);
1054 pr_err("%s: error waiting for request on %s, err=%d\n",
1055 __func__, engine->name, err);
1059 GEM_BUG_ON(!i915_request_completed(request[idx]));
1063 err = igt_live_test_end(&t);
1067 for_each_uabi_engine(engine, i915) {
1073 cmd = i915_gem_object_pin_map(request[idx]->batch->obj,
1076 *cmd = MI_BATCH_BUFFER_END;
1078 __i915_gem_object_flush_map(request[idx]->batch->obj,
1080 i915_gem_object_unpin_map(request[idx]->batch->obj);
1082 intel_gt_chipset_flush(engine->gt);
1085 i915_vma_put(request[idx]->batch);
1086 i915_request_put(request[idx]);
1094 static int __live_parallel_engine1(void *arg)
1096 struct intel_engine_cs *engine = arg;
1097 IGT_TIMEOUT(end_time);
1098 unsigned long count;
1102 intel_engine_pm_get(engine);
1104 struct i915_request *rq;
1106 rq = i915_request_create(engine->kernel_context);
1112 i915_request_get(rq);
1113 i915_request_add(rq);
1116 if (i915_request_wait(rq, 0, HZ / 5) < 0)
1118 i915_request_put(rq);
1123 } while (!__igt_timeout(end_time, NULL));
1124 intel_engine_pm_put(engine);
1126 pr_info("%s: %lu request + sync\n", engine->name, count);
1130 static int __live_parallel_engineN(void *arg)
1132 struct intel_engine_cs *engine = arg;
1133 IGT_TIMEOUT(end_time);
1134 unsigned long count;
1138 intel_engine_pm_get(engine);
1140 struct i915_request *rq;
1142 rq = i915_request_create(engine->kernel_context);
1148 i915_request_add(rq);
1150 } while (!__igt_timeout(end_time, NULL));
1151 intel_engine_pm_put(engine);
1153 pr_info("%s: %lu requests\n", engine->name, count);
1157 static bool wake_all(struct drm_i915_private *i915)
1159 if (atomic_dec_and_test(&i915->selftest.counter)) {
1160 wake_up_var(&i915->selftest.counter);
1167 static int wait_for_all(struct drm_i915_private *i915)
1172 if (wait_var_event_timeout(&i915->selftest.counter,
1173 !atomic_read(&i915->selftest.counter),
1174 i915_selftest.timeout_jiffies))
1180 static int __live_parallel_spin(void *arg)
1182 struct intel_engine_cs *engine = arg;
1183 struct igt_spinner spin;
1184 struct i915_request *rq;
1188 * Create a spinner running for eternity on each engine. If a second
1189 * spinner is incorrectly placed on the same engine, it will not be
1190 * able to start in time.
1193 if (igt_spinner_init(&spin, engine->gt)) {
1194 wake_all(engine->i915);
1198 intel_engine_pm_get(engine);
1199 rq = igt_spinner_create_request(&spin,
1200 engine->kernel_context,
1201 MI_NOOP); /* no preemption */
1202 intel_engine_pm_put(engine);
1207 wake_all(engine->i915);
1211 i915_request_get(rq);
1212 i915_request_add(rq);
1213 if (igt_wait_for_spinner(&spin, rq)) {
1214 /* Occupy this engine for the whole test */
1215 err = wait_for_all(engine->i915);
1217 pr_err("Failed to start spinner on %s\n", engine->name);
1220 igt_spinner_end(&spin);
1222 if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0)
1224 i915_request_put(rq);
1227 igt_spinner_fini(&spin);
1231 static int live_parallel_engines(void *arg)
1233 struct drm_i915_private *i915 = arg;
1234 static int (* const func[])(void *arg) = {
1235 __live_parallel_engine1,
1236 __live_parallel_engineN,
1237 __live_parallel_spin,
1240 const unsigned int nengines = num_uabi_engines(i915);
1241 struct intel_engine_cs *engine;
1242 int (* const *fn)(void *arg);
1243 struct task_struct **tsk;
1247 * Check we can submit requests to all engines concurrently. This
1248 * tests that we load up the system maximally.
1251 tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
1255 for (fn = func; !err && *fn; fn++) {
1256 char name[KSYM_NAME_LEN];
1257 struct igt_live_test t;
1260 snprintf(name, sizeof(name), "%ps", *fn);
1261 err = igt_live_test_begin(&t, i915, __func__, name);
1265 atomic_set(&i915->selftest.counter, nengines);
1268 for_each_uabi_engine(engine, i915) {
1269 tsk[idx] = kthread_run(*fn, engine,
1272 if (IS_ERR(tsk[idx])) {
1273 err = PTR_ERR(tsk[idx]);
1276 get_task_struct(tsk[idx++]);
1279 yield(); /* start all threads before we kthread_stop() */
1282 for_each_uabi_engine(engine, i915) {
1285 if (IS_ERR(tsk[idx]))
1288 status = kthread_stop(tsk[idx]);
1292 put_task_struct(tsk[idx++]);
1295 if (igt_live_test_end(&t))
1304 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1306 struct i915_request *rq;
1310 * Before execlists, all contexts share the same ringbuffer. With
1311 * execlists, each context/engine has a separate ringbuffer that is,
1312 * for the purposes of this test, inexhaustible.
1314 * For the global ringbuffer though, we have to be very careful
1315 * that we do not wrap while preventing the execution of requests
1316 * with an unsignaled fence.
1318 if (HAS_EXECLISTS(ctx->i915))
1321 rq = igt_request_alloc(ctx, engine);
1327 ret = rq->ring->size - rq->reserved_space;
1328 i915_request_add(rq);
1330 sz = rq->ring->emit - rq->head;
1332 sz += rq->ring->size;
1334 ret /= 2; /* leave half spare, in case of emergency! */
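	/*
	 * i.e. ret is roughly the free space in the legacy ring divided by
	 * the footprint of a single request, then halved for headroom.
	 */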
1340 static int live_breadcrumbs_smoketest(void *arg)
1342 struct drm_i915_private *i915 = arg;
1343 const unsigned int nengines = num_uabi_engines(i915);
1344 const unsigned int ncpus = num_online_cpus();
1345 unsigned long num_waits, num_fences;
1346 struct intel_engine_cs *engine;
1347 struct task_struct **threads;
1348 struct igt_live_test live;
1349 intel_wakeref_t wakeref;
1350 struct smoketest *smoke;
1351 unsigned int n, idx;
1356 * Smoketest our breadcrumb/signal handling for requests across multiple
1357 * threads. A very simple test to catch only the most egregious of bugs.
1358 * See __igt_breadcrumbs_smoketest();
1360 * On real hardware this time.
1363 wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1365 file = mock_file(i915);
1367 ret = PTR_ERR(file);
1371 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1377 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1383 smoke[0].request_alloc = __live_request_alloc;
1384 smoke[0].ncontexts = 64;
1385 smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1386 sizeof(*smoke[0].contexts),
1388 if (!smoke[0].contexts) {
1393 for (n = 0; n < smoke[0].ncontexts; n++) {
1394 smoke[0].contexts[n] = live_context(i915, file);
1395 if (!smoke[0].contexts[n]) {
1401 ret = igt_live_test_begin(&live, i915, __func__, "");
1406 for_each_uabi_engine(engine, i915) {
1407 smoke[idx] = smoke[0];
1408 smoke[idx].engine = engine;
1409 smoke[idx].max_batch =
1410 max_batches(smoke[0].contexts[0], engine);
1411 if (smoke[idx].max_batch < 0) {
1412 ret = smoke[idx].max_batch;
1415 /* One ring interleaved between requests from all cpus */
1416 smoke[idx].max_batch /= num_online_cpus() + 1;
1417 pr_debug("Limiting batches to %d requests on %s\n",
1418 smoke[idx].max_batch, engine->name);
1420 for (n = 0; n < ncpus; n++) {
1421 struct task_struct *tsk;
1423 tsk = kthread_run(__igt_breadcrumbs_smoketest,
1424 &smoke[idx], "igt/%d.%d", idx, n);
1430 get_task_struct(tsk);
1431 threads[idx * ncpus + n] = tsk;
1437 yield(); /* start all threads before we begin */
1438 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1444 for_each_uabi_engine(engine, i915) {
1445 for (n = 0; n < ncpus; n++) {
1446 struct task_struct *tsk = threads[idx * ncpus + n];
1452 err = kthread_stop(tsk);
1453 if (err < 0 && !ret)
1456 put_task_struct(tsk);
1459 num_waits += atomic_long_read(&smoke[idx].num_waits);
1460 num_fences += atomic_long_read(&smoke[idx].num_fences);
1463 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1464 num_waits, num_fences, idx, ncpus);
1466 ret = igt_live_test_end(&live) ?: ret;
1468 kfree(smoke[0].contexts);
1476 intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1481 int i915_request_live_selftests(struct drm_i915_private *i915)
1483 static const struct i915_subtest tests[] = {
1484 SUBTEST(live_nop_request),
1485 SUBTEST(live_all_engines),
1486 SUBTEST(live_sequential_engines),
1487 SUBTEST(live_parallel_engines),
1488 SUBTEST(live_empty_request),
1489 SUBTEST(live_breadcrumbs_smoketest),
1492 if (intel_gt_is_wedged(&i915->gt))
1495 return i915_subtests(tests, i915);
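/*
 * switch_to_kernel_sync() idles an engine after a perf measurement: it
 * queues a kernel-context request ordered after the context's last request,
 * waits for it, and then flushes submission until the engine reports idle,
 * so the busyness/runtime statistics have settled before they are sampled.
 */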
1498 static int switch_to_kernel_sync(struct intel_context *ce, int err)
1500 struct i915_request *rq;
1501 struct dma_fence *fence;
1503 rq = intel_engine_create_kernel_request(ce->engine);
1507 fence = i915_active_fence_get(&ce->timeline->last_request);
1509 i915_request_await_dma_fence(rq, fence);
1510 dma_fence_put(fence);
1513 rq = i915_request_get(rq);
1514 i915_request_add(rq);
1515 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1517 i915_request_put(rq);
1519 while (!err && !intel_engine_is_idle(ce->engine))
1520 intel_engine_flush_submission(ce->engine);
1526 struct intel_engine_cs *engine;
1527 unsigned long count;
1533 struct perf_series {
1534 struct drm_i915_private *i915;
1535 unsigned int nengines;
1536 struct intel_context *ce[];
1539 static int cmp_u32(const void *A, const void *B)
1541 const u32 *a = A, *b = B;
1546 static u32 trifilter(u32 *a)
1551 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1553 sum = mul_u32_u32(a[2], 2);
1557 GEM_BUG_ON(sum > U32_MAX);
1562 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1564 u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
1566 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
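/*
 * trifilter() sorts the TF_COUNT raw samples and forms a weighted sum
 * centred on the median (counted twice via mul_u32_u32() above, presumably
 * plus its two neighbours in the elided lines), which consumers undo with a
 * shift or division by 1 << TF_BIAS. Assuming TF_COUNT == 5 and TF_BIAS == 2,
 * samples {3, 5, 4, 100, 4} sort to {3, 4, 4, 5, 100} and report
 * (4 + 2*4 + 5) >> 2 = 4 cycles, discarding the outlier.
 */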
1569 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1571 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1572 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1579 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1581 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1589 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1591 *cs++ = MI_SEMAPHORE_WAIT |
1592 MI_SEMAPHORE_GLOBAL_GTT |
1602 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1604 return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1607 static void semaphore_set(u32 *sema, u32 value)
1609 WRITE_ONCE(*sema, value);
1610 wmb(); /* flush the update to the cache, and beyond */
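/*
 * hwsp_scratch() borrows a run of dwords from the engine's status page
 * (HWSP) as scratch shared between the CPU and the GPU, and hwsp_offset()
 * converts a CPU pointer into that page into the global GTT offset the
 * command streamer uses for its semaphore waits and timestamp stores.
 */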
1613 static u32 *hwsp_scratch(const struct intel_context *ce)
1615 return memset32(ce->engine->status_page.addr + 1000, 0, 21);
1618 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
1620 return (i915_ggtt_offset(ce->engine->status_page.vma) +
1621 offset_in_page(dw));
1624 static int measure_semaphore_response(struct intel_context *ce)
1626 u32 *sema = hwsp_scratch(ce);
1627 const u32 offset = hwsp_offset(ce, sema);
1628 u32 elapsed[TF_COUNT], cycles;
1629 struct i915_request *rq;
1635 * Measure how many cycles it takes for the HW to detect the change
1636 * in a semaphore value.
1638 * A: read CS_TIMESTAMP from CPU
1640 * B: read CS_TIMESTAMP on GPU
1642 * Semaphore latency: B - A
1645 semaphore_set(sema, -1);
1647 rq = i915_request_create(ce);
1651 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
1653 i915_request_add(rq);
1658 cs = emit_store_dw(cs, offset, 0);
1659 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1660 cs = emit_semaphore_poll_until(cs, offset, i);
1661 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1662 cs = emit_store_dw(cs, offset, 0);
1665 intel_ring_advance(rq, cs);
1666 i915_request_add(rq);
1668 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1673 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1675 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1676 semaphore_set(sema, i);
1679 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1684 elapsed[i - 1] = sema[i] - cycles;
1687 cycles = trifilter(elapsed);
1688 pr_info("%s: semaphore response %d cycles, %lluns\n",
1689 ce->engine->name, cycles >> TF_BIAS,
1690 cycles_to_ns(ce->engine, cycles));
1692 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1695 intel_gt_set_wedged(ce->engine->gt);
1699 static int measure_idle_dispatch(struct intel_context *ce)
1701 u32 *sema = hwsp_scratch(ce);
1702 const u32 offset = hwsp_offset(ce, sema);
1703 u32 elapsed[TF_COUNT], cycles;
1709 * Measure how long it takes for us to submit a request while the
1710 * engine is idle, but is resting in our context.
1712 * A: read CS_TIMESTAMP from CPU
1714 * B: read CS_TIMESTAMP on GPU
1716 * Submission latency: B - A
1719 for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
1720 struct i915_request *rq;
1722 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1726 rq = i915_request_create(ce);
1732 cs = intel_ring_begin(rq, 4);
1734 i915_request_add(rq);
1739 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1741 intel_ring_advance(rq, cs);
1745 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1746 i915_request_add(rq);
1751 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1755 for (i = 0; i < ARRAY_SIZE(elapsed); i++)
1756 elapsed[i] = sema[i] - elapsed[i];
1758 cycles = trifilter(elapsed);
1759 pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
1760 ce->engine->name, cycles >> TF_BIAS,
1761 cycles_to_ns(ce->engine, cycles));
1763 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1766 intel_gt_set_wedged(ce->engine->gt);
1770 static int measure_busy_dispatch(struct intel_context *ce)
1772 u32 *sema = hwsp_scratch(ce);
1773 const u32 offset = hwsp_offset(ce, sema);
1774 u32 elapsed[TF_COUNT + 1], cycles;
1780 * Measure how long it takes for us to submit a request while the
1781 * engine is busy, polling on a semaphore in our context. With
1782 * direct submission, this will include the cost of a lite restore.
1784 * A: read CS_TIMESTAMP from CPU
1786 * B: read CS_TIMESTAMP on GPU
1788 * Submission latency: B - A
1791 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1792 struct i915_request *rq;
1794 rq = i915_request_create(ce);
1800 cs = intel_ring_begin(rq, 12);
1802 i915_request_add(rq);
1807 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
1808 cs = emit_semaphore_poll_until(cs, offset, i);
1809 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1811 intel_ring_advance(rq, cs);
1813 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
1820 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1821 i915_request_add(rq);
1823 semaphore_set(sema, i - 1);
1827 wait_for(READ_ONCE(sema[i - 1]), 500);
1828 semaphore_set(sema, i - 1);
1830 for (i = 1; i <= TF_COUNT; i++) {
1831 GEM_BUG_ON(sema[i] == -1);
1832 elapsed[i - 1] = sema[i] - elapsed[i];
1835 cycles = trifilter(elapsed);
1836 pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
1837 ce->engine->name, cycles >> TF_BIAS,
1838 cycles_to_ns(ce->engine, cycles));
1840 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1843 intel_gt_set_wedged(ce->engine->gt);
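/*
 * plug() blocks an engine by submitting a kernel-context request that spins
 * on a MI_SEMAPHORE_WAIT against the given HWSP dword; everything queued
 * afterwards stacks up behind it until semaphore_set() releases the poll
 * from the CPU.
 */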
1847 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
1850 i915_ggtt_offset(engine->status_page.vma) +
1851 offset_in_page(sema);
1852 struct i915_request *rq;
1855 rq = i915_request_create(engine->kernel_context);
1859 cs = intel_ring_begin(rq, 4);
1861 i915_request_add(rq);
1865 cs = emit_semaphore_poll(cs, mode, value, offset);
1867 intel_ring_advance(rq, cs);
1868 i915_request_add(rq);
1873 static int measure_inter_request(struct intel_context *ce)
1875 u32 *sema = hwsp_scratch(ce);
1876 const u32 offset = hwsp_offset(ce, sema);
1877 u32 elapsed[TF_COUNT + 1], cycles;
1878 struct i915_sw_fence *submit;
1882 * Measure how long it takes to advance from one request into the
1883 * next. Between each request we flush the GPU caches to memory,
1884 * update the breadcrumbs, and then invalidate those caches.
1885 * We queue up all the requests to be submitted in one batch so
1886 * it should be one set of contiguous measurements.
1888 * A: read CS_TIMESTAMP on GPU
1890 * B: read CS_TIMESTAMP on GPU
1892 * Request latency: B - A
1895 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
1899 submit = heap_fence_create(GFP_KERNEL);
1901 semaphore_set(sema, 1);
1905 intel_engine_flush_submission(ce->engine);
1906 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1907 struct i915_request *rq;
1910 rq = i915_request_create(ce);
1916 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
1920 i915_request_add(rq);
1924 cs = intel_ring_begin(rq, 4);
1926 i915_request_add(rq);
1931 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1933 intel_ring_advance(rq, cs);
1934 i915_request_add(rq);
1936 i915_sw_fence_commit(submit);
1937 intel_engine_flush_submission(ce->engine);
1938 heap_fence_put(submit);
1940 semaphore_set(sema, 1);
1941 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1945 for (i = 1; i <= TF_COUNT; i++)
1946 elapsed[i - 1] = sema[i + 1] - sema[i];
1948 cycles = trifilter(elapsed);
1949 pr_info("%s: inter-request latency %d cycles, %lluns\n",
1950 ce->engine->name, cycles >> TF_BIAS,
1951 cycles_to_ns(ce->engine, cycles));
1953 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1956 i915_sw_fence_commit(submit);
1957 heap_fence_put(submit);
1958 semaphore_set(sema, 1);
1960 intel_gt_set_wedged(ce->engine->gt);
1964 static int measure_context_switch(struct intel_context *ce)
1966 u32 *sema = hwsp_scratch(ce);
1967 const u32 offset = hwsp_offset(ce, sema);
1968 struct i915_request *fence = NULL;
1969 u32 elapsed[TF_COUNT + 1], cycles;
1974 * Measure how long it takes to advance from one request in one
1975 * context to a request in another context. This allows us to
1976 * measure how long the context save/restore take, along with all
1977 * the inter-context setup we require.
1979 * A: read CS_TIMESTAMP on GPU
1981 * B: read CS_TIMESTAMP on GPU
1983 * Context switch latency: B - A
1986 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
1990 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1991 struct intel_context *arr[] = {
1992 ce, ce->engine->kernel_context
1994 u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
1996 for (j = 0; j < ARRAY_SIZE(arr); j++) {
1997 struct i915_request *rq;
1999 rq = i915_request_create(arr[j]);
2006 err = i915_request_await_dma_fence(rq,
2009 i915_request_add(rq);
2014 cs = intel_ring_begin(rq, 4);
2016 i915_request_add(rq);
2021 cs = emit_timestamp_store(cs, ce, addr);
2022 addr += sizeof(u32);
2024 intel_ring_advance(rq, cs);
2026 i915_request_put(fence);
2027 fence = i915_request_get(rq);
2029 i915_request_add(rq);
2032 i915_request_put(fence);
2033 intel_engine_flush_submission(ce->engine);
2035 semaphore_set(sema, 1);
2036 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2040 for (i = 1; i <= TF_COUNT; i++)
2041 elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2043 cycles = trifilter(elapsed);
2044 pr_info("%s: context switch latency %d cycles, %lluns\n",
2045 ce->engine->name, cycles >> TF_BIAS,
2046 cycles_to_ns(ce->engine, cycles));
2048 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2051 i915_request_put(fence);
2052 semaphore_set(sema, 1);
2054 intel_gt_set_wedged(ce->engine->gt);
2058 static int measure_preemption(struct intel_context *ce)
2060 u32 *sema = hwsp_scratch(ce);
2061 const u32 offset = hwsp_offset(ce, sema);
2062 u32 elapsed[TF_COUNT], cycles;
2068 * We measure two latencies while triggering preemption. The first
2069 * latency is how long it takes for us to submit a preempting request.
2070 * The second latency is how long it takes for us to return from the
2071 * preemption back to the original context.
2073 * A: read CS_TIMESTAMP from CPU
2075 * B: read CS_TIMESTAMP on GPU (in preempting context)
2077 * C: read CS_TIMESTAMP on GPU (in original context)
2079 * Preemption dispatch latency: B - A
2080 * Preemption switch latency: C - B
2083 if (!intel_engine_has_preemption(ce->engine))
2086 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2087 u32 addr = offset + 2 * i * sizeof(u32);
2088 struct i915_request *rq;
2090 rq = i915_request_create(ce);
2096 cs = intel_ring_begin(rq, 12);
2098 i915_request_add(rq);
2103 cs = emit_store_dw(cs, addr, -1);
2104 cs = emit_semaphore_poll_until(cs, offset, i);
2105 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2107 intel_ring_advance(rq, cs);
2108 i915_request_add(rq);
2110 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2115 rq = i915_request_create(ce->engine->kernel_context);
2121 cs = intel_ring_begin(rq, 8);
2123 i915_request_add(rq);
2128 cs = emit_timestamp_store(cs, ce, addr);
2129 cs = emit_store_dw(cs, offset, i);
2131 intel_ring_advance(rq, cs);
2132 rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2134 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2135 i915_request_add(rq);
2138 if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2143 for (i = 1; i <= TF_COUNT; i++)
2144 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2146 cycles = trifilter(elapsed);
2147 pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2148 ce->engine->name, cycles >> TF_BIAS,
2149 cycles_to_ns(ce->engine, cycles));
2151 for (i = 1; i <= TF_COUNT; i++)
2152 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2154 cycles = trifilter(elapsed);
2155 pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2156 ce->engine->name, cycles >> TF_BIAS,
2157 cycles_to_ns(ce->engine, cycles));
2159 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2162 intel_gt_set_wedged(ce->engine->gt);
2167 struct dma_fence_cb base;
2171 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2173 struct signal_cb *s = container_of(cb, typeof(*s), base);
2175 smp_store_mb(s->seen, true); /* be safe, be strong */
2178 static int measure_completion(struct intel_context *ce)
2180 u32 *sema = hwsp_scratch(ce);
2181 const u32 offset = hwsp_offset(ce, sema);
2182 u32 elapsed[TF_COUNT], cycles;
2188 * Measure how long it takes for the signal (interrupt) to be
2189 * sent from the GPU and processed by the CPU.
2191 * A: read CS_TIMESTAMP on GPU
2193 * B: read CS_TIMESTAMP from CPU
2195 * Completion latency: B - A
2198 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2199 struct signal_cb cb = { .seen = false };
2200 struct i915_request *rq;
2202 rq = i915_request_create(ce);
2208 cs = intel_ring_begin(rq, 12);
2210 i915_request_add(rq);
2215 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2216 cs = emit_semaphore_poll_until(cs, offset, i);
2217 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2219 intel_ring_advance(rq, cs);
2221 dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2222 i915_request_add(rq);
2224 intel_engine_flush_submission(ce->engine);
2225 if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2231 semaphore_set(sema, i);
2232 while (!READ_ONCE(cb.seen))
2235 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2239 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2243 for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2244 GEM_BUG_ON(sema[i + 1] == -1);
2245 elapsed[i] = elapsed[i] - sema[i + 1];
2248 cycles = trifilter(elapsed);
2249 pr_info("%s: completion latency %d cycles, %lluns\n",
2250 ce->engine->name, cycles >> TF_BIAS,
2251 cycles_to_ns(ce->engine, cycles));
2253 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2256 intel_gt_set_wedged(ce->engine->gt);
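/*
 * rps_pin()/rps_unpin() keep forcewake asserted and hold the GPU at its
 * maximum frequency for the duration of the measurements, so the latency
 * numbers are not perturbed by the GPU ramping its clocks mid-test.
 */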
2260 static void rps_pin(struct intel_gt *gt)
2262 /* Pin the frequency to max */
2263 atomic_inc(&gt->rps.num_waiters);
2264 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2266 mutex_lock(&gt->rps.lock);
2267 intel_rps_set(&gt->rps, gt->rps.max_freq);
2268 mutex_unlock(&gt->rps.lock);
2271 static void rps_unpin(struct intel_gt *gt)
2273 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2274 atomic_dec(&gt->rps.num_waiters);
2277 static int perf_request_latency(void *arg)
2279 struct drm_i915_private *i915 = arg;
2280 struct intel_engine_cs *engine;
2281 struct pm_qos_request qos;
2284 if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
2287 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2289 for_each_uabi_engine(engine, i915) {
2290 struct intel_context *ce;
2292 ce = intel_context_create(engine);
2298 err = intel_context_pin(ce);
2300 intel_context_put(ce);
2304 st_engine_heartbeat_disable(engine);
2305 rps_pin(engine->gt);
2308 err = measure_semaphore_response(ce);
2310 err = measure_idle_dispatch(ce);
2312 err = measure_busy_dispatch(ce);
2314 err = measure_inter_request(ce);
2316 err = measure_context_switch(ce);
2318 err = measure_preemption(ce);
2320 err = measure_completion(ce);
2322 rps_unpin(engine->gt);
2323 st_engine_heartbeat_enable(engine);
2325 intel_context_unpin(ce);
2326 intel_context_put(ce);
2332 if (igt_flush_test(i915))
2335 cpu_latency_qos_remove_request(&qos);
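/*
 * The perf_series workers below drive every context in the series from a
 * single thread: s_sync0() waits for each request before moving on,
 * s_sync1() overlaps by one (waiting on the previous request while the next
 * is in flight), and s_many() just fires requests without waiting.
 */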
2339 static int s_sync0(void *arg)
2341 struct perf_series *ps = arg;
2342 IGT_TIMEOUT(end_time);
2343 unsigned int idx = 0;
2346 GEM_BUG_ON(!ps->nengines);
2348 struct i915_request *rq;
2350 rq = i915_request_create(ps->ce[idx]);
2356 i915_request_get(rq);
2357 i915_request_add(rq);
2359 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2361 i915_request_put(rq);
2365 if (++idx == ps->nengines)
2367 } while (!__igt_timeout(end_time, NULL));
2372 static int s_sync1(void *arg)
2374 struct perf_series *ps = arg;
2375 struct i915_request *prev = NULL;
2376 IGT_TIMEOUT(end_time);
2377 unsigned int idx = 0;
2380 GEM_BUG_ON(!ps->nengines);
2382 struct i915_request *rq;
2384 rq = i915_request_create(ps->ce[idx]);
2390 i915_request_get(rq);
2391 i915_request_add(rq);
2393 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2395 i915_request_put(prev);
2400 if (++idx == ps->nengines)
2402 } while (!__igt_timeout(end_time, NULL));
2403 i915_request_put(prev);
2408 static int s_many(void *arg)
2410 struct perf_series *ps = arg;
2411 IGT_TIMEOUT(end_time);
2412 unsigned int idx = 0;
2414 GEM_BUG_ON(!ps->nengines);
2416 struct i915_request *rq;
2418 rq = i915_request_create(ps->ce[idx]);
2422 i915_request_add(rq);
2424 if (++idx == ps->nengines)
2426 } while (!__igt_timeout(end_time, NULL));
2431 static int perf_series_engines(void *arg)
2433 struct drm_i915_private *i915 = arg;
2434 static int (* const func[])(void *arg) = {
2440 const unsigned int nengines = num_uabi_engines(i915);
2441 struct intel_engine_cs *engine;
2442 int (* const *fn)(void *arg);
2443 struct pm_qos_request qos;
2444 struct perf_stats *stats;
2445 struct perf_series *ps;
2449 stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2453 ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2459 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2462 ps->nengines = nengines;
2465 for_each_uabi_engine(engine, i915) {
2466 struct intel_context *ce;
2468 ce = intel_context_create(engine);
2474 err = intel_context_pin(ce);
2476 intel_context_put(ce);
2482 GEM_BUG_ON(idx != ps->nengines);
2484 for (fn = func; *fn && !err; fn++) {
2485 char name[KSYM_NAME_LEN];
2486 struct igt_live_test t;
2488 snprintf(name, sizeof(name), "%ps", *fn);
2489 err = igt_live_test_begin(&t, i915, __func__, name);
2493 for (idx = 0; idx < nengines; idx++) {
2494 struct perf_stats *p =
2495 memset(&stats[idx], 0, sizeof(stats[idx]));
2496 struct intel_context *ce = ps->ce[idx];
2498 p->engine = ps->ce[idx]->engine;
2499 intel_engine_pm_get(p->engine);
2501 if (intel_engine_supports_stats(p->engine))
2502 p->busy = intel_engine_get_busy_time(p->engine,
2505 p->time = ktime_get();
2506 p->runtime = -intel_context_get_total_runtime_ns(ce);
2510 if (igt_live_test_end(&t))
2513 for (idx = 0; idx < nengines; idx++) {
2514 struct perf_stats *p = &stats[idx];
2515 struct intel_context *ce = ps->ce[idx];
2516 int integer, decimal;
2520 p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2525 p->time = ktime_sub(now, p->time);
2527 err = switch_to_kernel_sync(ce, err);
2528 p->runtime += intel_context_get_total_runtime_ns(ce);
2529 intel_engine_pm_put(p->engine);
2531 busy = 100 * ktime_to_ns(p->busy);
2532 dt = ktime_to_ns(p->time);
2534 integer = div64_u64(busy, dt);
2535 busy -= integer * dt;
2536 decimal = div64_u64(100 * busy, dt);
2542 pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2543 name, p->engine->name, ce->timeline->seqno,
2545 div_u64(p->runtime, 1000 * 1000),
2546 div_u64(ktime_to_ns(p->time), 1000 * 1000));
2551 for (idx = 0; idx < nengines; idx++) {
2552 if (IS_ERR_OR_NULL(ps->ce[idx]))
2555 intel_context_unpin(ps->ce[idx]);
2556 intel_context_put(ps->ce[idx]);
2560 cpu_latency_qos_remove_request(&qos);
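/*
 * p_sync0(), p_sync1() and p_many() are the per-engine analogues of the
 * series workers above: each kthread owns one engine and one context,
 * applies the same submission pattern, and records busy time, wall time and
 * context runtime into its perf_stats for the final report.
 */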
2565 static int p_sync0(void *arg)
2567 struct perf_stats *p = arg;
2568 struct intel_engine_cs *engine = p->engine;
2569 struct intel_context *ce;
2570 IGT_TIMEOUT(end_time);
2571 unsigned long count;
2575 ce = intel_context_create(engine);
2579 err = intel_context_pin(ce);
2581 intel_context_put(ce);
2585 if (intel_engine_supports_stats(engine)) {
2586 p->busy = intel_engine_get_busy_time(engine, &p->time);
2589 p->time = ktime_get();
2595 struct i915_request *rq;
2597 rq = i915_request_create(ce);
2603 i915_request_get(rq);
2604 i915_request_add(rq);
2607 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2609 i915_request_put(rq);
2614 } while (!__igt_timeout(end_time, NULL));
2619 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2621 p->time = ktime_sub(now, p->time);
2623 p->time = ktime_sub(ktime_get(), p->time);
2626 err = switch_to_kernel_sync(ce, err);
2627 p->runtime = intel_context_get_total_runtime_ns(ce);
2630 intel_context_unpin(ce);
2631 intel_context_put(ce);
2635 static int p_sync1(void *arg)
2637 struct perf_stats *p = arg;
2638 struct intel_engine_cs *engine = p->engine;
2639 struct i915_request *prev = NULL;
2640 struct intel_context *ce;
2641 IGT_TIMEOUT(end_time);
2642 unsigned long count;
2646 ce = intel_context_create(engine);
2650 err = intel_context_pin(ce);
2652 intel_context_put(ce);
2656 if (intel_engine_supports_stats(engine)) {
2657 p->busy = intel_engine_get_busy_time(engine, &p->time);
2660 p->time = ktime_get();
2666 struct i915_request *rq;
2668 rq = i915_request_create(ce);
2674 i915_request_get(rq);
2675 i915_request_add(rq);
2678 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2680 i915_request_put(prev);
2686 } while (!__igt_timeout(end_time, NULL));
2687 i915_request_put(prev);
2692 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2694 p->time = ktime_sub(now, p->time);
2696 p->time = ktime_sub(ktime_get(), p->time);
2699 err = switch_to_kernel_sync(ce, err);
2700 p->runtime = intel_context_get_total_runtime_ns(ce);
2703 intel_context_unpin(ce);
2704 intel_context_put(ce);
2708 static int p_many(void *arg)
2710 struct perf_stats *p = arg;
2711 struct intel_engine_cs *engine = p->engine;
2712 struct intel_context *ce;
2713 IGT_TIMEOUT(end_time);
2714 unsigned long count;
2718 ce = intel_context_create(engine);
2722 err = intel_context_pin(ce);
2724 intel_context_put(ce);
2728 if (intel_engine_supports_stats(engine)) {
2729 p->busy = intel_engine_get_busy_time(engine, &p->time);
2732 p->time = ktime_get();
2738 struct i915_request *rq;
2740 rq = i915_request_create(ce);
2746 i915_request_add(rq);
2748 } while (!__igt_timeout(end_time, NULL));
2753 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2755 p->time = ktime_sub(now, p->time);
2757 p->time = ktime_sub(ktime_get(), p->time);
2760 err = switch_to_kernel_sync(ce, err);
2761 p->runtime = intel_context_get_total_runtime_ns(ce);
2764 intel_context_unpin(ce);
2765 intel_context_put(ce);
2769 static int perf_parallel_engines(void *arg)
2771 struct drm_i915_private *i915 = arg;
2772 static int (* const func[])(void *arg) = {
2778 const unsigned int nengines = num_uabi_engines(i915);
2779 struct intel_engine_cs *engine;
2780 int (* const *fn)(void *arg);
2781 struct pm_qos_request qos;
2783 struct perf_stats p;
2784 struct task_struct *tsk;
2788 engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
2792 cpu_latency_qos_add_request(&qos, 0);
2794 for (fn = func; *fn; fn++) {
2795 char name[KSYM_NAME_LEN];
2796 struct igt_live_test t;
2799 snprintf(name, sizeof(name), "%ps", *fn);
2800 err = igt_live_test_begin(&t, i915, __func__, name);
2804 atomic_set(&i915->selftest.counter, nengines);
2807 for_each_uabi_engine(engine, i915) {
2808 intel_engine_pm_get(engine);
2810 memset(&engines[idx].p, 0, sizeof(engines[idx].p));
2811 engines[idx].p.engine = engine;
2813 engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
2814 "igt:%s", engine->name);
2815 if (IS_ERR(engines[idx].tsk)) {
2816 err = PTR_ERR(engines[idx].tsk);
2817 intel_engine_pm_put(engine);
2820 get_task_struct(engines[idx++].tsk);
2823 yield(); /* start all threads before we kthread_stop() */
2826 for_each_uabi_engine(engine, i915) {
2829 if (IS_ERR(engines[idx].tsk))
2832 status = kthread_stop(engines[idx].tsk);
2836 intel_engine_pm_put(engine);
2837 put_task_struct(engines[idx++].tsk);
2840 if (igt_live_test_end(&t))
2846 for_each_uabi_engine(engine, i915) {
2847 struct perf_stats *p = &engines[idx].p;
2848 u64 busy = 100 * ktime_to_ns(p->busy);
2849 u64 dt = ktime_to_ns(p->time);
2850 int integer, decimal;
2853 integer = div64_u64(busy, dt);
2854 busy -= integer * dt;
2855 decimal = div64_u64(100 * busy, dt);
2861 GEM_BUG_ON(engine != p->engine);
2862 pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2863 name, engine->name, p->count, integer, decimal,
2864 div_u64(p->runtime, 1000 * 1000),
2865 div_u64(ktime_to_ns(p->time), 1000 * 1000));
2870 cpu_latency_qos_remove_request(&qos);
2875 int i915_request_perf_selftests(struct drm_i915_private *i915)
2877 static const struct i915_subtest tests[] = {
2878 SUBTEST(perf_request_latency),
2879 SUBTEST(perf_series_engines),
2880 SUBTEST(perf_parallel_engines),
2883 if (intel_gt_is_wedged(&i915->gt))
2886 return i915_subtests(tests, i915);