// SPDX-License-Identifier: MIT
/*
 * Copyright © 2016 Intel Corporation
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
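
/*
 * The hang fixture: a kernel context plus two internal objects, a
 * GPU-visible seqno page (hws) and a batch. hang_create_request() programs
 * the batch to report its request's seqno to the hws page and then spin
 * forever, giving each test a request that reliably "hangs" until it is
 * reset, or until hang_fini() rewrites the spinner to MI_BATCH_BUFFER_END.
 */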
struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};
static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map_unlocked(h->obj,
						 i915_coherent_map_type(gt->i915, h->obj, false));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}
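
/*
 * Requests report their progress at a per-context slot in the hws page;
 * using the fence context as the index keeps concurrent spinners from
 * different contexts out of each other's dword.
 */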
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}
static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;
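
	/*
	 * Assemble the spinner, with per-GRAPHICS_VER opcodes: store the
	 * request's seqno into the hws page, pad with a KiB of zeros, then
	 * jump back to the start of the batch. The batch thus loops until
	 * its first dword is overwritten with MI_BATCH_BUFFER_END (see
	 * hang_fini()) or the engine is reset.
	 */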
	batch = h->batch;
	if (GRAPHICS_VER(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (GRAPHICS_VER(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (GRAPHICS_VER(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);
	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (GRAPHICS_VER(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}
static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}
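
/*
 * The hang is only in effect once the GPU has started executing the
 * spinner, i.e. once the batch has reported the request's seqno in the
 * hws page. Poll for that (a 10us fast path, then up to a second).
 */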
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}
static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}
static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}
static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915))
		err = -EIO;
	return err;
}
static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);

					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}
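
/*
 * Poke the selftest-only reset_timeout fault-injection attributes so that
 * the next engine reset attempt is forced to time out, letting us exercise
 * the reset failure paths on demand.
 */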
static void force_reset_timeout(struct intel_engine_cs *engine)
{
	engine->reset_timeout.probability = 999;
	atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}
static int igt_reset_fail_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can recover from engine-reset failures */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);

		force_reset_timeout(engine);
		err = intel_engine_reset(engine, NULL);
		cancel_reset_timeout(engine);
		if (err == 0) /* timeouts only generated on gen8+ */
			goto skip;

		count = 0;
		do {
			struct i915_request *last = NULL;
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < count % 15; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);
					if (last)
						i915_request_put(last);

					err = PTR_ERR(rq);
					goto out;
				}

				if (last)
					i915_request_put(last);
				last = i915_request_get(rq);
				i915_request_add(rq);
			}

			if (count & 1) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
						      engine->name, err);
					GEM_TRACE_DUMP();
					i915_request_put(last);
					break;
				}
			} else {
				force_reset_timeout(engine);
				err = intel_engine_reset(engine, NULL);
				cancel_reset_timeout(engine);
				if (err != -ETIMEDOUT) {
					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
					       engine->name, err);
					i915_request_put(last);
					break;
				}
			}

			err = 0;
			if (last) {
				if (i915_request_wait(last, 0, HZ / 2) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					intel_engine_dump(engine, &p,
							  "%s(%s): failed to complete request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to complete request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					err = -EIO;
				}
				i915_request_put(last);
			}
			count++;
		} while (err == 0 && time_before(jiffies, end_time));
out:
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		intel_context_put(ce);

		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}
static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned long count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		count = 0;
		do {
			if (active) {
				struct i915_request *rq;

				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}

			count++;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		pr_info("%s: Completed %lu %s resets\n",
			engine->name, count, active ? "active" : "idle");

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}
struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)
static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}
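
/*
 * Thread body for the background load: keep a small ring of requests in
 * flight on one engine (optionally at randomised priorities) so that we
 * can check a reset on the target engine leaves its neighbours untouched.
 */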
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			err = PTR_ERR(ce[count]);
			while (--count)
				intel_context_put(ce[count]);
			return err;
		}
	}

	count = 0;
	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	return err;
}
static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (other == engine && !(flags & TEST_SELF))
				continue;

			if (other != engine && !(flags & TEST_OTHERS))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}
		yield(); /* start all threads before we begin */

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (rq->fence.error != -EIO) {
					pr_err("i915_reset_engine(%s:%s):"
					       " failed to reset request %llx:%lld\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}
			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}
static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}
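
/*
 * Stand-in for hang detection: directly perform the reset that hangcheck
 * would eventually request, returning the prior reset count so callers
 * can verify that it advanced.
 */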
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}
static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}
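
/*
 * For the eviction tests, a helper kthread attempts to evict a vma (or
 * rewrite its fence register) while it is still busy with the spinning
 * batch, and so must block until the hang is reset.
 */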
struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}
static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err)
		return err;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;
	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}
static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}
static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}
static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}
			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d queued resets\n",
			engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}
static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}
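
/*
 * Run an engine reset from within one of the igt_atomic_phases critical
 * sections (see selftests/igt_atomic.h), with the execlists tasklet
 * suspended, to check the reset path is safe to call from atomic context.
 */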
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	if (t->func)
		tasklet_disable(t);
	if (strcmp(p->name, "softirq"))
		local_bh_disable();
	p->critical_section_begin();

	err = __intel_engine_reset_bh(engine, NULL);

	p->critical_section_end();
	if (strcmp(p->name, "softirq"))
		local_bh_enable();
	if (t->func) {
		tasklet_enable(t);
		tasklet_hi_schedule(t);
	}

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}
static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}
static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engine resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_fail_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}