drm/i915/guc: Fix outstanding G2H accounting
drivers/gpu/drm/i915/gt/selftest_hangcheck.c
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2016 Intel Corporation
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

10 #include "intel_gt.h"
11 #include "intel_engine_heartbeat.h"
12 #include "intel_engine_pm.h"
13 #include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"
#include "selftests/igt_spinner.h"
#include "selftests/intel_scheduler_helpers.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
        struct intel_gt *gt;
        struct drm_i915_gem_object *hws;
        struct drm_i915_gem_object *obj;
        struct i915_gem_context *ctx;
        u32 *seqno;
        u32 *batch;
};

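/*
 * Set up the shared state for building hanging batches: a kernel context,
 * a page for the batch itself and a page of seqno slots (the "HWS") that
 * each hanging request writes to signal it has started executing.
 */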
static int hang_init(struct hang *h, struct intel_gt *gt)
{
        void *vaddr;
        int err;

        memset(h, 0, sizeof(*h));
        h->gt = gt;

        h->ctx = kernel_context(gt->i915, NULL);
        if (IS_ERR(h->ctx))
                return PTR_ERR(h->ctx);

        GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

        h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
        if (IS_ERR(h->hws)) {
                err = PTR_ERR(h->hws);
                goto err_ctx;
        }

        h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
        if (IS_ERR(h->obj)) {
                err = PTR_ERR(h->obj);
                goto err_hws;
        }

        i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
        vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_obj;
        }
        h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

        vaddr = i915_gem_object_pin_map_unlocked(h->obj,
                                                 i915_coherent_map_type(gt->i915, h->obj, false));
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_unpin_hws;
        }
        h->batch = vaddr;

        return 0;

err_unpin_hws:
        i915_gem_object_unpin_map(h->hws);
err_obj:
        i915_gem_object_put(h->obj);
err_hws:
        i915_gem_object_put(h->hws);
err_ctx:
        kernel_context_close(h->ctx);
        return err;
}

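/*
 * Each request writes its seqno to a per-fence-context slot in the HWS
 * page; hws_seqno() below reads the same slot, wrapped to the page size.
 */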
static u64 hws_address(const struct i915_vma *hws,
                       const struct i915_request *rq)
{
        return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
                          struct i915_request *rq,
                          unsigned int flags)
{
        int err;

        i915_vma_lock(vma);
        err = i915_request_await_object(rq, vma->obj,
                                        flags & EXEC_OBJECT_WRITE);
        if (err == 0)
                err = i915_vma_move_to_active(vma, rq, flags);
        i915_vma_unlock(vma);

        return err;
}

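/*
 * Build a request whose batch spins forever: it stores the request's
 * seqno to the HWS (so wait_until_running() can see it start) and then
 * jumps back to its own start with MI_BATCH_BUFFER_START. The loop is
 * only broken when a test overwrites it with MI_BATCH_BUFFER_END.
 */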
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
        struct intel_gt *gt = h->gt;
        struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
        struct drm_i915_gem_object *obj;
        struct i915_request *rq = NULL;
        struct i915_vma *hws, *vma;
        unsigned int flags;
        void *vaddr;
        u32 *batch;
        int err;

        obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
        if (IS_ERR(obj)) {
                i915_vm_put(vm);
                return ERR_CAST(obj);
        }

        vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
        if (IS_ERR(vaddr)) {
                i915_gem_object_put(obj);
                i915_vm_put(vm);
                return ERR_CAST(vaddr);
        }

        i915_gem_object_unpin_map(h->obj);
        i915_gem_object_put(h->obj);

        h->obj = obj;
        h->batch = vaddr;

        vma = i915_vma_instance(h->obj, vm, NULL);
        if (IS_ERR(vma)) {
                i915_vm_put(vm);
                return ERR_CAST(vma);
        }

        hws = i915_vma_instance(h->hws, vm, NULL);
        if (IS_ERR(hws)) {
                i915_vm_put(vm);
                return ERR_CAST(hws);
        }

        err = i915_vma_pin(vma, 0, 0, PIN_USER);
        if (err) {
                i915_vm_put(vm);
                return ERR_PTR(err);
        }

        err = i915_vma_pin(hws, 0, 0, PIN_USER);
        if (err)
                goto unpin_vma;

        rq = igt_request_alloc(h->ctx, engine);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto unpin_hws;
        }

        err = move_to_active(vma, rq, 0);
        if (err)
                goto cancel_rq;

        err = move_to_active(hws, rq, 0);
        if (err)
                goto cancel_rq;

        batch = h->batch;
        if (GRAPHICS_VER(gt->i915) >= 8) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = upper_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
                *batch++ = lower_32_bits(vma->node.start);
                *batch++ = upper_32_bits(vma->node.start);
        } else if (GRAPHICS_VER(gt->i915) >= 6) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
                *batch++ = lower_32_bits(vma->node.start);
        } else if (GRAPHICS_VER(gt->i915) >= 4) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
                *batch++ = lower_32_bits(vma->node.start);
        } else {
                *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
                *batch++ = lower_32_bits(vma->node.start);
        }
        *batch++ = MI_BATCH_BUFFER_END; /* not reached */
        intel_gt_chipset_flush(engine->gt);

        if (rq->engine->emit_init_breadcrumb) {
                err = rq->engine->emit_init_breadcrumb(rq);
                if (err)
                        goto cancel_rq;
        }

        flags = 0;
        if (GRAPHICS_VER(gt->i915) <= 5)
                flags |= I915_DISPATCH_SECURE;

        err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
        if (err) {
                i915_request_set_error_once(rq, err);
                i915_request_add(rq);
        }
unpin_hws:
        i915_vma_unpin(hws);
unpin_vma:
        i915_vma_unpin(vma);
        i915_vm_put(vm);
        return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
        return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

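/*
 * Terminate any still-spinning batch by overwriting the loop with
 * MI_BATCH_BUFFER_END before releasing the objects and the context.
 */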
static void hang_fini(struct hang *h)
{
        *h->batch = MI_BATCH_BUFFER_END;
        intel_gt_chipset_flush(h->gt);

        i915_gem_object_unpin_map(h->obj);
        i915_gem_object_put(h->obj);

        i915_gem_object_unpin_map(h->hws);
        i915_gem_object_put(h->hws);

        kernel_context_close(h->ctx);

        igt_flush_test(h->gt->i915);
}

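/*
 * Poll the request's HWS slot for its seqno: a short busy-wait first,
 * then sleep for up to a second waiting for the spinner to start.
 */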
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
        return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
                                               rq->fence.seqno),
                             10) &&
                 wait_for(i915_seqno_passed(hws_seqno(h, rq),
                                            rq->fence.seqno),
                          1000));
}

static int igt_hang_sanitycheck(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_request *rq;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err;

        /* Basic check that we can execute our hanging batch */

        err = hang_init(&h, gt);
        if (err)
                return err;

        for_each_engine(engine, gt, id) {
                struct intel_wedge_me w;
                long timeout;

                if (!intel_engine_can_store_dword(engine))
                        continue;

                rq = hang_create_request(&h, engine);
                if (IS_ERR(rq)) {
                        err = PTR_ERR(rq);
                        pr_err("Failed to create request for %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }

                i915_request_get(rq);

                *h.batch = MI_BATCH_BUFFER_END;
                intel_gt_chipset_flush(engine->gt);

                i915_request_add(rq);

                timeout = 0;
                intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
                        timeout = i915_request_wait(rq, 0,
                                                    MAX_SCHEDULE_TIMEOUT);
                if (intel_gt_is_wedged(gt))
                        timeout = -EIO;

                i915_request_put(rq);

                if (timeout < 0) {
                        err = timeout;
                        pr_err("Wait for request failed on %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }
        }

fini:
        hang_fini(&h);
        return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
        return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine;
        unsigned int reset_count, count;
        enum intel_engine_id id;
        IGT_TIMEOUT(end_time);
        int err = 0;

        /* Check that we can reset during non-user portions of requests */

        reset_count = i915_reset_count(global);
        count = 0;
        do {
                for_each_engine(engine, gt, id) {
                        struct intel_context *ce;
                        int i;

                        ce = intel_context_create(engine);
                        if (IS_ERR(ce)) {
                                err = PTR_ERR(ce);
                                pr_err("[%s] Create context failed: %d!\n", engine->name, err);
                                break;
                        }

                        for (i = 0; i < 16; i++) {
                                struct i915_request *rq;

                                rq = intel_context_create_request(ce);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        pr_err("[%s] Create request failed: %d!\n",
                                               engine->name, err);
                                        break;
                                }

                                i915_request_add(rq);
                        }

                        intel_context_put(ce);
                }

                igt_global_reset_lock(gt);
                intel_gt_reset(gt, ALL_ENGINES, NULL);
                igt_global_reset_unlock(gt);

                if (intel_gt_is_wedged(gt)) {
                        pr_err("[%s] GT is wedged!\n", engine->name);
                        err = -EIO;
                        break;
                }

                if (i915_reset_count(global) != reset_count + ++count) {
                        pr_err("[%s] Reset not recorded: %d vs %d + %d!\n",
                               engine->name, i915_reset_count(global), reset_count, count);
                        err = -EINVAL;
                        break;
                }

                err = igt_flush_test(gt->i915);
                if (err) {
                        pr_err("[%s] Flush failed: %d!\n", engine->name, err);
                        break;
                }
        } while (time_before(jiffies, end_time));
        pr_info("%s: %d resets\n", __func__, count);

        if (igt_flush_test(gt->i915)) {
                pr_err("Post flush failed: %d!\n", err);
                err = -EIO;
        }

        return err;
}

static int igt_reset_nop_engine(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        /* Check that we can engine-reset during non-user portions */

        if (!intel_has_reset_engine(gt))
                return 0;

        for_each_engine(engine, gt, id) {
                unsigned int reset_count, reset_engine_count, count;
                struct intel_context *ce;
                IGT_TIMEOUT(end_time);
                int err;

                if (intel_engine_uses_guc(engine)) {
                        /* Engine level resets are triggered by GuC when a hang
                         * is detected. They can't be triggered by the KMD any
                         * more. Thus a nop batch cannot be used as a reset test
                         */
                        continue;
                }

                ce = intel_context_create(engine);
                if (IS_ERR(ce)) {
                        pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
                        return PTR_ERR(ce);
                }

                reset_count = i915_reset_count(global);
                reset_engine_count = i915_reset_engine_count(global, engine);
                count = 0;

                st_engine_heartbeat_disable(engine);
                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                do {
                        int i;

                        if (!wait_for_idle(engine)) {
                                pr_err("%s failed to idle before reset\n",
                                       engine->name);
                                err = -EIO;
                                break;
                        }

                        for (i = 0; i < 16; i++) {
                                struct i915_request *rq;

                                rq = intel_context_create_request(ce);
                                if (IS_ERR(rq)) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);
                                        intel_engine_dump(engine, &p,
                                                          "%s(%s): failed to submit request\n",
                                                          __func__,
                                                          engine->name);

                                        GEM_TRACE("%s(%s): failed to submit request\n",
                                                  __func__,
                                                  engine->name);
                                        GEM_TRACE_DUMP();

                                        intel_gt_set_wedged(gt);

                                        err = PTR_ERR(rq);
                                        break;
                                }

                                i915_request_add(rq);
                        }
                        err = intel_engine_reset(engine, NULL);
                        if (err) {
                                pr_err("intel_engine_reset(%s) failed, err:%d\n",
                                       engine->name, err);
                                break;
                        }

                        if (i915_reset_count(global) != reset_count) {
                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
                                err = -EINVAL;
                                break;
                        }

                        if (i915_reset_engine_count(global, engine) !=
                            reset_engine_count + ++count) {
                                pr_err("%s engine reset not recorded!\n",
                                       engine->name);
                                err = -EINVAL;
                                break;
                        }
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);

                pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

                intel_context_put(ce);
                if (igt_flush_test(gt->i915))
                        err = -EIO;
                if (err)
                        return err;
        }

        return 0;
}

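/*
 * Tweak the selftest-only fault-injection attributes so that subsequent
 * engine resets report -ETIMEDOUT (timeouts are only generated on gen8+);
 * cancel_reset_timeout() restores normal behaviour.
 */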
static void force_reset_timeout(struct intel_engine_cs *engine)
{
        engine->reset_timeout.probability = 999;
        atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
        memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
        struct intel_gt *gt = arg;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        /* Check that we can recover from engine-reset failures */

        if (!intel_has_reset_engine(gt))
                return 0;

        for_each_engine(engine, gt, id) {
                unsigned int count;
                struct intel_context *ce;
                IGT_TIMEOUT(end_time);
                int err;

                /* Can't manually break the reset if i915 doesn't perform it */
                if (intel_engine_uses_guc(engine))
                        continue;

                ce = intel_context_create(engine);
                if (IS_ERR(ce)) {
                        pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
                        return PTR_ERR(ce);
                }

                st_engine_heartbeat_disable(engine);
                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);

                force_reset_timeout(engine);
                err = intel_engine_reset(engine, NULL);
                cancel_reset_timeout(engine);
                if (err == 0) /* timeouts only generated on gen8+ */
                        goto skip;

                count = 0;
                do {
                        struct i915_request *last = NULL;
                        int i;

                        if (!wait_for_idle(engine)) {
                                pr_err("%s failed to idle before reset\n",
                                       engine->name);
                                err = -EIO;
                                break;
                        }

                        for (i = 0; i < count % 15; i++) {
                                struct i915_request *rq;

                                rq = intel_context_create_request(ce);
                                if (IS_ERR(rq)) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);
                                        intel_engine_dump(engine, &p,
                                                          "%s(%s): failed to submit request\n",
                                                          __func__,
                                                          engine->name);

                                        GEM_TRACE("%s(%s): failed to submit request\n",
                                                  __func__,
                                                  engine->name);
                                        GEM_TRACE_DUMP();

                                        intel_gt_set_wedged(gt);
                                        if (last)
                                                i915_request_put(last);

                                        err = PTR_ERR(rq);
                                        goto out;
                                }

                                if (last)
                                        i915_request_put(last);
                                last = i915_request_get(rq);
                                i915_request_add(rq);
                        }

                        if (count & 1) {
                                err = intel_engine_reset(engine, NULL);
                                if (err) {
                                        GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
                                                      engine->name, err);
                                        GEM_TRACE_DUMP();
                                        i915_request_put(last);
                                        break;
                                }
                        } else {
                                force_reset_timeout(engine);
                                err = intel_engine_reset(engine, NULL);
                                cancel_reset_timeout(engine);
                                if (err != -ETIMEDOUT) {
                                        pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
                                               engine->name, err);
                                        i915_request_put(last);
                                        break;
                                }
                        }

                        err = 0;
                        if (last) {
                                if (i915_request_wait(last, 0, HZ / 2) < 0) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);

                                        intel_engine_dump(engine, &p,
                                                          "%s(%s): failed to complete request\n",
                                                          __func__,
                                                          engine->name);

                                        GEM_TRACE("%s(%s): failed to complete request\n",
                                                  __func__,
                                                  engine->name);
                                        GEM_TRACE_DUMP();

                                        err = -EIO;
                                }
                                i915_request_put(last);
                        }
                        count++;
                } while (err == 0 && time_before(jiffies, end_time));
out:
                pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);
                intel_context_put(ce);

                if (igt_flush_test(gt->i915))
                        err = -EIO;
                if (err)
                        return err;
        }

        return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err = 0;

        /* Check that we can issue an engine reset on an idle engine (no-op) */

        if (!intel_has_reset_engine(gt))
                return 0;

        if (active) {
                err = hang_init(&h, gt);
                if (err)
                        return err;
        }

        for_each_engine(engine, gt, id) {
                unsigned int reset_count, reset_engine_count;
                unsigned long count;
                bool using_guc = intel_engine_uses_guc(engine);
                IGT_TIMEOUT(end_time);

                if (using_guc && !active)
                        continue;

                if (active && !intel_engine_can_store_dword(engine))
                        continue;

                if (!wait_for_idle(engine)) {
                        pr_err("%s failed to idle before reset\n",
                               engine->name);
                        err = -EIO;
                        break;
                }

                reset_count = i915_reset_count(global);
                reset_engine_count = i915_reset_engine_count(global, engine);

                st_engine_heartbeat_disable(engine);
                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                count = 0;
                do {
                        struct i915_request *rq = NULL;
                        struct intel_selftest_saved_policy saved;
                        int err2;

                        err = intel_selftest_modify_policy(engine, &saved,
                                                           SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
                        if (err) {
                                pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
                                break;
                        }

                        if (active) {
                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        pr_err("[%s] Create hang request failed: %d!\n",
                                               engine->name, err);
                                        goto restore;
                                }

                                i915_request_get(rq);
                                i915_request_add(rq);

                                if (!wait_until_running(&h, rq)) {
                                        struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                                        pr_err("%s: Failed to start request %llx, at %x\n",
                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);

                                        i915_request_put(rq);
                                        err = -EIO;
                                        goto restore;
                                }
                        }

                        if (!using_guc) {
                                err = intel_engine_reset(engine, NULL);
                                if (err) {
                                        pr_err("intel_engine_reset(%s) failed, err:%d\n",
                                               engine->name, err);
                                        goto skip;
                                }
                        }

                        if (rq) {
                                /* Ensure the reset happens and kills the engine */
                                err = intel_selftest_wait_for_rq(rq);
                                if (err)
                                        pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
                                               engine->name, rq->fence.context,
                                               rq->fence.seqno, rq->context->guc_id, err);
                        }

skip:
                        if (rq)
                                i915_request_put(rq);

                        if (i915_reset_count(global) != reset_count) {
                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
                                err = -EINVAL;
                                goto restore;
                        }

                        /* GuC based resets are not logged per engine */
                        if (!using_guc) {
                                if (i915_reset_engine_count(global, engine) !=
                                    ++reset_engine_count) {
                                        pr_err("%s engine reset not recorded!\n",
                                               engine->name);
                                        err = -EINVAL;
                                        goto restore;
                                }
                        }

                        count++;

restore:
                        err2 = intel_selftest_restore_policy(engine, &saved);
                        if (err2)
                                pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
                        if (err == 0)
                                err = err2;
                        if (err)
                                break;
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);
                pr_info("%s: Completed %lu %s resets\n",
                        engine->name, count, active ? "active" : "idle");

                if (err)
                        break;

                err = igt_flush_test(gt->i915);
                if (err) {
                        pr_err("[%s] Flush failed: %d!\n", engine->name, err);
                        break;
                }
        }

        if (intel_gt_is_wedged(gt)) {
                pr_err("GT is wedged!\n");
                err = -EIO;
        }

        if (active)
                hang_fini(&h);

        return err;
}

static int igt_reset_idle_engine(void *arg)
{
        return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
        return __igt_reset_engine(arg, true);
}

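/*
 * Per-engine worker state for the background kthreads that keep other
 * engines busy with a stream of requests while one engine is reset.
 */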
struct active_engine {
        struct task_struct *task;
        struct intel_engine_cs *engine;
        unsigned long resets;
        unsigned int flags;
};

#define TEST_ACTIVE     BIT(0)
#define TEST_OTHERS     BIT(1)
#define TEST_SELF       BIT(2)
#define TEST_PRIORITY   BIT(3)

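/*
 * Wait for a background request to complete; if it fails to finish
 * within 10s, declare the GT wedged so the whole test aborts cleanly.
 */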
static int active_request_put(struct i915_request *rq)
{
        int err = 0;

        if (!rq)
                return 0;

        if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
                GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
                          rq->engine->name,
                          rq->fence.context,
                          rq->fence.seqno);
                GEM_TRACE_DUMP();

                intel_gt_set_wedged(rq->engine->gt);
                err = -EIO;
        }

        i915_request_put(rq);

        return err;
}

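/*
 * kthread body: cycle through a small ring of contexts, keeping up to
 * ARRAY_SIZE(rq) requests in flight and optionally assigning each one a
 * random priority, until the parent test stops the thread.
 */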
static int active_engine(void *data)
{
        I915_RND_STATE(prng);
        struct active_engine *arg = data;
        struct intel_engine_cs *engine = arg->engine;
        struct i915_request *rq[8] = {};
        struct intel_context *ce[ARRAY_SIZE(rq)];
        unsigned long count;
        int err = 0;

        for (count = 0; count < ARRAY_SIZE(ce); count++) {
                ce[count] = intel_context_create(engine);
                if (IS_ERR(ce[count])) {
                        err = PTR_ERR(ce[count]);
                        pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err);
                        while (count--)
                                intel_context_put(ce[count]);
                        return err;
                }
        }

        count = 0;
        while (!kthread_should_stop()) {
                unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
                struct i915_request *old = rq[idx];
                struct i915_request *new;

                new = intel_context_create_request(ce[idx]);
                if (IS_ERR(new)) {
                        err = PTR_ERR(new);
                        pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
                        break;
                }

                rq[idx] = i915_request_get(new);
                i915_request_add(new);

                if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
                        struct i915_sched_attr attr = {
                                .priority =
                                        i915_prandom_u32_max_state(512, &prng),
                        };
                        engine->sched_engine->schedule(rq[idx], &attr);
                }

                err = active_request_put(old);
                if (err) {
                        pr_err("[%s] Request put failed: %d!\n", engine->name, err);
                        break;
                }

                cond_resched();
        }

        for (count = 0; count < ARRAY_SIZE(rq); count++) {
                int err__ = active_request_put(rq[count]);

                if (err__)
                        pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err__);

                /* Keep the first error */
                if (!err)
                        err = err__;

                intel_context_put(ce[count]);
        }

        return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
                               const char *test_name,
                               unsigned int flags)
{
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine, *other;
        enum intel_engine_id id, tmp;
        struct hang h;
        int err = 0;

        /* Check that issuing a reset on one engine does not interfere
         * with any other engine.
         */

        if (!intel_has_reset_engine(gt))
                return 0;

        if (flags & TEST_ACTIVE) {
                err = hang_init(&h, gt);
                if (err)
                        return err;

                if (flags & TEST_PRIORITY)
                        h.ctx->sched.priority = 1024;
        }

        for_each_engine(engine, gt, id) {
                struct active_engine threads[I915_NUM_ENGINES] = {};
                unsigned long device = i915_reset_count(global);
                unsigned long count = 0, reported;
                bool using_guc = intel_engine_uses_guc(engine);
                IGT_TIMEOUT(end_time);

                if (flags & TEST_ACTIVE) {
                        if (!intel_engine_can_store_dword(engine))
                                continue;
                } else if (using_guc)
                        continue;

                if (!wait_for_idle(engine)) {
                        pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
                               engine->name, test_name);
                        err = -EIO;
                        break;
                }

                memset(threads, 0, sizeof(threads));
                for_each_engine(other, gt, tmp) {
                        struct task_struct *tsk;

                        threads[tmp].resets =
                                i915_reset_engine_count(global, other);

                        if (other == engine && !(flags & TEST_SELF))
                                continue;

                        if (other != engine && !(flags & TEST_OTHERS))
                                continue;

                        threads[tmp].engine = other;
                        threads[tmp].flags = flags;

                        tsk = kthread_run(active_engine, &threads[tmp],
                                          "igt/%s", other->name);
                        if (IS_ERR(tsk)) {
                                err = PTR_ERR(tsk);
                                pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
                                goto unwind;
                        }

                        threads[tmp].task = tsk;
                        get_task_struct(tsk);
                }

                yield(); /* start all threads before we begin */

                st_engine_heartbeat_disable_no_pm(engine);
                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                do {
                        struct i915_request *rq = NULL;
                        struct intel_selftest_saved_policy saved;
                        int err2;

                        err = intel_selftest_modify_policy(engine, &saved,
                                                           SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
                        if (err) {
                                pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
                                break;
                        }

                        if (flags & TEST_ACTIVE) {
                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        pr_err("[%s] Create hang request failed: %d!\n",
                                               engine->name, err);
                                        goto restore;
                                }

                                i915_request_get(rq);
                                i915_request_add(rq);

                                if (!wait_until_running(&h, rq)) {
                                        struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                                        pr_err("%s: Failed to start request %llx, at %x\n",
                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);

                                        i915_request_put(rq);
                                        err = -EIO;
                                        goto restore;
                                }
                        } else {
                                intel_engine_pm_get(engine);
                        }

                        if (!using_guc) {
                                err = intel_engine_reset(engine, NULL);
                                if (err) {
                                        pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
                                               engine->name, test_name, err);
                                        goto restore;
                                }
                        }

                        if (rq) {
                                /* Ensure the reset happens and kills the engine */
                                err = intel_selftest_wait_for_rq(rq);
                                if (err)
                                        pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
                                               engine->name, rq->fence.context,
                                               rq->fence.seqno, rq->context->guc_id, err);
                        }

                        count++;

                        if (rq) {
                                if (rq->fence.error != -EIO) {
                                        pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
                                               engine->name, test_name,
                                               rq->fence.context,
                                               rq->fence.seqno, rq->context->guc_id);
                                        i915_request_put(rq);

                                        GEM_TRACE_DUMP();
                                        intel_gt_set_wedged(gt);
                                        err = -EIO;
                                        goto restore;
                                }

                                if (i915_request_wait(rq, 0, HZ / 5) < 0) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);

                                        pr_err("i915_reset_engine(%s:%s): failed to complete request %llx:%lld after reset\n",
                                               engine->name, test_name,
                                               rq->fence.context,
                                               rq->fence.seqno);
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);
                                        i915_request_put(rq);

                                        GEM_TRACE_DUMP();
                                        intel_gt_set_wedged(gt);
                                        err = -EIO;
                                        goto restore;
                                }

                                i915_request_put(rq);
                        }

                        if (!(flags & TEST_ACTIVE))
                                intel_engine_pm_put(engine);

                        if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
                                struct drm_printer p =
                                        drm_info_printer(gt->i915->drm.dev);

                                pr_err("i915_reset_engine(%s:%s): failed to idle after reset\n",
                                       engine->name, test_name);
                                intel_engine_dump(engine, &p,
                                                  "%s\n", engine->name);

                                err = -EIO;
                                goto restore;
                        }

restore:
                        err2 = intel_selftest_restore_policy(engine, &saved);
                        if (err2)
                                pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
                        if (err == 0)
                                err = err2;
                        if (err)
                                break;
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable_no_pm(engine);

                pr_info("i915_reset_engine(%s:%s): %lu resets\n",
                        engine->name, test_name, count);

                /* GuC based resets are not logged per engine */
                if (!using_guc) {
                        reported = i915_reset_engine_count(global, engine);
                        reported -= threads[engine->id].resets;
                        if (reported != count) {
                                pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
                                       engine->name, test_name, count, reported);
                                if (!err)
                                        err = -EINVAL;
                        }
                }

unwind:
                for_each_engine(other, gt, tmp) {
                        int ret;

                        if (!threads[tmp].task)
                                continue;

                        ret = kthread_stop(threads[tmp].task);
                        if (ret) {
                                pr_err("kthread for other engine %s failed, err=%d\n",
                                       other->name, ret);
                                if (!err)
                                        err = ret;
                        }
                        put_task_struct(threads[tmp].task);

                        /* GuC based resets are not logged per engine */
                        if (!using_guc) {
                                if (other->uabi_class != engine->uabi_class &&
                                    threads[tmp].resets !=
                                    i915_reset_engine_count(global, other)) {
                                        pr_err("Innocent engine %s was reset (count=%ld)\n",
                                               other->name,
                                               i915_reset_engine_count(global, other) -
                                               threads[tmp].resets);
                                        if (!err)
                                                err = -EINVAL;
                                }
                        }
                }

                if (device != i915_reset_count(global)) {
                        pr_err("Global reset (count=%ld)!\n",
                               i915_reset_count(global) - device);
                        if (!err)
                                err = -EINVAL;
                }

                if (err)
                        break;

                err = igt_flush_test(gt->i915);
                if (err) {
                        pr_err("[%s] Flush failed: %d!\n", engine->name, err);
                        break;
                }
        }

        if (intel_gt_is_wedged(gt))
                err = -EIO;

        if (flags & TEST_ACTIVE)
                hang_fini(&h);

        return err;
}

static int igt_reset_engines(void *arg)
{
        static const struct {
                const char *name;
                unsigned int flags;
        } phases[] = {
                { "idle", 0 },
                { "active", TEST_ACTIVE },
                { "others-idle", TEST_OTHERS },
                { "others-active", TEST_OTHERS | TEST_ACTIVE },
                {
                        "others-priority",
                        TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
                },
                {
                        "self-priority",
                        TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
                },
                { }
        };
        struct intel_gt *gt = arg;
        typeof(*phases) *p;
        int err;

        for (p = phases; p->name; p++) {
                if (p->flags & TEST_PRIORITY) {
                        if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
                                continue;
                }

                err = __igt_reset_engines(arg, p->name, p->flags);
                if (err)
                        return err;
        }

        return 0;
}

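/*
 * Trigger a full GT reset directly, returning the global reset count
 * sampled beforehand so callers can check that a new reset was logged.
 */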
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
        u32 count = i915_reset_count(&gt->i915->gpu_error);

        intel_gt_reset(gt, mask, NULL);

        return count;
}

static int igt_reset_wait(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine = gt->engine[RCS0];
        struct i915_request *rq;
        unsigned int reset_count;
        struct hang h;
        long timeout;
        int err;

        if (!engine || !intel_engine_can_store_dword(engine))
                return 0;

        /* Check that we detect a stuck waiter and issue a reset */

        igt_global_reset_lock(gt);

        err = hang_init(&h, gt);
        if (err) {
                pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
                goto unlock;
        }

        rq = hang_create_request(&h, engine);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
                goto fini;
        }

        i915_request_get(rq);
        i915_request_add(rq);

        if (!wait_until_running(&h, rq)) {
                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                pr_err("%s: Failed to start request %llx, at %x\n",
                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

                intel_gt_set_wedged(gt);

                err = -EIO;
                goto out_rq;
        }

        reset_count = fake_hangcheck(gt, ALL_ENGINES);

        timeout = i915_request_wait(rq, 0, 10);
        if (timeout < 0) {
                pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
                       timeout);
                err = timeout;
                goto out_rq;
        }

        if (i915_reset_count(global) == reset_count) {
                pr_err("No GPU reset recorded!\n");
                err = -EINVAL;
                goto out_rq;
        }

out_rq:
        i915_request_put(rq);
fini:
        hang_fini(&h);
unlock:
        igt_global_reset_unlock(gt);

        if (intel_gt_is_wedged(gt))
                return -EIO;

        return err;
}

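/*
 * Handoff state for the eviction workers: the vma under test plus a
 * completion signalled once the worker thread has started running.
 */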
struct evict_vma {
        struct completion completion;
        struct i915_vma *vma;
};

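/*
 * Worker: try to evict the target node while its vma is still busy on
 * the GPU; the unbind must block on the hanging request until a reset
 * (or wedging) releases it.
 */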
static int evict_vma(void *data)
{
        struct evict_vma *arg = data;
        struct i915_address_space *vm = arg->vma->vm;
        struct drm_mm_node evict = arg->vma->node;
        int err;

        complete(&arg->completion);

        mutex_lock(&vm->mutex);
        err = i915_gem_evict_for_node(vm, &evict, 0);
        mutex_unlock(&vm->mutex);

        return err;
}

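/*
 * Worker: as evict_vma, but stress the fence-register path instead by
 * re-tiling the object and re-acquiring its fence, which must wait for
 * the hanging request before the mmio update can proceed.
 */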
static int evict_fence(void *data)
{
        struct evict_vma *arg = data;
        int err;

        complete(&arg->completion);

        /* Mark the fence register as dirty to force the mmio update. */
        err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
        if (err) {
                pr_err("Invalid Y-tiling settings; err:%d\n", err);
                return err;
        }

        err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
        if (err) {
                pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
                return err;
        }

        err = i915_vma_pin_fence(arg->vma);
        i915_vma_unpin(arg->vma);
        if (err) {
                pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
                return err;
        }

        i915_vma_unpin_fence(arg->vma);

        return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		pr_err("[%s] Create object failed: %d!\n", engine->name, err);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0) {
		err = i915_vma_move_to_active(arg.vma, rq, flags);
		if (err)
			pr_err("[%s] Move to active failed: %d!\n", engine->name, err);
	} else {
		pr_err("[%s] Request await failed: %d!\n", engine->name, err);
	}

	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

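/* Instantiations of the above: global GTT, full ppGTT and fence register. */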
static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

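/* Flush every engine other than @exclude to idle. */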
static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

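/*
 * Repeatedly queue a second request behind a hang and reset: the guilty
 * request must be marked with -EIO while the innocent follow-up is
 * replayed with its fence error untouched.
 */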
static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct intel_selftest_saved_policy saved;
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;
		bool using_guc = intel_engine_uses_guc(engine);

		if (!intel_engine_can_store_dword(engine))
			continue;

		if (using_guc) {
			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				goto fini;
			}
		}

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
			goto restore;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
				goto restore;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto restore;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto restore;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d queued resets\n",
			engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

restore:
		if (using_guc) {
			int err2 = intel_selftest_restore_policy(engine, &saved);

			if (err2)
				pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
				       __func__, __LINE__, engine->name, err2);
			if (err == 0)
				err = err2;
		}
		if (err)
			goto fini;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

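/*
 * Drive intel_gt_handle_error() directly, with error capture temporarily
 * disabled, and check that the guilty request is singled out with -EIO.
 */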
static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

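/*
 * Perform an engine reset from within the atomic section @p, with the
 * submission tasklet suppressed so that it cannot run concurrently with
 * the reset itself.
 */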
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->sched_engine->tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	if (t->func)
		tasklet_disable(t);
	if (strcmp(p->name, "softirq"))
		local_bh_disable();
	p->critical_section_begin();

	err = __intel_engine_reset_bh(engine, NULL);

	p->critical_section_end();
	if (strcmp(p->name, "softirq"))
		local_bh_enable();
	if (t->func) {
		tasklet_enable(t);
		tasklet_hi_schedule(t);
	}

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

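/* Reset under @p twice: first while idle, then with a hung request. */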
static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

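/*
 * Walk every engine through every atomic phase. Skipped under GuC
 * submission, where engine resets are mediated by the firmware rather
 * than driven synchronously from the host.
 */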
static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

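/*
 * Entry point for the live hangcheck selftests; requires full GPU reset
 * support and holds a runtime pm wakeref for the duration of the subtests.
 */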
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_fail_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}