drivers/gpu/drm/i915/gt/selftest_hangcheck.c
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2016 Intel Corporation
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

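/*
 * Shared state for the hang tests: a kernel context, a batch object that
 * spins forever, and a seqno page ("hws") that the batch writes into so
 * we can observe when the spinner has actually started executing.
 */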
struct hang {
        struct intel_gt *gt;
        struct drm_i915_gem_object *hws;
        struct drm_i915_gem_object *obj;
        struct i915_gem_context *ctx;
        u32 *seqno;
        u32 *batch;
};

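/*
 * Allocate the context and the two backing pages (batch + seqno page) and
 * pin CPU-visible maps of both, leaving h->batch and h->seqno pointing at
 * them for later use.
 */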
static int hang_init(struct hang *h, struct intel_gt *gt)
{
        void *vaddr;
        int err;

        memset(h, 0, sizeof(*h));
        h->gt = gt;

        h->ctx = kernel_context(gt->i915);
        if (IS_ERR(h->ctx))
                return PTR_ERR(h->ctx);

        GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

        h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
        if (IS_ERR(h->hws)) {
                err = PTR_ERR(h->hws);
                goto err_ctx;
        }

        h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
        if (IS_ERR(h->obj)) {
                err = PTR_ERR(h->obj);
                goto err_hws;
        }

        i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
        vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_obj;
        }
        h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

        vaddr = i915_gem_object_pin_map_unlocked(h->obj,
                                                 i915_coherent_map_type(gt->i915, h->obj, false));
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_unpin_hws;
        }
        h->batch = vaddr;

        return 0;

err_unpin_hws:
        i915_gem_object_unpin_map(h->hws);
err_obj:
        i915_gem_object_put(h->obj);
err_hws:
        i915_gem_object_put(h->hws);
err_ctx:
        kernel_context_close(h->ctx);
        return err;
}

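/*
 * Each request writes its seqno to a distinct dword in the seqno page,
 * indexed by the request's fence context (wrapped to the page), so
 * concurrent spinners do not clobber each other's slots.
 */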
static u64 hws_address(const struct i915_vma *hws,
                       const struct i915_request *rq)
{
        return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
                          struct i915_request *rq,
                          unsigned int flags)
{
        int err;

        i915_vma_lock(vma);
        err = i915_request_await_object(rq, vma->obj,
                                        flags & EXEC_OBJECT_WRITE);
        if (err == 0)
                err = i915_vma_move_to_active(vma, rq, flags);
        i915_vma_unlock(vma);

        return err;
}

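/*
 * Build a request whose batch spins forever: it stores the request's
 * seqno to the hws page (so wait_until_running() can see it start) and
 * then branches back to the top of the batch. The loop only terminates
 * when someone overwrites it with MI_BATCH_BUFFER_END, or the request is
 * shot down by a reset. A fresh batch object is allocated on each call so
 * a previous emergency termination does not leak into the next hang.
 */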
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
        struct intel_gt *gt = h->gt;
        struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
        struct drm_i915_gem_object *obj;
        struct i915_request *rq = NULL;
        struct i915_vma *hws, *vma;
        unsigned int flags;
        void *vaddr;
        u32 *batch;
        int err;

        obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
        if (IS_ERR(obj)) {
                i915_vm_put(vm);
                return ERR_CAST(obj);
        }

        vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
        if (IS_ERR(vaddr)) {
                i915_gem_object_put(obj);
                i915_vm_put(vm);
                return ERR_CAST(vaddr);
        }

        i915_gem_object_unpin_map(h->obj);
        i915_gem_object_put(h->obj);

        h->obj = obj;
        h->batch = vaddr;

        vma = i915_vma_instance(h->obj, vm, NULL);
        if (IS_ERR(vma)) {
                i915_vm_put(vm);
                return ERR_CAST(vma);
        }

        hws = i915_vma_instance(h->hws, vm, NULL);
        if (IS_ERR(hws)) {
                i915_vm_put(vm);
                return ERR_CAST(hws);
        }

        err = i915_vma_pin(vma, 0, 0, PIN_USER);
        if (err) {
                i915_vm_put(vm);
                return ERR_PTR(err);
        }

        err = i915_vma_pin(hws, 0, 0, PIN_USER);
        if (err)
                goto unpin_vma;

        rq = igt_request_alloc(h->ctx, engine);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto unpin_hws;
        }

        err = move_to_active(vma, rq, 0);
        if (err)
                goto cancel_rq;

        err = move_to_active(hws, rq, 0);
        if (err)
                goto cancel_rq;

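        /*
         * The batch is the same on every generation, modulo instruction
         * encoding: store the seqno to the hws slot, pad with a page of
         * noops, then loop back to the start with MI_BATCH_BUFFER_START.
         * The trailing MI_BATCH_BUFFER_END is never reached unless the
         * loop is overwritten.
         */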
        batch = h->batch;
        if (GRAPHICS_VER(gt->i915) >= 8) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = upper_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
                *batch++ = lower_32_bits(vma->node.start);
                *batch++ = upper_32_bits(vma->node.start);
        } else if (GRAPHICS_VER(gt->i915) >= 6) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
                *batch++ = lower_32_bits(vma->node.start);
        } else if (GRAPHICS_VER(gt->i915) >= 4) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
                *batch++ = lower_32_bits(vma->node.start);
        } else {
                *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
                *batch++ = lower_32_bits(vma->node.start);
        }
        *batch++ = MI_BATCH_BUFFER_END; /* not reached */
        intel_gt_chipset_flush(engine->gt);

        if (rq->engine->emit_init_breadcrumb) {
                err = rq->engine->emit_init_breadcrumb(rq);
                if (err)
                        goto cancel_rq;
        }

        flags = 0;
        if (GRAPHICS_VER(gt->i915) <= 5)
                flags |= I915_DISPATCH_SECURE;

        err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
        if (err) {
                i915_request_set_error_once(rq, err);
                i915_request_add(rq);
        }
unpin_hws:
        i915_vma_unpin(hws);
unpin_vma:
        i915_vma_unpin(vma);
        i915_vm_put(vm);
        return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
        return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

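/*
 * Terminate the spinning batch so the GPU can drain, then release the
 * objects and flush any residual work before the next subtest.
 */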
static void hang_fini(struct hang *h)
{
        *h->batch = MI_BATCH_BUFFER_END;
        intel_gt_chipset_flush(h->gt);

        i915_gem_object_unpin_map(h->obj);
        i915_gem_object_put(h->obj);

        i915_gem_object_unpin_map(h->hws);
        i915_gem_object_put(h->hws);

        kernel_context_close(h->ctx);

        igt_flush_test(h->gt->i915);
}

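/*
 * Poll the hws slot for the request's seqno: first busy-wait for up to
 * 10us, then fall back to a sleeping wait of up to a second. Returns true
 * once the spinner is confirmed to be executing on the GPU.
 */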
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
        return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
                                               rq->fence.seqno),
                             10) &&
                 wait_for(i915_seqno_passed(hws_seqno(h, rq),
                                            rq->fence.seqno),
                          1000));
}

static int igt_hang_sanitycheck(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_request *rq;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err;

        /* Basic check that we can execute our hanging batch */

        err = hang_init(&h, gt);
        if (err)
                return err;

        for_each_engine(engine, gt, id) {
                struct intel_wedge_me w;
                long timeout;

                if (!intel_engine_can_store_dword(engine))
                        continue;

                rq = hang_create_request(&h, engine);
                if (IS_ERR(rq)) {
                        err = PTR_ERR(rq);
                        pr_err("Failed to create request for %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }

                i915_request_get(rq);

                *h.batch = MI_BATCH_BUFFER_END;
                intel_gt_chipset_flush(engine->gt);

                i915_request_add(rq);

                timeout = 0;
                intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
                        timeout = i915_request_wait(rq, 0,
                                                    MAX_SCHEDULE_TIMEOUT);
                if (intel_gt_is_wedged(gt))
                        timeout = -EIO;

                i915_request_put(rq);

                if (timeout < 0) {
                        err = timeout;
                        pr_err("Wait for request failed on %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }
        }

fini:
        hang_fini(&h);
        return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
        return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine;
        unsigned int reset_count, count;
        enum intel_engine_id id;
        IGT_TIMEOUT(end_time);
        int err = 0;

        /* Check that we can reset during non-user portions of requests */

        reset_count = i915_reset_count(global);
        count = 0;
        do {
                for_each_engine(engine, gt, id) {
                        struct intel_context *ce;
                        int i;

                        ce = intel_context_create(engine);
                        if (IS_ERR(ce)) {
                                err = PTR_ERR(ce);
                                break;
                        }

                        for (i = 0; i < 16; i++) {
                                struct i915_request *rq;

                                rq = intel_context_create_request(ce);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        break;
                                }

                                i915_request_add(rq);
                        }

                        intel_context_put(ce);
                }

                igt_global_reset_lock(gt);
                intel_gt_reset(gt, ALL_ENGINES, NULL);
                igt_global_reset_unlock(gt);

                if (intel_gt_is_wedged(gt)) {
                        err = -EIO;
                        break;
                }

                if (i915_reset_count(global) != reset_count + ++count) {
                        pr_err("Full GPU reset not recorded!\n");
                        err = -EINVAL;
                        break;
                }

                err = igt_flush_test(gt->i915);
                if (err)
                        break;
        } while (time_before(jiffies, end_time));
        pr_info("%s: %d resets\n", __func__, count);

        if (igt_flush_test(gt->i915))
                err = -EIO;
        return err;
}

static int igt_reset_nop_engine(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        /* Check that we can engine-reset during non-user portions */

        if (!intel_has_reset_engine(gt))
                return 0;

        for_each_engine(engine, gt, id) {
                unsigned int reset_count, reset_engine_count, count;
                struct intel_context *ce;
                IGT_TIMEOUT(end_time);
                int err;

                ce = intel_context_create(engine);
                if (IS_ERR(ce))
                        return PTR_ERR(ce);

                reset_count = i915_reset_count(global);
                reset_engine_count = i915_reset_engine_count(global, engine);
                count = 0;

                st_engine_heartbeat_disable(engine);
                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                do {
                        int i;

                        if (!wait_for_idle(engine)) {
                                pr_err("%s failed to idle before reset\n",
                                       engine->name);
                                err = -EIO;
                                break;
                        }

                        for (i = 0; i < 16; i++) {
                                struct i915_request *rq;

                                rq = intel_context_create_request(ce);
                                if (IS_ERR(rq)) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);
                                        intel_engine_dump(engine, &p,
                                                          "%s(%s): failed to submit request\n",
                                                          __func__,
                                                          engine->name);

                                        GEM_TRACE("%s(%s): failed to submit request\n",
                                                  __func__,
                                                  engine->name);
                                        GEM_TRACE_DUMP();

                                        intel_gt_set_wedged(gt);

                                        err = PTR_ERR(rq);
                                        break;
                                }

                                i915_request_add(rq);
                        }
                        err = intel_engine_reset(engine, NULL);
                        if (err) {
                                pr_err("intel_engine_reset(%s) failed, err:%d\n",
                                       engine->name, err);
                                break;
                        }

                        if (i915_reset_count(global) != reset_count) {
                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
                                err = -EINVAL;
                                break;
                        }

                        if (i915_reset_engine_count(global, engine) !=
                            reset_engine_count + ++count) {
                                pr_err("%s engine reset not recorded!\n",
                                       engine->name);
                                err = -EINVAL;
                                break;
                        }
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);

                pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

                intel_context_put(ce);
                if (igt_flush_test(gt->i915))
                        err = -EIO;
                if (err)
                        return err;
        }

        return 0;
}

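/*
 * These poke the selftest-only fault-injection state on the engine
 * (probability/times, in the style of should_fail()): force_reset_timeout()
 * makes the next intel_engine_reset() attempt report a timeout, while
 * cancel_reset_timeout() restores normal behaviour.
 */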
static void force_reset_timeout(struct intel_engine_cs *engine)
{
        engine->reset_timeout.probability = 999;
        atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
        memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
        struct intel_gt *gt = arg;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        /* Check that we can recover from engine-reset failures */

        if (!intel_has_reset_engine(gt))
                return 0;

        for_each_engine(engine, gt, id) {
                unsigned int count;
                struct intel_context *ce;
                IGT_TIMEOUT(end_time);
                int err;

                ce = intel_context_create(engine);
                if (IS_ERR(ce))
                        return PTR_ERR(ce);

                st_engine_heartbeat_disable(engine);
                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);

                force_reset_timeout(engine);
                err = intel_engine_reset(engine, NULL);
                cancel_reset_timeout(engine);
                if (err == 0) /* timeouts only generated on gen8+ */
                        goto skip;

                count = 0;
                do {
                        struct i915_request *last = NULL;
                        int i;

                        if (!wait_for_idle(engine)) {
                                pr_err("%s failed to idle before reset\n",
                                       engine->name);
                                err = -EIO;
                                break;
                        }

                        for (i = 0; i < count % 15; i++) {
                                struct i915_request *rq;

                                rq = intel_context_create_request(ce);
                                if (IS_ERR(rq)) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);
                                        intel_engine_dump(engine, &p,
                                                          "%s(%s): failed to submit request\n",
                                                          __func__,
                                                          engine->name);

                                        GEM_TRACE("%s(%s): failed to submit request\n",
                                                  __func__,
                                                  engine->name);
                                        GEM_TRACE_DUMP();

                                        intel_gt_set_wedged(gt);
                                        if (last)
                                                i915_request_put(last);

                                        err = PTR_ERR(rq);
                                        goto out;
                                }

                                if (last)
                                        i915_request_put(last);
                                last = i915_request_get(rq);
                                i915_request_add(rq);
                        }

                        if (count & 1) {
                                err = intel_engine_reset(engine, NULL);
                                if (err) {
                                        GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
                                                      engine->name, err);
                                        GEM_TRACE_DUMP();
                                        i915_request_put(last);
                                        break;
                                }
                        } else {
                                force_reset_timeout(engine);
                                err = intel_engine_reset(engine, NULL);
                                cancel_reset_timeout(engine);
                                if (err != -ETIMEDOUT) {
                                        pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
                                               engine->name, err);
                                        i915_request_put(last);
                                        break;
                                }
                        }

                        err = 0;
                        if (last) {
                                if (i915_request_wait(last, 0, HZ / 2) < 0) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);

                                        intel_engine_dump(engine, &p,
                                                          "%s(%s): failed to complete request\n",
                                                          __func__,
                                                          engine->name);

                                        GEM_TRACE("%s(%s): failed to complete request\n",
                                                  __func__,
                                                  engine->name);
                                        GEM_TRACE_DUMP();

                                        err = -EIO;
                                }
                                i915_request_put(last);
                        }
                        count++;
                } while (err == 0 && time_before(jiffies, end_time));
out:
                pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);
                intel_context_put(ce);

                if (igt_flush_test(gt->i915))
                        err = -EIO;
                if (err)
                        return err;
        }

        return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err = 0;

        /*
         * Check that we can issue an engine reset on an idle engine (a
         * no-op) and, in the active phase, on an engine busy with a
         * hanging batch.
         */

        if (!intel_has_reset_engine(gt))
                return 0;

        if (active) {
                err = hang_init(&h, gt);
                if (err)
                        return err;
        }

        for_each_engine(engine, gt, id) {
                unsigned int reset_count, reset_engine_count;
                unsigned long count;
                IGT_TIMEOUT(end_time);

                if (active && !intel_engine_can_store_dword(engine))
                        continue;

                if (!wait_for_idle(engine)) {
                        pr_err("%s failed to idle before reset\n",
                               engine->name);
                        err = -EIO;
                        break;
                }

                reset_count = i915_reset_count(global);
                reset_engine_count = i915_reset_engine_count(global, engine);

                st_engine_heartbeat_disable(engine);
                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                count = 0;
                do {
                        if (active) {
                                struct i915_request *rq;

                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        break;
                                }

                                i915_request_get(rq);
                                i915_request_add(rq);

                                if (!wait_until_running(&h, rq)) {
                                        struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                                        pr_err("%s: Failed to start request %llx, at %x\n",
                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);

                                        i915_request_put(rq);
                                        err = -EIO;
                                        break;
                                }

                                i915_request_put(rq);
                        }

                        err = intel_engine_reset(engine, NULL);
                        if (err) {
                                pr_err("intel_engine_reset(%s) failed, err:%d\n",
                                       engine->name, err);
                                break;
                        }

                        if (i915_reset_count(global) != reset_count) {
                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
                                err = -EINVAL;
                                break;
                        }

                        if (i915_reset_engine_count(global, engine) !=
                            ++reset_engine_count) {
                                pr_err("%s engine reset not recorded!\n",
                                       engine->name);
                                err = -EINVAL;
                                break;
                        }

                        count++;
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);
                pr_info("%s: Completed %lu %s resets\n",
                        engine->name, count, active ? "active" : "idle");

                if (err)
                        break;

                err = igt_flush_test(gt->i915);
                if (err)
                        break;
        }

        if (intel_gt_is_wedged(gt))
                err = -EIO;

        if (active)
                hang_fini(&h);

        return err;
}

static int igt_reset_idle_engine(void *arg)
{
        return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
        return __igt_reset_engine(arg, true);
}

struct active_engine {
        struct task_struct *task;
        struct intel_engine_cs *engine;
        unsigned long resets;
        unsigned int flags;
};

#define TEST_ACTIVE     BIT(0)
#define TEST_OTHERS     BIT(1)
#define TEST_SELF       BIT(2)
#define TEST_PRIORITY   BIT(3)

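/*
 * Wait for a background request to complete; if it does not finish within
 * 5s something is stuck, so wedge the GPU and report -EIO.
 */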
static int active_request_put(struct i915_request *rq)
{
        int err = 0;

        if (!rq)
                return 0;

        if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
                GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
                          rq->engine->name,
                          rq->fence.context,
                          rq->fence.seqno);
                GEM_TRACE_DUMP();

                intel_gt_set_wedged(rq->engine->gt);
                err = -EIO;
        }

        i915_request_put(rq);

        return err;
}

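/*
 * Worker thread that keeps a rolling window of 8 requests in flight on
 * its engine, each on its own context and optionally with randomised
 * priorities, so the engine stays busy while resets are fired at another
 * (or, with TEST_SELF, the same) engine.
 */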
static int active_engine(void *data)
{
        I915_RND_STATE(prng);
        struct active_engine *arg = data;
        struct intel_engine_cs *engine = arg->engine;
        struct i915_request *rq[8] = {};
        struct intel_context *ce[ARRAY_SIZE(rq)];
        unsigned long count;
        int err = 0;

        for (count = 0; count < ARRAY_SIZE(ce); count++) {
                ce[count] = intel_context_create(engine);
                if (IS_ERR(ce[count])) {
                        err = PTR_ERR(ce[count]);
                        while (count--)
                                intel_context_put(ce[count]);
                        return err;
                }
        }

        count = 0;
        while (!kthread_should_stop()) {
                unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
                struct i915_request *old = rq[idx];
                struct i915_request *new;

                new = intel_context_create_request(ce[idx]);
                if (IS_ERR(new)) {
                        err = PTR_ERR(new);
                        break;
                }

                rq[idx] = i915_request_get(new);
                i915_request_add(new);

                if (engine->schedule && arg->flags & TEST_PRIORITY) {
                        struct i915_sched_attr attr = {
                                .priority =
                                        i915_prandom_u32_max_state(512, &prng),
                        };
                        engine->schedule(rq[idx], &attr);
                }

                err = active_request_put(old);
                if (err)
                        break;

                cond_resched();
        }

        for (count = 0; count < ARRAY_SIZE(rq); count++) {
                int err__ = active_request_put(rq[count]);

                /* Keep the first error */
                if (!err)
                        err = err__;

                intel_context_put(ce[count]);
        }

        return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
                               const char *test_name,
                               unsigned int flags)
{
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine, *other;
        enum intel_engine_id id, tmp;
        struct hang h;
        int err = 0;

        /* Check that issuing a reset on one engine does not interfere
         * with any other engine.
         */

        if (!intel_has_reset_engine(gt))
                return 0;

        if (flags & TEST_ACTIVE) {
                err = hang_init(&h, gt);
                if (err)
                        return err;

                if (flags & TEST_PRIORITY)
                        h.ctx->sched.priority = 1024;
        }

        for_each_engine(engine, gt, id) {
                struct active_engine threads[I915_NUM_ENGINES] = {};
                unsigned long device = i915_reset_count(global);
                unsigned long count = 0, reported;
                IGT_TIMEOUT(end_time);

                if (flags & TEST_ACTIVE &&
                    !intel_engine_can_store_dword(engine))
                        continue;

                if (!wait_for_idle(engine)) {
                        pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
                               engine->name, test_name);
                        err = -EIO;
                        break;
                }

                memset(threads, 0, sizeof(threads));
                for_each_engine(other, gt, tmp) {
                        struct task_struct *tsk;

                        threads[tmp].resets =
                                i915_reset_engine_count(global, other);

                        if (other == engine && !(flags & TEST_SELF))
                                continue;

                        if (other != engine && !(flags & TEST_OTHERS))
                                continue;

                        threads[tmp].engine = other;
                        threads[tmp].flags = flags;

                        tsk = kthread_run(active_engine, &threads[tmp],
                                          "igt/%s", other->name);
                        if (IS_ERR(tsk)) {
                                err = PTR_ERR(tsk);
                                goto unwind;
                        }

                        threads[tmp].task = tsk;
                        get_task_struct(tsk);
                }

                yield(); /* start all threads before we begin */

                st_engine_heartbeat_disable(engine);
                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                do {
                        struct i915_request *rq = NULL;

                        if (flags & TEST_ACTIVE) {
                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        break;
                                }

                                i915_request_get(rq);
                                i915_request_add(rq);

                                if (!wait_until_running(&h, rq)) {
                                        struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                                        pr_err("%s: Failed to start request %llx, at %x\n",
                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);

                                        i915_request_put(rq);
                                        err = -EIO;
                                        break;
                                }
                        }

                        err = intel_engine_reset(engine, NULL);
                        if (err) {
                                pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
                                       engine->name, test_name, err);
                                break;
                        }

                        count++;

                        if (rq) {
                                if (rq->fence.error != -EIO) {
                                        pr_err("i915_reset_engine(%s:%s):"
                                               " failed to reset request %llx:%lld\n",
                                               engine->name, test_name,
                                               rq->fence.context,
                                               rq->fence.seqno);
                                        i915_request_put(rq);

                                        GEM_TRACE_DUMP();
                                        intel_gt_set_wedged(gt);
                                        err = -EIO;
                                        break;
                                }

                                if (i915_request_wait(rq, 0, HZ / 5) < 0) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);

                                        pr_err("i915_reset_engine(%s:%s):"
                                               " failed to complete request %llx:%lld after reset\n",
                                               engine->name, test_name,
                                               rq->fence.context,
                                               rq->fence.seqno);
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);
                                        i915_request_put(rq);

                                        GEM_TRACE_DUMP();
                                        intel_gt_set_wedged(gt);
                                        err = -EIO;
                                        break;
                                }

                                i915_request_put(rq);
                        }

                        if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
                                struct drm_printer p =
                                        drm_info_printer(gt->i915->drm.dev);

                                pr_err("i915_reset_engine(%s:%s):"
                                       " failed to idle after reset\n",
                                       engine->name, test_name);
                                intel_engine_dump(engine, &p,
                                                  "%s\n", engine->name);

                                err = -EIO;
                                break;
                        }
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);

                pr_info("i915_reset_engine(%s:%s): %lu resets\n",
                        engine->name, test_name, count);

                reported = i915_reset_engine_count(global, engine);
                reported -= threads[engine->id].resets;
                if (reported != count) {
                        pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
                               engine->name, test_name, count, reported);
                        if (!err)
                                err = -EINVAL;
                }

unwind:
                for_each_engine(other, gt, tmp) {
                        int ret;

                        if (!threads[tmp].task)
                                continue;

                        ret = kthread_stop(threads[tmp].task);
                        if (ret) {
                                pr_err("kthread for other engine %s failed, err=%d\n",
                                       other->name, ret);
                                if (!err)
                                        err = ret;
                        }
                        put_task_struct(threads[tmp].task);

                        if (other->uabi_class != engine->uabi_class &&
                            threads[tmp].resets !=
                            i915_reset_engine_count(global, other)) {
                                pr_err("Innocent engine %s was reset (count=%ld)\n",
                                       other->name,
                                       i915_reset_engine_count(global, other) -
                                       threads[tmp].resets);
                                if (!err)
                                        err = -EINVAL;
                        }
                }

                if (device != i915_reset_count(global)) {
                        pr_err("Global reset (count=%ld)!\n",
                               i915_reset_count(global) - device);
                        if (!err)
                                err = -EINVAL;
                }

                if (err)
                        break;

                err = igt_flush_test(gt->i915);
                if (err)
                        break;
        }

        if (intel_gt_is_wedged(gt))
                err = -EIO;

        if (flags & TEST_ACTIVE)
                hang_fini(&h);

        return err;
}

static int igt_reset_engines(void *arg)
{
        static const struct {
                const char *name;
                unsigned int flags;
        } phases[] = {
                { "idle", 0 },
                { "active", TEST_ACTIVE },
                { "others-idle", TEST_OTHERS },
                { "others-active", TEST_OTHERS | TEST_ACTIVE },
                {
                        "others-priority",
                        TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
                },
                {
                        "self-priority",
                        TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
                },
                { }
        };
        struct intel_gt *gt = arg;
        typeof(*phases) *p;
        int err;

        for (p = phases; p->name; p++) {
                if (p->flags & TEST_PRIORITY) {
                        if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
                                continue;
                }

                err = __igt_reset_engines(arg, p->name, p->flags);
                if (err)
                        return err;
        }

        return 0;
}

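/*
 * Force a full GPU reset as if hangcheck had fired, returning the global
 * reset count sampled beforehand so callers can verify that a new reset
 * was recorded.
 */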
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
        u32 count = i915_reset_count(&gt->i915->gpu_error);

        intel_gt_reset(gt, mask, NULL);

        return count;
}

static int igt_reset_wait(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine = gt->engine[RCS0];
        struct i915_request *rq;
        unsigned int reset_count;
        struct hang h;
        long timeout;
        int err;

        if (!engine || !intel_engine_can_store_dword(engine))
                return 0;

        /* Check that we detect a stuck waiter and issue a reset */

        igt_global_reset_lock(gt);

        err = hang_init(&h, gt);
        if (err)
                goto unlock;

        rq = hang_create_request(&h, engine);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto fini;
        }

        i915_request_get(rq);
        i915_request_add(rq);

        if (!wait_until_running(&h, rq)) {
                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                pr_err("%s: Failed to start request %llx, at %x\n",
                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

                intel_gt_set_wedged(gt);

                err = -EIO;
                goto out_rq;
        }

        reset_count = fake_hangcheck(gt, ALL_ENGINES);

        timeout = i915_request_wait(rq, 0, 10);
        if (timeout < 0) {
                pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
                       timeout);
                err = timeout;
                goto out_rq;
        }

        if (i915_reset_count(global) == reset_count) {
                pr_err("No GPU reset recorded!\n");
                err = -EINVAL;
                goto out_rq;
        }

out_rq:
        i915_request_put(rq);
fini:
        hang_fini(&h);
unlock:
        igt_global_reset_unlock(gt);

        if (intel_gt_is_wedged(gt))
                return -EIO;

        return err;
}

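/*
 * The evict tests run the eviction (or fence update) in a separate
 * kthread; the thread signals the completion just before it is expected
 * to block on the hanging request, so the parent knows when to trigger
 * the reset that should unblock it.
 */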
struct evict_vma {
        struct completion completion;
        struct i915_vma *vma;
};

static int evict_vma(void *data)
{
        struct evict_vma *arg = data;
        struct i915_address_space *vm = arg->vma->vm;
        struct drm_mm_node evict = arg->vma->node;
        int err;

        complete(&arg->completion);

        mutex_lock(&vm->mutex);
        err = i915_gem_evict_for_node(vm, &evict, 0);
        mutex_unlock(&vm->mutex);

        return err;
}

static int evict_fence(void *data)
{
        struct evict_vma *arg = data;
        int err;

        complete(&arg->completion);

        /* Mark the fence register as dirty to force the mmio update. */
        err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
        if (err) {
                pr_err("Invalid Y-tiling settings; err:%d\n", err);
                return err;
        }

        err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
        if (err) {
                pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
                return err;
        }

        err = i915_vma_pin_fence(arg->vma);
        i915_vma_unpin(arg->vma);
        if (err) {
                pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
                return err;
        }

        i915_vma_unpin_fence(arg->vma);

        return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
                                 struct i915_address_space *vm,
                                 int (*fn)(void *),
                                 unsigned int flags)
{
        struct intel_engine_cs *engine = gt->engine[RCS0];
        struct drm_i915_gem_object *obj;
        struct task_struct *tsk = NULL;
        struct i915_request *rq;
        struct evict_vma arg;
        struct hang h;
        unsigned int pin_flags;
        int err;

        if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
                return 0;

        if (!engine || !intel_engine_can_store_dword(engine))
                return 0;

        /* Check that we can recover an unbind stuck on a hanging request */

        err = hang_init(&h, gt);
        if (err)
                return err;

        obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
        if (IS_ERR(obj)) {
                err = PTR_ERR(obj);
                goto fini;
        }

        if (flags & EXEC_OBJECT_NEEDS_FENCE) {
                err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
                if (err) {
                        pr_err("Invalid X-tiling settings; err:%d\n", err);
                        goto out_obj;
                }
        }

        arg.vma = i915_vma_instance(obj, vm, NULL);
        if (IS_ERR(arg.vma)) {
                err = PTR_ERR(arg.vma);
                goto out_obj;
        }

        rq = hang_create_request(&h, engine);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto out_obj;
        }

        pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

        if (flags & EXEC_OBJECT_NEEDS_FENCE)
                pin_flags |= PIN_MAPPABLE;

        err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
        if (err) {
                i915_request_add(rq);
                goto out_obj;
        }

        if (flags & EXEC_OBJECT_NEEDS_FENCE) {
                err = i915_vma_pin_fence(arg.vma);
                if (err) {
                        pr_err("Unable to pin X-tiled fence; err:%d\n", err);
                        i915_vma_unpin(arg.vma);
                        i915_request_add(rq);
                        goto out_obj;
                }
        }

        i915_vma_lock(arg.vma);
        err = i915_request_await_object(rq, arg.vma->obj,
                                        flags & EXEC_OBJECT_WRITE);
        if (err == 0)
                err = i915_vma_move_to_active(arg.vma, rq, flags);
        i915_vma_unlock(arg.vma);

        if (flags & EXEC_OBJECT_NEEDS_FENCE)
                i915_vma_unpin_fence(arg.vma);
        i915_vma_unpin(arg.vma);

        i915_request_get(rq);
        i915_request_add(rq);
        if (err)
                goto out_rq;

        if (!wait_until_running(&h, rq)) {
                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                pr_err("%s: Failed to start request %llx, at %x\n",
                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

                intel_gt_set_wedged(gt);
                goto out_reset;
        }

        init_completion(&arg.completion);

        tsk = kthread_run(fn, &arg, "igt/evict_vma");
        if (IS_ERR(tsk)) {
                err = PTR_ERR(tsk);
                tsk = NULL;
                goto out_reset;
        }
        get_task_struct(tsk);

        wait_for_completion(&arg.completion);

        if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                pr_err("igt/evict_vma kthread did not wait\n");
                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

                intel_gt_set_wedged(gt);
                goto out_reset;
        }

out_reset:
        igt_global_reset_lock(gt);
        fake_hangcheck(gt, rq->engine->mask);
        igt_global_reset_unlock(gt);

        if (tsk) {
                struct intel_wedge_me w;

                /* The reset, even indirectly, should take less than 10ms. */
                intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
                        err = kthread_stop(tsk);

                put_task_struct(tsk);
        }

out_rq:
        i915_request_put(rq);
out_obj:
        i915_gem_object_put(obj);
fini:
        hang_fini(&h);
        if (intel_gt_is_wedged(gt))
                return -EIO;

        return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
        struct intel_gt *gt = arg;

        return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
                                     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_ppgtt *ppgtt;
        int err;

        /* aliasing == global gtt locking, covered above */
        if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
                return 0;

        ppgtt = i915_ppgtt_create(gt);
        if (IS_ERR(ppgtt))
                return PTR_ERR(ppgtt);

        err = __igt_reset_evict_vma(gt, &ppgtt->vm,
                                    evict_vma, EXEC_OBJECT_WRITE);
        i915_vm_put(&ppgtt->vm);

        return err;
}

static int igt_reset_evict_fence(void *arg)
{
        struct intel_gt *gt = arg;

        return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
                                     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

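/*
 * Every engine other than the one under test should return to idle
 * within IGT_IDLE_TIMEOUT once a device reset has been processed; report
 * -EIO if any of them stay busy.
 */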
1476 static int wait_for_others(struct intel_gt *gt,
1477                            struct intel_engine_cs *exclude)
1478 {
1479         struct intel_engine_cs *engine;
1480         enum intel_engine_id id;
1481
1482         for_each_engine(engine, gt, id) {
1483                 if (engine == exclude)
1484                         continue;
1485
1486                 if (!wait_for_idle(engine))
1487                         return -EIO;
1488         }
1489
1490         return 0;
1491 }
1492
1493 static int igt_reset_queue(void *arg)
1494 {
1495         struct intel_gt *gt = arg;
1496         struct i915_gpu_error *global = &gt->i915->gpu_error;
1497         struct intel_engine_cs *engine;
1498         enum intel_engine_id id;
1499         struct hang h;
1500         int err;
1501
1502         /* Check that we replay pending requests following a hang */
1503
1504         igt_global_reset_lock(gt);
1505
1506         err = hang_init(&h, gt);
1507         if (err)
1508                 goto unlock;
1509
1510         for_each_engine(engine, gt, id) {
1511                 struct i915_request *prev;
1512                 IGT_TIMEOUT(end_time);
1513                 unsigned int count;
1514
1515                 if (!intel_engine_can_store_dword(engine))
1516                         continue;
1517
1518                 prev = hang_create_request(&h, engine);
1519                 if (IS_ERR(prev)) {
1520                         err = PTR_ERR(prev);
1521                         goto fini;
1522                 }
1523
1524                 i915_request_get(prev);
1525                 i915_request_add(prev);
1526
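                     /*
                      * Repeatedly queue a request behind the hanging one,
                      * reset, and check that the hang is blamed on prev
                      * while the queued rq survives to hang in its place.
                      */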
1527                 count = 0;
1528                 do {
1529                         struct i915_request *rq;
1530                         unsigned int reset_count;
1531
1532                         rq = hang_create_request(&h, engine);
1533                         if (IS_ERR(rq)) {
1534                                 err = PTR_ERR(rq);
1535                                 goto fini;
1536                         }
1537
1538                         i915_request_get(rq);
1539                         i915_request_add(rq);
1540
1541                         /*
1542                          * XXX We don't handle resetting the kernel context
1543                          * very well. If we trigger a device reset twice in
1544                          * quick succession while the kernel context is
1545                          * executing, we may end up skipping the breadcrumb.
1546                          * This is really only a problem for the selftest as
1547                          * normally there is a large interlude between resets
1548                          * (hangcheck), or we focus on resetting just one
1549                          * engine and so avoid repeatedly resetting innocents.
1550                          */
1551                         err = wait_for_others(gt, engine);
1552                         if (err) {
1553                                 pr_err("%s(%s): Failed to idle other engines after device reset\n",
1554                                        __func__, engine->name);
1555                                 i915_request_put(rq);
1556                                 i915_request_put(prev);
1557
1558                                 GEM_TRACE_DUMP();
1559                                 intel_gt_set_wedged(gt);
1560                                 goto fini;
1561                         }
1562
1563                         if (!wait_until_running(&h, prev)) {
1564                                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1565
1566                                 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1567                                        __func__, engine->name,
1568                                        prev->fence.seqno, hws_seqno(&h, prev));
1569                                 intel_engine_dump(engine, &p,
1570                                                   "%s\n", engine->name);
1571
1572                                 i915_request_put(rq);
1573                                 i915_request_put(prev);
1574
1575                                 intel_gt_set_wedged(gt);
1576
1577                                 err = -EIO;
1578                                 goto fini;
1579                         }
1580
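                             /* Simulate hangcheck catching the hang on this engine only. */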
1581                         reset_count = fake_hangcheck(gt, BIT(id));
1582
1583                         if (prev->fence.error != -EIO) {
1584                                 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1585                                        prev->fence.error);
1586                                 i915_request_put(rq);
1587                                 i915_request_put(prev);
1588                                 err = -EINVAL;
1589                                 goto fini;
1590                         }
1591
1592                         if (rq->fence.error) {
1593                                 pr_err("Fence error status not zero [%d] after unrelated reset\n",
1594                                        rq->fence.error);
1595                                 i915_request_put(rq);
1596                                 i915_request_put(prev);
1597                                 err = -EINVAL;
1598                                 goto fini;
1599                         }
1600
1601                         if (i915_reset_count(global) == reset_count) {
1602                                 pr_err("No GPU reset recorded!\n");
1603                                 i915_request_put(rq);
1604                                 i915_request_put(prev);
1605                                 err = -EINVAL;
1606                                 goto fini;
1607                         }
1608
1609                         i915_request_put(prev);
1610                         prev = rq;
1611                         count++;
1612                 } while (time_before(jiffies, end_time));
1613                 pr_info("%s: Completed %d queued resets\n",
1614                         engine->name, count);
1615
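                     /* Terminate the spinning batch so outstanding requests can complete. */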
1616                 *h.batch = MI_BATCH_BUFFER_END;
1617                 intel_gt_chipset_flush(engine->gt);
1618
1619                 i915_request_put(prev);
1620
1621                 err = igt_flush_test(gt->i915);
1622                 if (err)
1623                         break;
1624         }
1625
1626 fini:
1627         hang_fini(&h);
1628 unlock:
1629         igt_global_reset_unlock(gt);
1630
1631         if (intel_gt_is_wedged(gt))
1632                 return -EIO;
1633
1634         return err;
1635 }
1636
1637 static int igt_handle_error(void *arg)
1638 {
1639         struct intel_gt *gt = arg;
1640         struct i915_gpu_error *global = &gt->i915->gpu_error;
1641         struct intel_engine_cs *engine = gt->engine[RCS0];
1642         struct hang h;
1643         struct i915_request *rq;
1644         struct i915_gpu_coredump *error;
1645         int err;
1646
1647         /* Check that the error handler can issue an engine reset */
1648
1649         if (!intel_has_reset_engine(gt))
1650                 return 0;
1651
1652         if (!engine || !intel_engine_can_store_dword(engine))
1653                 return 0;
1654
1655         err = hang_init(&h, gt);
1656         if (err)
1657                 return err;
1658
1659         rq = hang_create_request(&h, engine);
1660         if (IS_ERR(rq)) {
1661                 err = PTR_ERR(rq);
1662                 goto err_fini;
1663         }
1664
1665         i915_request_get(rq);
1666         i915_request_add(rq);
1667
1668         if (!wait_until_running(&h, rq)) {
1669                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1670
1671                 pr_err("%s: Failed to start request %llx, at %x\n",
1672                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1673                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1674
1675                 intel_gt_set_wedged(gt);
1676
1677                 err = -EIO;
1678                 goto err_request;
1679         }
1680
1681         /* Temporarily disable error capture (pretend an error is already pending) */
1682         error = xchg(&global->first_error, (void *)-1);
1683
1684         intel_gt_handle_error(gt, engine->mask, 0, NULL);
1685
1686         xchg(&global->first_error, error);
1687
1688         if (rq->fence.error != -EIO) {
1689                 pr_err("Guilty request not identified!\n");
1690                 err = -EINVAL;
1691                 goto err_request;
1692         }
1693
1694 err_request:
1695         i915_request_put(rq);
1696 err_fini:
1697         hang_fini(&h);
1698         return err;
1699 }
1700
1701 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1702                                      const struct igt_atomic_section *p,
1703                                      const char *mode)
1704 {
1705         struct tasklet_struct * const t = &engine->execlists.tasklet;
1706         int err;
1707
1708         GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1709                   engine->name, mode, p->name);
1710
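             /*
              * Quiesce the submission tasklet and, unless this phase is the
              * softirq section itself, bottom halves, so the engine reset
              * below runs entirely inside the simulated atomic context.
              */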
1711         if (t->func)
1712                 tasklet_disable(t);
1713         if (strcmp(p->name, "softirq"))
1714                 local_bh_disable();
1715         p->critical_section_begin();
1716
1717         err = __intel_engine_reset_bh(engine, NULL);
1718
1719         p->critical_section_end();
1720         if (strcmp(p->name, "softirq"))
1721                 local_bh_enable();
1722         if (t->func) {
1723                 tasklet_enable(t);
1724                 tasklet_hi_schedule(t);
1725         }
1726
1727         if (err)
1728                 pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1729                        engine->name, mode, p->name);
1730
1731         return err;
1732 }
1733
1734 static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1735                                    const struct igt_atomic_section *p)
1736 {
1737         struct i915_request *rq;
1738         struct hang h;
1739         int err;
1740
1741         err = __igt_atomic_reset_engine(engine, p, "idle");
1742         if (err)
1743                 return err;
1744
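             /* Now repeat the reset with a hung request active on the engine. */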
1745         err = hang_init(&h, engine->gt);
1746         if (err)
1747                 return err;
1748
1749         rq = hang_create_request(&h, engine);
1750         if (IS_ERR(rq)) {
1751                 err = PTR_ERR(rq);
1752                 goto out;
1753         }
1754
1755         i915_request_get(rq);
1756         i915_request_add(rq);
1757
1758         if (wait_until_running(&h, rq)) {
1759                 err = __igt_atomic_reset_engine(engine, p, "active");
1760         } else {
1761                 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1762                        __func__, engine->name,
1763                        rq->fence.seqno, hws_seqno(&h, rq));
1764                 intel_gt_set_wedged(engine->gt);
1765                 err = -EIO;
1766         }
1767
1768         if (err == 0) {
1769                 struct intel_wedge_me w;
1770
1771                 intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1772                         i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1773                 if (intel_gt_is_wedged(engine->gt))
1774                         err = -EIO;
1775         }
1776
1777         i915_request_put(rq);
1778 out:
1779         hang_fini(&h);
1780         return err;
1781 }
1782
1783 static int igt_reset_engines_atomic(void *arg)
1784 {
1785         struct intel_gt *gt = arg;
1786         const typeof(*igt_atomic_phases) *p;
1787         int err = 0;
1788
1789         /* Check that engine resets are usable from atomic context */
1790
1791         if (!intel_has_reset_engine(gt))
1792                 return 0;
1793
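             /*
              * With GuC submission, engine resets are mediated by the GuC
              * firmware and may sleep, so they cannot be tested from atomic
              * context.
              */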
1794         if (intel_uc_uses_guc_submission(&gt->uc))
1795                 return 0;
1796
1797         igt_global_reset_lock(gt);
1798
1799         /* Flush any requests before we get started and check basics */
1800         if (!igt_force_reset(gt))
1801                 goto unlock;
1802
1803         for (p = igt_atomic_phases; p->name; p++) {
1804                 struct intel_engine_cs *engine;
1805                 enum intel_engine_id id;
1806
1807                 for_each_engine(engine, gt, id) {
1808                         err = igt_atomic_reset_engine(engine, p);
1809                         if (err)
1810                                 goto out;
1811                 }
1812         }
1813
1814 out:
1815         /* As we poke around the guts, do a full reset before continuing. */
1816         igt_force_reset(gt);
1817 unlock:
1818         igt_global_reset_unlock(gt);
1819
1820         return err;
1821 }
1822
1823 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1824 {
1825         static const struct i915_subtest tests[] = {
1826                 SUBTEST(igt_hang_sanitycheck),
1827                 SUBTEST(igt_reset_nop),
1828                 SUBTEST(igt_reset_nop_engine),
1829                 SUBTEST(igt_reset_idle_engine),
1830                 SUBTEST(igt_reset_active_engine),
1831                 SUBTEST(igt_reset_fail_engine),
1832                 SUBTEST(igt_reset_engines),
1833                 SUBTEST(igt_reset_engines_atomic),
1834                 SUBTEST(igt_reset_queue),
1835                 SUBTEST(igt_reset_wait),
1836                 SUBTEST(igt_reset_evict_ggtt),
1837                 SUBTEST(igt_reset_evict_ppgtt),
1838                 SUBTEST(igt_reset_evict_fence),
1839                 SUBTEST(igt_handle_error),
1840         };
1841         struct intel_gt *gt = &i915->gt;
1842         intel_wakeref_t wakeref;
1843         int err;
1844
1845         if (!intel_has_gpu_reset(gt))
1846                 return 0;
1847
1848         if (intel_gt_is_wedged(gt))
1849                 return -EIO; /* we're long past hope of a successful reset */
1850
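             /* Keep the device awake for the whole run of reset subtests. */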
1851         wakeref = intel_runtime_pm_get(gt->uncore->rpm);
1852
1853         err = intel_gt_live_subtests(tests, gt);
1854
1855         intel_runtime_pm_put(gt->uncore->rpm, wakeref);
1856
1857         return err;
1858 }