Merge tag 'drm-intel-next-2019-05-24' of git://anongit.freedesktop.org/drm/drm-intel...
[linux-2.6-microblaze.git] / drivers / gpu / drm / i915 / gt / intel_reset.c
1 /*
2  * SPDX-License-Identifier: MIT
3  *
4  * Copyright © 2008-2018 Intel Corporation
5  */
6
7 #include <linux/sched/mm.h>
8 #include <linux/stop_machine.h>
9
10 #include "i915_drv.h"
11 #include "i915_gpu_error.h"
12 #include "i915_irq.h"
13 #include "intel_engine_pm.h"
14 #include "intel_gt_pm.h"
15 #include "intel_reset.h"
16
17 #include "intel_guc.h"
18 #include "intel_overlay.h"
19
20 #define RESET_MAX_RETRIES 3
21
22 /* XXX How to handle concurrent GGTT updates using tiling registers? */
23 #define RESET_UNDER_STOP_MACHINE 0
24
25 static void rmw_set(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
26 {
27         intel_uncore_rmw(uncore, reg, 0, set);
28 }
29
30 static void rmw_clear(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
31 {
32         intel_uncore_rmw(uncore, reg, clr, 0);
33 }
34
35 static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
36 {
37         intel_uncore_rmw_fw(uncore, reg, 0, set);
38 }
39
40 static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
41 {
42         intel_uncore_rmw_fw(uncore, reg, clr, 0);
43 }
44
45 static void engine_skip_context(struct i915_request *rq)
46 {
47         struct intel_engine_cs *engine = rq->engine;
48         struct i915_gem_context *hung_ctx = rq->gem_context;
49
50         lockdep_assert_held(&engine->timeline.lock);
51
52         if (!i915_request_is_active(rq))
53                 return;
54
55         list_for_each_entry_continue(rq, &engine->timeline.requests, link)
56                 if (rq->gem_context == hung_ctx)
57                         i915_request_skip(rq, -EIO);
58 }
59
60 static void client_mark_guilty(struct drm_i915_file_private *file_priv,
61                                const struct i915_gem_context *ctx)
62 {
63         unsigned int score;
64         unsigned long prev_hang;
65
66         if (i915_gem_context_is_banned(ctx))
67                 score = I915_CLIENT_SCORE_CONTEXT_BAN;
68         else
69                 score = 0;
70
71         prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
72         if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
73                 score += I915_CLIENT_SCORE_HANG_FAST;
74
75         if (score) {
76                 atomic_add(score, &file_priv->ban_score);
77
78                 DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
79                                  ctx->name, score,
80                                  atomic_read(&file_priv->ban_score));
81         }
82 }
83
84 static bool context_mark_guilty(struct i915_gem_context *ctx)
85 {
86         unsigned long prev_hang;
87         bool banned;
88         int i;
89
90         atomic_inc(&ctx->guilty_count);
91
92         /* Cool contexts are too cool to be banned! (Used for reset testing.) */
93         if (!i915_gem_context_is_bannable(ctx))
94                 return false;
95
96         /* Record the timestamp for the last N hangs */
97         prev_hang = ctx->hang_timestamp[0];
98         for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
99                 ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
100         ctx->hang_timestamp[i] = jiffies;
101
102         /* If we have hung N+1 times in rapid succession, we ban the context! */
103         banned = !i915_gem_context_is_recoverable(ctx);
104         if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
105                 banned = true;
106         if (banned) {
107                 DRM_DEBUG_DRIVER("context %s: guilty %d, banned\n",
108                                  ctx->name, atomic_read(&ctx->guilty_count));
109                 i915_gem_context_set_banned(ctx);
110         }
111
112         if (!IS_ERR_OR_NULL(ctx->file_priv))
113                 client_mark_guilty(ctx->file_priv, ctx);
114
115         return banned;
116 }
117
118 static void context_mark_innocent(struct i915_gem_context *ctx)
119 {
120         atomic_inc(&ctx->active_count);
121 }
122
123 void i915_reset_request(struct i915_request *rq, bool guilty)
124 {
125         GEM_TRACE("%s rq=%llx:%lld, guilty? %s\n",
126                   rq->engine->name,
127                   rq->fence.context,
128                   rq->fence.seqno,
129                   yesno(guilty));
130
131         lockdep_assert_held(&rq->engine->timeline.lock);
132         GEM_BUG_ON(i915_request_completed(rq));
133
134         if (guilty) {
135                 i915_request_skip(rq, -EIO);
136                 if (context_mark_guilty(rq->gem_context))
137                         engine_skip_context(rq);
138         } else {
139                 dma_fence_set_error(&rq->fence, -EAGAIN);
140                 context_mark_innocent(rq->gem_context);
141         }
142 }
143
144 static void gen3_stop_engine(struct intel_engine_cs *engine)
145 {
146         struct intel_uncore *uncore = engine->uncore;
147         const u32 base = engine->mmio_base;
148
149         GEM_TRACE("%s\n", engine->name);
150
151         if (intel_engine_stop_cs(engine))
152                 GEM_TRACE("%s: timed out on STOP_RING\n", engine->name);
153
154         intel_uncore_write_fw(uncore,
155                               RING_HEAD(base),
156                               intel_uncore_read_fw(uncore, RING_TAIL(base)));
157         intel_uncore_posting_read_fw(uncore, RING_HEAD(base)); /* paranoia */
158
159         intel_uncore_write_fw(uncore, RING_HEAD(base), 0);
160         intel_uncore_write_fw(uncore, RING_TAIL(base), 0);
161         intel_uncore_posting_read_fw(uncore, RING_TAIL(base));
162
163         /* The ring must be empty before it is disabled */
164         intel_uncore_write_fw(uncore, RING_CTL(base), 0);
165
166         /* Check acts as a post */
167         if (intel_uncore_read_fw(uncore, RING_HEAD(base)))
168                 GEM_TRACE("%s: ring head [%x] not parked\n",
169                           engine->name,
170                           intel_uncore_read_fw(uncore, RING_HEAD(base)));
171 }
172
173 static void i915_stop_engines(struct drm_i915_private *i915,
174                               intel_engine_mask_t engine_mask)
175 {
176         struct intel_engine_cs *engine;
177         intel_engine_mask_t tmp;
178
179         if (INTEL_GEN(i915) < 3)
180                 return;
181
182         for_each_engine_masked(engine, i915, engine_mask, tmp)
183                 gen3_stop_engine(engine);
184 }
185
186 static bool i915_in_reset(struct pci_dev *pdev)
187 {
188         u8 gdrst;
189
190         pci_read_config_byte(pdev, I915_GDRST, &gdrst);
191         return gdrst & GRDOM_RESET_STATUS;
192 }
193
194 static int i915_do_reset(struct drm_i915_private *i915,
195                          intel_engine_mask_t engine_mask,
196                          unsigned int retry)
197 {
198         struct pci_dev *pdev = i915->drm.pdev;
199         int err;
200
201         /* Assert reset for at least 20 usec, and wait for acknowledgement. */
202         pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
203         udelay(50);
204         err = wait_for_atomic(i915_in_reset(pdev), 50);
205
206         /* Clear the reset request. */
207         pci_write_config_byte(pdev, I915_GDRST, 0);
208         udelay(50);
209         if (!err)
210                 err = wait_for_atomic(!i915_in_reset(pdev), 50);
211
212         return err;
213 }
214
215 static bool g4x_reset_complete(struct pci_dev *pdev)
216 {
217         u8 gdrst;
218
219         pci_read_config_byte(pdev, I915_GDRST, &gdrst);
220         return (gdrst & GRDOM_RESET_ENABLE) == 0;
221 }
222
223 static int g33_do_reset(struct drm_i915_private *i915,
224                         intel_engine_mask_t engine_mask,
225                         unsigned int retry)
226 {
227         struct pci_dev *pdev = i915->drm.pdev;
228
229         pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
230         return wait_for_atomic(g4x_reset_complete(pdev), 50);
231 }
232
233 static int g4x_do_reset(struct drm_i915_private *i915,
234                         intel_engine_mask_t engine_mask,
235                         unsigned int retry)
236 {
237         struct pci_dev *pdev = i915->drm.pdev;
238         struct intel_uncore *uncore = &i915->uncore;
239         int ret;
240
241         /* WaVcpClkGateDisableForMediaReset:ctg,elk */
242         rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
243         intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
244
245         pci_write_config_byte(pdev, I915_GDRST,
246                               GRDOM_MEDIA | GRDOM_RESET_ENABLE);
247         ret =  wait_for_atomic(g4x_reset_complete(pdev), 50);
248         if (ret) {
249                 DRM_DEBUG_DRIVER("Wait for media reset failed\n");
250                 goto out;
251         }
252
253         pci_write_config_byte(pdev, I915_GDRST,
254                               GRDOM_RENDER | GRDOM_RESET_ENABLE);
255         ret =  wait_for_atomic(g4x_reset_complete(pdev), 50);
256         if (ret) {
257                 DRM_DEBUG_DRIVER("Wait for render reset failed\n");
258                 goto out;
259         }
260
261 out:
262         pci_write_config_byte(pdev, I915_GDRST, 0);
263
264         rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
265         intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
266
267         return ret;
268 }
269
270 static int ironlake_do_reset(struct drm_i915_private *i915,
271                              intel_engine_mask_t engine_mask,
272                              unsigned int retry)
273 {
274         struct intel_uncore *uncore = &i915->uncore;
275         int ret;
276
277         intel_uncore_write_fw(uncore, ILK_GDSR,
278                               ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
279         ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
280                                            ILK_GRDOM_RESET_ENABLE, 0,
281                                            5000, 0,
282                                            NULL);
283         if (ret) {
284                 DRM_DEBUG_DRIVER("Wait for render reset failed\n");
285                 goto out;
286         }
287
288         intel_uncore_write_fw(uncore, ILK_GDSR,
289                               ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
290         ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
291                                            ILK_GRDOM_RESET_ENABLE, 0,
292                                            5000, 0,
293                                            NULL);
294         if (ret) {
295                 DRM_DEBUG_DRIVER("Wait for media reset failed\n");
296                 goto out;
297         }
298
299 out:
300         intel_uncore_write_fw(uncore, ILK_GDSR, 0);
301         intel_uncore_posting_read_fw(uncore, ILK_GDSR);
302         return ret;
303 }
304
305 /* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
306 static int gen6_hw_domain_reset(struct drm_i915_private *i915,
307                                 u32 hw_domain_mask)
308 {
309         struct intel_uncore *uncore = &i915->uncore;
310         int err;
311
312         /*
313          * GEN6_GDRST is not in the gt power well, no need to check
314          * for fifo space for the write or forcewake the chip for
315          * the read
316          */
317         intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);
318
319         /* Wait for the device to ack the reset requests */
320         err = __intel_wait_for_register_fw(uncore,
321                                            GEN6_GDRST, hw_domain_mask, 0,
322                                            500, 0,
323                                            NULL);
324         if (err)
325                 DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
326                                  hw_domain_mask);
327
328         return err;
329 }
330
331 static int gen6_reset_engines(struct drm_i915_private *i915,
332                               intel_engine_mask_t engine_mask,
333                               unsigned int retry)
334 {
335         struct intel_engine_cs *engine;
336         const u32 hw_engine_mask[] = {
337                 [RCS0]  = GEN6_GRDOM_RENDER,
338                 [BCS0]  = GEN6_GRDOM_BLT,
339                 [VCS0]  = GEN6_GRDOM_MEDIA,
340                 [VCS1]  = GEN8_GRDOM_MEDIA2,
341                 [VECS0] = GEN6_GRDOM_VECS,
342         };
343         u32 hw_mask;
344
345         if (engine_mask == ALL_ENGINES) {
346                 hw_mask = GEN6_GRDOM_FULL;
347         } else {
348                 intel_engine_mask_t tmp;
349
350                 hw_mask = 0;
351                 for_each_engine_masked(engine, i915, engine_mask, tmp) {
352                         GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
353                         hw_mask |= hw_engine_mask[engine->id];
354                 }
355         }
356
357         return gen6_hw_domain_reset(i915, hw_mask);
358 }
359
360 static u32 gen11_lock_sfc(struct intel_engine_cs *engine)
361 {
362         struct intel_uncore *uncore = engine->uncore;
363         u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access;
364         i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
365         u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
366         i915_reg_t sfc_usage;
367         u32 sfc_usage_bit;
368         u32 sfc_reset_bit;
369
370         switch (engine->class) {
371         case VIDEO_DECODE_CLASS:
372                 if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
373                         return 0;
374
375                 sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
376                 sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
377
378                 sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
379                 sfc_forced_lock_ack_bit  = GEN11_VCS_SFC_LOCK_ACK_BIT;
380
381                 sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
382                 sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
383                 sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
384                 break;
385
386         case VIDEO_ENHANCEMENT_CLASS:
387                 sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
388                 sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
389
390                 sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
391                 sfc_forced_lock_ack_bit  = GEN11_VECS_SFC_LOCK_ACK_BIT;
392
393                 sfc_usage = GEN11_VECS_SFC_USAGE(engine);
394                 sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
395                 sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
396                 break;
397
398         default:
399                 return 0;
400         }
401
402         /*
403          * Tell the engine that a software reset is going to happen. The engine
404          * will then try to force lock the SFC (if currently locked, it will
405          * remain so until we tell the engine it is safe to unlock; if currently
406          * unlocked, it will ignore this and all new lock requests). If SFC
407          * ends up being locked to the engine we want to reset, we have to reset
408          * it as well (we will unlock it once the reset sequence is completed).
409          */
410         rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
411
412         if (__intel_wait_for_register_fw(uncore,
413                                          sfc_forced_lock_ack,
414                                          sfc_forced_lock_ack_bit,
415                                          sfc_forced_lock_ack_bit,
416                                          1000, 0, NULL)) {
417                 DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
418                 return 0;
419         }
420
421         if (intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit)
422                 return sfc_reset_bit;
423
424         return 0;
425 }
426
427 static void gen11_unlock_sfc(struct intel_engine_cs *engine)
428 {
429         struct intel_uncore *uncore = engine->uncore;
430         u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access;
431         i915_reg_t sfc_forced_lock;
432         u32 sfc_forced_lock_bit;
433
434         switch (engine->class) {
435         case VIDEO_DECODE_CLASS:
436                 if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
437                         return;
438
439                 sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
440                 sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
441                 break;
442
443         case VIDEO_ENHANCEMENT_CLASS:
444                 sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
445                 sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
446                 break;
447
448         default:
449                 return;
450         }
451
452         rmw_clear_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
453 }
454
455 static int gen11_reset_engines(struct drm_i915_private *i915,
456                                intel_engine_mask_t engine_mask,
457                                unsigned int retry)
458 {
459         const u32 hw_engine_mask[] = {
460                 [RCS0]  = GEN11_GRDOM_RENDER,
461                 [BCS0]  = GEN11_GRDOM_BLT,
462                 [VCS0]  = GEN11_GRDOM_MEDIA,
463                 [VCS1]  = GEN11_GRDOM_MEDIA2,
464                 [VCS2]  = GEN11_GRDOM_MEDIA3,
465                 [VCS3]  = GEN11_GRDOM_MEDIA4,
466                 [VECS0] = GEN11_GRDOM_VECS,
467                 [VECS1] = GEN11_GRDOM_VECS2,
468         };
469         struct intel_engine_cs *engine;
470         intel_engine_mask_t tmp;
471         u32 hw_mask;
472         int ret;
473
474         if (engine_mask == ALL_ENGINES) {
475                 hw_mask = GEN11_GRDOM_FULL;
476         } else {
477                 hw_mask = 0;
478                 for_each_engine_masked(engine, i915, engine_mask, tmp) {
479                         GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
480                         hw_mask |= hw_engine_mask[engine->id];
481                         hw_mask |= gen11_lock_sfc(engine);
482                 }
483         }
484
485         ret = gen6_hw_domain_reset(i915, hw_mask);
486
487         if (engine_mask != ALL_ENGINES)
488                 for_each_engine_masked(engine, i915, engine_mask, tmp)
489                         gen11_unlock_sfc(engine);
490
491         return ret;
492 }
493
494 static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
495 {
496         struct intel_uncore *uncore = engine->uncore;
497         const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base);
498         u32 request, mask, ack;
499         int ret;
500
501         ack = intel_uncore_read_fw(uncore, reg);
502         if (ack & RESET_CTL_CAT_ERROR) {
503                 /*
504                  * For catastrophic errors, ready-for-reset sequence
505                  * needs to be bypassed: HAS#396813
506                  */
507                 request = RESET_CTL_CAT_ERROR;
508                 mask = RESET_CTL_CAT_ERROR;
509
510                 /* Catastrophic errors need to be cleared by HW */
511                 ack = 0;
512         } else if (!(ack & RESET_CTL_READY_TO_RESET)) {
513                 request = RESET_CTL_REQUEST_RESET;
514                 mask = RESET_CTL_READY_TO_RESET;
515                 ack = RESET_CTL_READY_TO_RESET;
516         } else {
517                 return 0;
518         }
519
520         intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request));
521         ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
522                                            700, 0, NULL);
523         if (ret)
524                 DRM_ERROR("%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n",
525                           engine->name, request,
526                           intel_uncore_read_fw(uncore, reg));
527
528         return ret;
529 }
530
531 static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
532 {
533         intel_uncore_write_fw(engine->uncore,
534                               RING_RESET_CTL(engine->mmio_base),
535                               _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
536 }
537
538 static int gen8_reset_engines(struct drm_i915_private *i915,
539                               intel_engine_mask_t engine_mask,
540                               unsigned int retry)
541 {
542         struct intel_engine_cs *engine;
543         const bool reset_non_ready = retry >= 1;
544         intel_engine_mask_t tmp;
545         int ret;
546
547         for_each_engine_masked(engine, i915, engine_mask, tmp) {
548                 ret = gen8_engine_reset_prepare(engine);
549                 if (ret && !reset_non_ready)
550                         goto skip_reset;
551
552                 /*
553                  * If this is not the first failed attempt to prepare,
554                  * we decide to proceed anyway.
555                  *
556                  * By doing so we risk context corruption and with
557                  * some gens (kbl), possible system hang if reset
558                  * happens during active bb execution.
559                  *
560                  * We rather take context corruption instead of
561                  * failed reset with a wedged driver/gpu. And
562                  * active bb execution case should be covered by
563                  * i915_stop_engines we have before the reset.
564                  */
565         }
566
567         if (INTEL_GEN(i915) >= 11)
568                 ret = gen11_reset_engines(i915, engine_mask, retry);
569         else
570                 ret = gen6_reset_engines(i915, engine_mask, retry);
571
572 skip_reset:
573         for_each_engine_masked(engine, i915, engine_mask, tmp)
574                 gen8_engine_reset_cancel(engine);
575
576         return ret;
577 }
578
579 typedef int (*reset_func)(struct drm_i915_private *,
580                           intel_engine_mask_t engine_mask,
581                           unsigned int retry);
582
583 static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
584 {
585         if (INTEL_GEN(i915) >= 8)
586                 return gen8_reset_engines;
587         else if (INTEL_GEN(i915) >= 6)
588                 return gen6_reset_engines;
589         else if (INTEL_GEN(i915) >= 5)
590                 return ironlake_do_reset;
591         else if (IS_G4X(i915))
592                 return g4x_do_reset;
593         else if (IS_G33(i915) || IS_PINEVIEW(i915))
594                 return g33_do_reset;
595         else if (INTEL_GEN(i915) >= 3)
596                 return i915_do_reset;
597         else
598                 return NULL;
599 }
600
601 int intel_gpu_reset(struct drm_i915_private *i915,
602                     intel_engine_mask_t engine_mask)
603 {
604         const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
605         reset_func reset;
606         int ret = -ETIMEDOUT;
607         int retry;
608
609         reset = intel_get_gpu_reset(i915);
610         if (!reset)
611                 return -ENODEV;
612
613         /*
614          * If the power well sleeps during the reset, the reset
615          * request may be dropped and never completes (causing -EIO).
616          */
617         intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL);
618         for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
619                 /*
620                  * We stop engines, otherwise we might get failed reset and a
621                  * dead gpu (on elk). Also as modern gpu as kbl can suffer
622                  * from system hang if batchbuffer is progressing when
623                  * the reset is issued, regardless of READY_TO_RESET ack.
624                  * Thus assume it is best to stop engines on all gens
625                  * where we have a gpu reset.
626                  *
627                  * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
628                  *
629                  * WaMediaResetMainRingCleanup:ctg,elk (presumably)
630                  *
631                  * FIXME: Wa for more modern gens needs to be validated
632                  */
633                 if (retry)
634                         i915_stop_engines(i915, engine_mask);
635
636                 GEM_TRACE("engine_mask=%x\n", engine_mask);
637                 preempt_disable();
638                 ret = reset(i915, engine_mask, retry);
639                 preempt_enable();
640         }
641         intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL);
642
643         return ret;
644 }
645
646 bool intel_has_gpu_reset(struct drm_i915_private *i915)
647 {
648         if (!i915_modparams.reset)
649                 return NULL;
650
651         return intel_get_gpu_reset(i915);
652 }
653
654 bool intel_has_reset_engine(struct drm_i915_private *i915)
655 {
656         return INTEL_INFO(i915)->has_reset_engine && i915_modparams.reset >= 2;
657 }
658
659 int intel_reset_guc(struct drm_i915_private *i915)
660 {
661         u32 guc_domain =
662                 INTEL_GEN(i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
663         int ret;
664
665         GEM_BUG_ON(!HAS_GUC(i915));
666
667         intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL);
668         ret = gen6_hw_domain_reset(i915, guc_domain);
669         intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL);
670
671         return ret;
672 }
673
674 /*
675  * Ensure irq handler finishes, and not run again.
676  * Also return the active request so that we only search for it once.
677  */
678 static void reset_prepare_engine(struct intel_engine_cs *engine)
679 {
680         /*
681          * During the reset sequence, we must prevent the engine from
682          * entering RC6. As the context state is undefined until we restart
683          * the engine, if it does enter RC6 during the reset, the state
684          * written to the powercontext is undefined and so we may lose
685          * GPU state upon resume, i.e. fail to restart after a reset.
686          */
687         intel_engine_pm_get(engine);
688         intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
689         engine->reset.prepare(engine);
690 }
691
692 static void revoke_mmaps(struct drm_i915_private *i915)
693 {
694         int i;
695
696         for (i = 0; i < i915->num_fence_regs; i++) {
697                 struct drm_vma_offset_node *node;
698                 struct i915_vma *vma;
699                 u64 vma_offset;
700
701                 vma = READ_ONCE(i915->fence_regs[i].vma);
702                 if (!vma)
703                         continue;
704
705                 if (!i915_vma_has_userfault(vma))
706                         continue;
707
708                 GEM_BUG_ON(vma->fence != &i915->fence_regs[i]);
709                 node = &vma->obj->base.vma_node;
710                 vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;
711                 unmap_mapping_range(i915->drm.anon_inode->i_mapping,
712                                     drm_vma_node_offset_addr(node) + vma_offset,
713                                     vma->size,
714                                     1);
715         }
716 }
717
718 static void reset_prepare(struct drm_i915_private *i915)
719 {
720         struct intel_engine_cs *engine;
721         enum intel_engine_id id;
722
723         intel_gt_pm_get(i915);
724         for_each_engine(engine, i915, id)
725                 reset_prepare_engine(engine);
726
727         intel_uc_reset_prepare(i915);
728 }
729
730 static void gt_revoke(struct drm_i915_private *i915)
731 {
732         revoke_mmaps(i915);
733 }
734
735 static int gt_reset(struct drm_i915_private *i915,
736                     intel_engine_mask_t stalled_mask)
737 {
738         struct intel_engine_cs *engine;
739         enum intel_engine_id id;
740         int err;
741
742         /*
743          * Everything depends on having the GTT running, so we need to start
744          * there.
745          */
746         err = i915_ggtt_enable_hw(i915);
747         if (err)
748                 return err;
749
750         for_each_engine(engine, i915, id)
751                 intel_engine_reset(engine, stalled_mask & engine->mask);
752
753         i915_gem_restore_fences(i915);
754
755         return err;
756 }
757
758 static void reset_finish_engine(struct intel_engine_cs *engine)
759 {
760         engine->reset.finish(engine);
761         intel_engine_pm_put(engine);
762         intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
763 }
764
765 static void reset_finish(struct drm_i915_private *i915)
766 {
767         struct intel_engine_cs *engine;
768         enum intel_engine_id id;
769
770         for_each_engine(engine, i915, id) {
771                 reset_finish_engine(engine);
772                 intel_engine_signal_breadcrumbs(engine);
773         }
774         intel_gt_pm_put(i915);
775 }
776
777 static void nop_submit_request(struct i915_request *request)
778 {
779         struct intel_engine_cs *engine = request->engine;
780         unsigned long flags;
781
782         GEM_TRACE("%s fence %llx:%lld -> -EIO\n",
783                   engine->name, request->fence.context, request->fence.seqno);
784         dma_fence_set_error(&request->fence, -EIO);
785
786         spin_lock_irqsave(&engine->timeline.lock, flags);
787         __i915_request_submit(request);
788         i915_request_mark_complete(request);
789         spin_unlock_irqrestore(&engine->timeline.lock, flags);
790
791         intel_engine_queue_breadcrumbs(engine);
792 }
793
794 static void __i915_gem_set_wedged(struct drm_i915_private *i915)
795 {
796         struct i915_gpu_error *error = &i915->gpu_error;
797         struct intel_engine_cs *engine;
798         enum intel_engine_id id;
799
800         if (test_bit(I915_WEDGED, &error->flags))
801                 return;
802
803         if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) {
804                 struct drm_printer p = drm_debug_printer(__func__);
805
806                 for_each_engine(engine, i915, id)
807                         intel_engine_dump(engine, &p, "%s\n", engine->name);
808         }
809
810         GEM_TRACE("start\n");
811
812         /*
813          * First, stop submission to hw, but do not yet complete requests by
814          * rolling the global seqno forward (since this would complete requests
815          * for which we haven't set the fence error to EIO yet).
816          */
817         reset_prepare(i915);
818
819         /* Even if the GPU reset fails, it should still stop the engines */
820         if (!INTEL_INFO(i915)->gpu_reset_clobbers_display)
821                 intel_gpu_reset(i915, ALL_ENGINES);
822
823         for_each_engine(engine, i915, id) {
824                 engine->submit_request = nop_submit_request;
825                 engine->schedule = NULL;
826         }
827         i915->caps.scheduler = 0;
828
829         /*
830          * Make sure no request can slip through without getting completed by
831          * either this call here to intel_engine_write_global_seqno, or the one
832          * in nop_submit_request.
833          */
834         synchronize_rcu_expedited();
835         set_bit(I915_WEDGED, &error->flags);
836
837         /* Mark all executing requests as skipped */
838         for_each_engine(engine, i915, id)
839                 engine->cancel_requests(engine);
840
841         reset_finish(i915);
842
843         GEM_TRACE("end\n");
844 }
845
846 void i915_gem_set_wedged(struct drm_i915_private *i915)
847 {
848         struct i915_gpu_error *error = &i915->gpu_error;
849         intel_wakeref_t wakeref;
850
851         mutex_lock(&error->wedge_mutex);
852         with_intel_runtime_pm(i915, wakeref)
853                 __i915_gem_set_wedged(i915);
854         mutex_unlock(&error->wedge_mutex);
855 }
856
857 static bool __i915_gem_unset_wedged(struct drm_i915_private *i915)
858 {
859         struct i915_gpu_error *error = &i915->gpu_error;
860         struct i915_timeline *tl;
861
862         if (!test_bit(I915_WEDGED, &error->flags))
863                 return true;
864
865         if (!i915->gt.scratch) /* Never full initialised, recovery impossible */
866                 return false;
867
868         GEM_TRACE("start\n");
869
870         /*
871          * Before unwedging, make sure that all pending operations
872          * are flushed and errored out - we may have requests waiting upon
873          * third party fences. We marked all inflight requests as EIO, and
874          * every execbuf since returned EIO, for consistency we want all
875          * the currently pending requests to also be marked as EIO, which
876          * is done inside our nop_submit_request - and so we must wait.
877          *
878          * No more can be submitted until we reset the wedged bit.
879          */
880         mutex_lock(&i915->gt.timelines.mutex);
881         list_for_each_entry(tl, &i915->gt.timelines.active_list, link) {
882                 struct i915_request *rq;
883
884                 rq = i915_active_request_get_unlocked(&tl->last_request);
885                 if (!rq)
886                         continue;
887
888                 /*
889                  * All internal dependencies (i915_requests) will have
890                  * been flushed by the set-wedge, but we may be stuck waiting
891                  * for external fences. These should all be capped to 10s
892                  * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
893                  * in the worst case.
894                  */
895                 dma_fence_default_wait(&rq->fence, false, MAX_SCHEDULE_TIMEOUT);
896                 i915_request_put(rq);
897         }
898         mutex_unlock(&i915->gt.timelines.mutex);
899
900         intel_gt_sanitize(i915, false);
901
902         /*
903          * Undo nop_submit_request. We prevent all new i915 requests from
904          * being queued (by disallowing execbuf whilst wedged) so having
905          * waited for all active requests above, we know the system is idle
906          * and do not have to worry about a thread being inside
907          * engine->submit_request() as we swap over. So unlike installing
908          * the nop_submit_request on reset, we can do this from normal
909          * context and do not require stop_machine().
910          */
911         intel_engines_reset_default_submission(i915);
912
913         GEM_TRACE("end\n");
914
915         smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
916         clear_bit(I915_WEDGED, &i915->gpu_error.flags);
917
918         return true;
919 }
920
921 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
922 {
923         struct i915_gpu_error *error = &i915->gpu_error;
924         bool result;
925
926         mutex_lock(&error->wedge_mutex);
927         result = __i915_gem_unset_wedged(i915);
928         mutex_unlock(&error->wedge_mutex);
929
930         return result;
931 }
932
933 static int do_reset(struct drm_i915_private *i915,
934                     intel_engine_mask_t stalled_mask)
935 {
936         int err, i;
937
938         gt_revoke(i915);
939
940         err = intel_gpu_reset(i915, ALL_ENGINES);
941         for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
942                 msleep(10 * (i + 1));
943                 err = intel_gpu_reset(i915, ALL_ENGINES);
944         }
945         if (err)
946                 return err;
947
948         return gt_reset(i915, stalled_mask);
949 }
950
951 /**
952  * i915_reset - reset chip after a hang
953  * @i915: #drm_i915_private to reset
954  * @stalled_mask: mask of the stalled engines with the guilty requests
955  * @reason: user error message for why we are resetting
956  *
957  * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
958  * on failure.
959  *
960  * Procedure is fairly simple:
961  *   - reset the chip using the reset reg
962  *   - re-init context state
963  *   - re-init hardware status page
964  *   - re-init ring buffer
965  *   - re-init interrupt state
966  *   - re-init display
967  */
968 void i915_reset(struct drm_i915_private *i915,
969                 intel_engine_mask_t stalled_mask,
970                 const char *reason)
971 {
972         struct i915_gpu_error *error = &i915->gpu_error;
973         int ret;
974
975         GEM_TRACE("flags=%lx\n", error->flags);
976
977         might_sleep();
978         GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
979
980         /* Clear any previous failed attempts at recovery. Time to try again. */
981         if (!__i915_gem_unset_wedged(i915))
982                 return;
983
984         if (reason)
985                 dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
986         error->reset_count++;
987
988         reset_prepare(i915);
989
990         if (!intel_has_gpu_reset(i915)) {
991                 if (i915_modparams.reset)
992                         dev_err(i915->drm.dev, "GPU reset not supported\n");
993                 else
994                         DRM_DEBUG_DRIVER("GPU reset disabled\n");
995                 goto error;
996         }
997
998         if (INTEL_INFO(i915)->gpu_reset_clobbers_display)
999                 intel_runtime_pm_disable_interrupts(i915);
1000
1001         if (do_reset(i915, stalled_mask)) {
1002                 dev_err(i915->drm.dev, "Failed to reset chip\n");
1003                 goto taint;
1004         }
1005
1006         if (INTEL_INFO(i915)->gpu_reset_clobbers_display)
1007                 intel_runtime_pm_enable_interrupts(i915);
1008
1009         intel_overlay_reset(i915);
1010
1011         /*
1012          * Next we need to restore the context, but we don't use those
1013          * yet either...
1014          *
1015          * Ring buffer needs to be re-initialized in the KMS case, or if X
1016          * was running at the time of the reset (i.e. we weren't VT
1017          * switched away).
1018          */
1019         ret = i915_gem_init_hw(i915);
1020         if (ret) {
1021                 DRM_ERROR("Failed to initialise HW following reset (%d)\n",
1022                           ret);
1023                 goto error;
1024         }
1025
1026         i915_queue_hangcheck(i915);
1027
1028 finish:
1029         reset_finish(i915);
1030         return;
1031
1032 taint:
1033         /*
1034          * History tells us that if we cannot reset the GPU now, we
1035          * never will. This then impacts everything that is run
1036          * subsequently. On failing the reset, we mark the driver
1037          * as wedged, preventing further execution on the GPU.
1038          * We also want to go one step further and add a taint to the
1039          * kernel so that any subsequent faults can be traced back to
1040          * this failure. This is important for CI, where if the
1041          * GPU/driver fails we would like to reboot and restart testing
1042          * rather than continue on into oblivion. For everyone else,
1043          * the system should still plod along, but they have been warned!
1044          */
1045         add_taint_for_CI(TAINT_WARN);
1046 error:
1047         __i915_gem_set_wedged(i915);
1048         goto finish;
1049 }
1050
1051 static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
1052                                         struct intel_engine_cs *engine)
1053 {
1054         return intel_gpu_reset(i915, engine->mask);
1055 }
1056
1057 /**
1058  * i915_reset_engine - reset GPU engine to recover from a hang
1059  * @engine: engine to reset
1060  * @msg: reason for GPU reset; or NULL for no dev_notice()
1061  *
1062  * Reset a specific GPU engine. Useful if a hang is detected.
1063  * Returns zero on successful reset or otherwise an error code.
1064  *
1065  * Procedure is:
1066  *  - identifies the request that caused the hang and it is dropped
1067  *  - reset engine (which will force the engine to idle)
1068  *  - re-init/configure engine
1069  */
1070 int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
1071 {
1072         struct i915_gpu_error *error = &engine->i915->gpu_error;
1073         int ret;
1074
1075         GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
1076         GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
1077
1078         if (!intel_wakeref_active(&engine->wakeref))
1079                 return 0;
1080
1081         reset_prepare_engine(engine);
1082
1083         if (msg)
1084                 dev_notice(engine->i915->drm.dev,
1085                            "Resetting %s for %s\n", engine->name, msg);
1086         error->reset_engine_count[engine->id]++;
1087
1088         if (!engine->i915->guc.execbuf_client)
1089                 ret = intel_gt_reset_engine(engine->i915, engine);
1090         else
1091                 ret = intel_guc_reset_engine(&engine->i915->guc, engine);
1092         if (ret) {
1093                 /* If we fail here, we expect to fallback to a global reset */
1094                 DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
1095                                  engine->i915->guc.execbuf_client ? "GuC " : "",
1096                                  engine->name, ret);
1097                 goto out;
1098         }
1099
1100         /*
1101          * The request that caused the hang is stuck on elsp, we know the
1102          * active request and can drop it, adjust head to skip the offending
1103          * request to resume executing remaining requests in the queue.
1104          */
1105         intel_engine_reset(engine, true);
1106
1107         /*
1108          * The engine and its registers (and workarounds in case of render)
1109          * have been reset to their default values. Follow the init_ring
1110          * process to program RING_MODE, HWSP and re-enable submission.
1111          */
1112         ret = engine->resume(engine);
1113         if (ret)
1114                 goto out;
1115
1116 out:
1117         intel_engine_cancel_stop_cs(engine);
1118         reset_finish_engine(engine);
1119         return ret;
1120 }
1121
1122 static void i915_reset_device(struct drm_i915_private *i915,
1123                               u32 engine_mask,
1124                               const char *reason)
1125 {
1126         struct i915_gpu_error *error = &i915->gpu_error;
1127         struct kobject *kobj = &i915->drm.primary->kdev->kobj;
1128         char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
1129         char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
1130         char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
1131         struct i915_wedge_me w;
1132
1133         kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
1134
1135         DRM_DEBUG_DRIVER("resetting chip\n");
1136         kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
1137
1138         /* Use a watchdog to ensure that our reset completes */
1139         i915_wedge_on_timeout(&w, i915, 5 * HZ) {
1140                 intel_prepare_reset(i915);
1141
1142                 /* Flush everyone using a resource about to be clobbered */
1143                 synchronize_srcu_expedited(&error->reset_backoff_srcu);
1144
1145                 mutex_lock(&error->wedge_mutex);
1146                 i915_reset(i915, engine_mask, reason);
1147                 mutex_unlock(&error->wedge_mutex);
1148
1149                 intel_finish_reset(i915);
1150         }
1151
1152         if (!test_bit(I915_WEDGED, &error->flags))
1153                 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
1154 }
1155
1156 static void clear_register(struct intel_uncore *uncore, i915_reg_t reg)
1157 {
1158         intel_uncore_rmw(uncore, reg, 0, 0);
1159 }
1160
1161 void i915_clear_error_registers(struct drm_i915_private *i915)
1162 {
1163         struct intel_uncore *uncore = &i915->uncore;
1164         u32 eir;
1165
1166         if (!IS_GEN(i915, 2))
1167                 clear_register(uncore, PGTBL_ER);
1168
1169         if (INTEL_GEN(i915) < 4)
1170                 clear_register(uncore, IPEIR(RENDER_RING_BASE));
1171         else
1172                 clear_register(uncore, IPEIR_I965);
1173
1174         clear_register(uncore, EIR);
1175         eir = intel_uncore_read(uncore, EIR);
1176         if (eir) {
1177                 /*
1178                  * some errors might have become stuck,
1179                  * mask them.
1180                  */
1181                 DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir);
1182                 rmw_set(uncore, EMR, eir);
1183                 intel_uncore_write(uncore, GEN2_IIR,
1184                                    I915_MASTER_ERROR_INTERRUPT);
1185         }
1186
1187         if (INTEL_GEN(i915) >= 8) {
1188                 rmw_clear(uncore, GEN8_RING_FAULT_REG, RING_FAULT_VALID);
1189                 intel_uncore_posting_read(uncore, GEN8_RING_FAULT_REG);
1190         } else if (INTEL_GEN(i915) >= 6) {
1191                 struct intel_engine_cs *engine;
1192                 enum intel_engine_id id;
1193
1194                 for_each_engine(engine, i915, id) {
1195                         rmw_clear(uncore,
1196                                   RING_FAULT_REG(engine), RING_FAULT_VALID);
1197                         intel_uncore_posting_read(uncore,
1198                                                   RING_FAULT_REG(engine));
1199                 }
1200         }
1201 }
1202
1203 /**
1204  * i915_handle_error - handle a gpu error
1205  * @i915: i915 device private
1206  * @engine_mask: mask representing engines that are hung
1207  * @flags: control flags
1208  * @fmt: Error message format string
1209  *
1210  * Do some basic checking of register state at error time and
1211  * dump it to the syslog.  Also call i915_capture_error_state() to make
1212  * sure we get a record and make it available in debugfs.  Fire a uevent
1213  * so userspace knows something bad happened (should trigger collection
1214  * of a ring dump etc.).
1215  */
1216 void i915_handle_error(struct drm_i915_private *i915,
1217                        intel_engine_mask_t engine_mask,
1218                        unsigned long flags,
1219                        const char *fmt, ...)
1220 {
1221         struct i915_gpu_error *error = &i915->gpu_error;
1222         struct intel_engine_cs *engine;
1223         intel_wakeref_t wakeref;
1224         intel_engine_mask_t tmp;
1225         char error_msg[80];
1226         char *msg = NULL;
1227
1228         if (fmt) {
1229                 va_list args;
1230
1231                 va_start(args, fmt);
1232                 vscnprintf(error_msg, sizeof(error_msg), fmt, args);
1233                 va_end(args);
1234
1235                 msg = error_msg;
1236         }
1237
1238         /*
1239          * In most cases it's guaranteed that we get here with an RPM
1240          * reference held, for example because there is a pending GPU
1241          * request that won't finish until the reset is done. This
1242          * isn't the case at least when we get here by doing a
1243          * simulated reset via debugfs, so get an RPM reference.
1244          */
1245         wakeref = intel_runtime_pm_get(i915);
1246
1247         engine_mask &= INTEL_INFO(i915)->engine_mask;
1248
1249         if (flags & I915_ERROR_CAPTURE) {
1250                 i915_capture_error_state(i915, engine_mask, msg);
1251                 i915_clear_error_registers(i915);
1252         }
1253
1254         /*
1255          * Try engine reset when available. We fall back to full reset if
1256          * single reset fails.
1257          */
1258         if (intel_has_reset_engine(i915) && !__i915_wedged(error)) {
1259                 for_each_engine_masked(engine, i915, engine_mask, tmp) {
1260                         BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
1261                         if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1262                                              &error->flags))
1263                                 continue;
1264
1265                         if (i915_reset_engine(engine, msg) == 0)
1266                                 engine_mask &= ~engine->mask;
1267
1268                         clear_bit(I915_RESET_ENGINE + engine->id,
1269                                   &error->flags);
1270                         wake_up_bit(&error->flags,
1271                                     I915_RESET_ENGINE + engine->id);
1272                 }
1273         }
1274
1275         if (!engine_mask)
1276                 goto out;
1277
1278         /* Full reset needs the mutex, stop any other user trying to do so. */
1279         if (test_and_set_bit(I915_RESET_BACKOFF, &error->flags)) {
1280                 wait_event(error->reset_queue,
1281                            !test_bit(I915_RESET_BACKOFF, &error->flags));
1282                 goto out; /* piggy-back on the other reset */
1283         }
1284
1285         /* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */
1286         synchronize_rcu_expedited();
1287
1288         /* Prevent any other reset-engine attempt. */
1289         for_each_engine(engine, i915, tmp) {
1290                 while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1291                                         &error->flags))
1292                         wait_on_bit(&error->flags,
1293                                     I915_RESET_ENGINE + engine->id,
1294                                     TASK_UNINTERRUPTIBLE);
1295         }
1296
1297         i915_reset_device(i915, engine_mask, msg);
1298
1299         for_each_engine(engine, i915, tmp) {
1300                 clear_bit(I915_RESET_ENGINE + engine->id,
1301                           &error->flags);
1302         }
1303
1304         clear_bit(I915_RESET_BACKOFF, &error->flags);
1305         wake_up_all(&error->reset_queue);
1306
1307 out:
1308         intel_runtime_pm_put(i915, wakeref);
1309 }
1310
1311 int i915_reset_trylock(struct drm_i915_private *i915)
1312 {
1313         struct i915_gpu_error *error = &i915->gpu_error;
1314         int srcu;
1315
1316         might_lock(&error->reset_backoff_srcu);
1317         might_sleep();
1318
1319         rcu_read_lock();
1320         while (test_bit(I915_RESET_BACKOFF, &error->flags)) {
1321                 rcu_read_unlock();
1322
1323                 if (wait_event_interruptible(error->reset_queue,
1324                                              !test_bit(I915_RESET_BACKOFF,
1325                                                        &error->flags)))
1326                         return -EINTR;
1327
1328                 rcu_read_lock();
1329         }
1330         srcu = srcu_read_lock(&error->reset_backoff_srcu);
1331         rcu_read_unlock();
1332
1333         return srcu;
1334 }
1335
1336 void i915_reset_unlock(struct drm_i915_private *i915, int tag)
1337 __releases(&i915->gpu_error.reset_backoff_srcu)
1338 {
1339         struct i915_gpu_error *error = &i915->gpu_error;
1340
1341         srcu_read_unlock(&error->reset_backoff_srcu, tag);
1342 }
1343
1344 int i915_terminally_wedged(struct drm_i915_private *i915)
1345 {
1346         struct i915_gpu_error *error = &i915->gpu_error;
1347
1348         might_sleep();
1349
1350         if (!__i915_wedged(error))
1351                 return 0;
1352
1353         /* Reset still in progress? Maybe we will recover? */
1354         if (!test_bit(I915_RESET_BACKOFF, &error->flags))
1355                 return -EIO;
1356
1357         /* XXX intel_reset_finish() still takes struct_mutex!!! */
1358         if (mutex_is_locked(&i915->drm.struct_mutex))
1359                 return -EAGAIN;
1360
1361         if (wait_event_interruptible(error->reset_queue,
1362                                      !test_bit(I915_RESET_BACKOFF,
1363                                                &error->flags)))
1364                 return -EINTR;
1365
1366         return __i915_wedged(error) ? -EIO : 0;
1367 }
1368
1369 static void i915_wedge_me(struct work_struct *work)
1370 {
1371         struct i915_wedge_me *w = container_of(work, typeof(*w), work.work);
1372
1373         dev_err(w->i915->drm.dev,
1374                 "%s timed out, cancelling all in-flight rendering.\n",
1375                 w->name);
1376         i915_gem_set_wedged(w->i915);
1377 }
1378
1379 void __i915_init_wedge(struct i915_wedge_me *w,
1380                        struct drm_i915_private *i915,
1381                        long timeout,
1382                        const char *name)
1383 {
1384         w->i915 = i915;
1385         w->name = name;
1386
1387         INIT_DELAYED_WORK_ONSTACK(&w->work, i915_wedge_me);
1388         schedule_delayed_work(&w->work, timeout);
1389 }
1390
1391 void __i915_fini_wedge(struct i915_wedge_me *w)
1392 {
1393         cancel_delayed_work_sync(&w->work);
1394         destroy_delayed_work_on_stack(&w->work);
1395         w->i915 = NULL;
1396 }
1397
1398 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1399 #include "selftest_reset.c"
1400 #endif