/*
 * Copyright (c) 2008 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *    Keith Packard <keithp@keithp.com>
 *    Mika Kuoppala <mika.kuoppala@intel.com>
 *
 */

#include <linux/ascii85.h>
#include <linux/nmi.h>
#include <linux/scatterlist.h>
#include <linux/stop_machine.h>
#include <linux/utsname.h>
#include <linux/zlib.h>

#include <drm/drm_print.h>

#include "i915_drv.h"
#include "i915_gpu_error.h"
#include "intel_atomic.h"
#include "intel_csr.h"
#include "intel_overlay.h"

static inline const struct intel_engine_cs *
engine_lookup(const struct drm_i915_private *i915, unsigned int id)
{
        if (id >= I915_NUM_ENGINES)
                return NULL;

        return i915->engine[id];
}

static inline const char *
__engine_name(const struct intel_engine_cs *engine)
{
        return engine ? engine->name : "";
}

static const char *
engine_name(const struct drm_i915_private *i915, unsigned int id)
{
        return __engine_name(engine_lookup(i915, id));
}

static const char *tiling_flag(int tiling)
{
        switch (tiling) {
        default:
        case I915_TILING_NONE: return "";
        case I915_TILING_X: return " X";
        case I915_TILING_Y: return " Y";
        }
}

static const char *dirty_flag(int dirty)
{
        return dirty ? " dirty" : "";
}

static const char *purgeable_flag(int purgeable)
{
        return purgeable ? " purgeable" : "";
}

static void __sg_set_buf(struct scatterlist *sg,
                         void *addr, unsigned int len, loff_t it)
{
        sg->page_link = (unsigned long)virt_to_page(addr);
        sg->offset = offset_in_page(addr);
        sg->length = len;
        sg->dma_address = it;
}
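
/*
 * Note: these scatterlists are never DMA-mapped. dma_address is repurposed
 * to record the logical offset of this fragment within the formatted error
 * file, which i915_gpu_state_copy_to_buffer() later uses to seek.
 */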

static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
{
        if (!len)
                return false;

        if (e->bytes + len + 1 <= e->size)
                return true;

        if (e->bytes) {
                __sg_set_buf(e->cur++, e->buf, e->bytes, e->iter);
                e->iter += e->bytes;
                e->buf = NULL;
                e->bytes = 0;
        }

        if (e->cur == e->end) {
                struct scatterlist *sgl;

                sgl = (typeof(sgl))__get_free_page(GFP_KERNEL);
                if (!sgl) {
                        e->err = -ENOMEM;
                        return false;
                }

                if (e->cur) {
                        e->cur->offset = 0;
                        e->cur->length = 0;
                        e->cur->page_link =
                                (unsigned long)sgl | SG_CHAIN;
                } else {
                        e->sgl = sgl;
                }

                e->cur = sgl;
                e->end = sgl + SG_MAX_SINGLE_ALLOC - 1;
        }

        e->size = ALIGN(len + 1, SZ_64K);
        e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
        if (!e->buf) {
                e->size = PAGE_ALIGN(len + 1);
                e->buf = kmalloc(e->size, GFP_KERNEL);
        }
        if (!e->buf) {
                e->err = -ENOMEM;
                return false;
        }

        return true;
}
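
/*
 * A sketch of the layout this builds: each filled text buffer is sealed
 * into one scatterlist entry, and when a page's worth of entries is
 * exhausted a fresh page is linked in via SG_CHAIN:
 *
 *	sgl: [buf0][buf1] ... [SG_CHAIN] ---> [bufN][bufN+1] ... [last]
 *
 * New buffers are optimistically sized up to a 64KiB multiple (without
 * warning or retry), falling back to a page-aligned GFP_KERNEL allocation
 * before giving up with -ENOMEM.
 */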

__printf(2, 0)
static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
                               const char *fmt, va_list args)
{
        va_list ap;
        int len;

        if (e->err)
                return;

        va_copy(ap, args);
        len = vsnprintf(NULL, 0, fmt, ap);
        va_end(ap);
        if (len <= 0) {
                e->err = len;
                return;
        }

        if (!__i915_error_grow(e, len))
                return;

        GEM_BUG_ON(e->bytes >= e->size);
        len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args);
        if (len < 0) {
                e->err = len;
                return;
        }
        e->bytes += len;
}

static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str)
{
        unsigned len;

        if (e->err || !str)
                return;

        len = strlen(str);
        if (!__i915_error_grow(e, len))
                return;

        GEM_BUG_ON(e->bytes + len > e->size);
        memcpy(e->buf + e->bytes, str, len);
        e->bytes += len;
}

#define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
#define err_puts(e, s) i915_error_puts(e, s)

static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
{
        i915_error_vprintf(p->arg, vaf->fmt, *vaf->va);
}

static inline struct drm_printer
i915_error_printer(struct drm_i915_error_state_buf *e)
{
        struct drm_printer p = {
                .printfn = __i915_printfn_error,
                .arg = e,
        };
        return p;
}
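
/*
 * i915_error_printer() adapts the error buffer to the generic drm_printer
 * interface, so existing dump helpers can write straight into the error
 * state. Typical use (as in err_print_params() below):
 *
 *	struct drm_printer p = i915_error_printer(m);
 *
 *	i915_params_dump(params, &p);
 */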

#ifdef CONFIG_DRM_I915_COMPRESS_ERROR

struct compress {
        struct z_stream_s zstream;
        void *tmp;
};

static bool compress_init(struct compress *c)
{
        struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream));

        zstream->workspace =
                kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
                        GFP_ATOMIC | __GFP_NOWARN);
        if (!zstream->workspace)
                return false;

        if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
                kfree(zstream->workspace);
                return false;
        }

        c->tmp = NULL;
        if (i915_has_memcpy_from_wc())
                c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);

        return true;
}

static void *compress_next_page(struct drm_i915_error_object *dst)
{
        unsigned long page;

        if (dst->page_count >= dst->num_pages)
                return ERR_PTR(-ENOSPC);

        page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
        if (!page)
                return ERR_PTR(-ENOMEM);

        return dst->pages[dst->page_count++] = (void *)page;
}

static int compress_page(struct compress *c,
                         void *src,
                         struct drm_i915_error_object *dst)
{
        struct z_stream_s *zstream = &c->zstream;

        zstream->next_in = src;
        if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
                zstream->next_in = c->tmp;
        zstream->avail_in = PAGE_SIZE;

        do {
                if (zstream->avail_out == 0) {
                        zstream->next_out = compress_next_page(dst);
                        if (IS_ERR(zstream->next_out))
                                return PTR_ERR(zstream->next_out);

                        zstream->avail_out = PAGE_SIZE;
                }

                if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
                        return -EIO;

                touch_nmi_watchdog();
        } while (zstream->avail_in);

        /* Fall back to uncompressed if compression grew the size? (disabled) */
        if (0 && zstream->total_out > zstream->total_in)
                return -E2BIG;

        return 0;
}

static int compress_flush(struct compress *c,
                          struct drm_i915_error_object *dst)
{
        struct z_stream_s *zstream = &c->zstream;

        do {
                switch (zlib_deflate(zstream, Z_FINISH)) {
                case Z_OK: /* more space requested */
                        zstream->next_out = compress_next_page(dst);
                        if (IS_ERR(zstream->next_out))
                                return PTR_ERR(zstream->next_out);

                        zstream->avail_out = PAGE_SIZE;
                        break;

                case Z_STREAM_END:
                        goto end;

                default: /* any error */
                        return -EIO;
                }
        } while (1);

end:
        memset(zstream->next_out, 0, zstream->avail_out);
        dst->unused = zstream->avail_out;
        return 0;
}

static void compress_fini(struct compress *c,
                          struct drm_i915_error_object *dst)
{
        struct z_stream_s *zstream = &c->zstream;

        zlib_deflateEnd(zstream);
        kfree(zstream->workspace);
        if (c->tmp)
                free_page((unsigned long)c->tmp);
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
        err_puts(m, ":");
}

#else

struct compress {
};

static bool compress_init(struct compress *c)
{
        return true;
}

static int compress_page(struct compress *c,
                         void *src,
                         struct drm_i915_error_object *dst)
{
        unsigned long page;
        void *ptr;

        page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
        if (!page)
                return -ENOMEM;

        ptr = (void *)page;
        if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
                memcpy(ptr, src, PAGE_SIZE);
        dst->pages[dst->page_count++] = ptr;

        return 0;
}

static int compress_flush(struct compress *c,
                          struct drm_i915_error_object *dst)
{
        return 0;
}

static void compress_fini(struct compress *c,
                          struct drm_i915_error_object *dst)
{
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
        err_puts(m, "~");
}

#endif
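
/*
 * The marker emitted before each object's payload tells userspace decoders
 * which variant built the kernel: ':' means the pages that follow are
 * zlib-deflated (CONFIG_DRM_I915_COMPRESS_ERROR), '~' means they are raw.
 * Either way the bytes are ascii85-armoured by print_error_obj() below, so
 * a decoder first strips the ascii85 coding and then, for ':', inflates.
 */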

static void print_error_buffers(struct drm_i915_error_state_buf *m,
                                const char *name,
                                struct drm_i915_error_buffer *err,
                                int count)
{
        err_printf(m, "%s [%d]:\n", name, count);

        while (count--) {
                err_printf(m, "    %08x_%08x %8u %02x %02x",
                           upper_32_bits(err->gtt_offset),
                           lower_32_bits(err->gtt_offset),
                           err->size,
                           err->read_domains,
                           err->write_domain);
                err_puts(m, tiling_flag(err->tiling));
                err_puts(m, dirty_flag(err->dirty));
                err_puts(m, purgeable_flag(err->purgeable));
                err_puts(m, err->userptr ? " userptr" : "");
                err_puts(m, i915_cache_level_str(m->i915, err->cache_level));

                if (err->name)
                        err_printf(m, " (name: %d)", err->name);
                if (err->fence_reg != I915_FENCE_REG_NONE)
                        err_printf(m, " (fence: %d)", err->fence_reg);

                err_puts(m, "\n");
                err++;
        }
}

static void error_print_instdone(struct drm_i915_error_state_buf *m,
                                 const struct drm_i915_error_engine *ee)
{
        int slice;
        int subslice;

        err_printf(m, "  INSTDONE: 0x%08x\n",
                   ee->instdone.instdone);

        if (ee->engine_id != RCS0 || INTEL_GEN(m->i915) <= 3)
                return;

        err_printf(m, "  SC_INSTDONE: 0x%08x\n",
                   ee->instdone.slice_common);

        if (INTEL_GEN(m->i915) <= 6)
                return;

        for_each_instdone_slice_subslice(m->i915, slice, subslice)
                err_printf(m, "  SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
                           slice, subslice,
                           ee->instdone.sampler[slice][subslice]);

        for_each_instdone_slice_subslice(m->i915, slice, subslice)
                err_printf(m, "  ROW_INSTDONE[%d][%d]: 0x%08x\n",
                           slice, subslice,
                           ee->instdone.row[slice][subslice]);
}

static void error_print_request(struct drm_i915_error_state_buf *m,
                                const char *prefix,
                                const struct drm_i915_error_request *erq,
                                const unsigned long epoch)
{
        if (!erq->seqno)
                return;

        err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
                   prefix, erq->pid, erq->context, erq->seqno,
                   test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
                            &erq->flags) ? "!" : "",
                   test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
                            &erq->flags) ? "+" : "",
                   erq->sched_attr.priority,
                   jiffies_to_msecs(erq->jiffies - epoch),
                   erq->start, erq->head, erq->tail);
}
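
/*
 * An illustrative line as formatted above (all values made up): a signalled
 * ("!") request on fence context 0x21 might print as
 *
 *	ELSP[0]:  pid 1042, seqno       21:00000032!, prio 2, emitted 16ms, start 00a3f000, head 00000128, tail 00000168
 *
 * where "!" marks DMA_FENCE_FLAG_SIGNALED_BIT and "+" would mark
 * DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT.
 */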

static void error_print_context(struct drm_i915_error_state_buf *m,
                                const char *header,
                                const struct drm_i915_error_context *ctx)
{
        err_printf(m, "%s%s[%d] hw_id %d, prio %d, guilty %d active %d\n",
                   header, ctx->comm, ctx->pid, ctx->hw_id,
                   ctx->sched_attr.priority, ctx->guilty, ctx->active);
}

static void error_print_engine(struct drm_i915_error_state_buf *m,
                               const struct drm_i915_error_engine *ee,
                               const unsigned long epoch)
{
        int n;

        err_printf(m, "%s command stream:\n",
                   engine_name(m->i915, ee->engine_id));
        err_printf(m, "  IDLE?: %s\n", yesno(ee->idle));
        err_printf(m, "  START: 0x%08x\n", ee->start);
        err_printf(m, "  HEAD:  0x%08x [0x%08x]\n", ee->head, ee->rq_head);
        err_printf(m, "  TAIL:  0x%08x [0x%08x, 0x%08x]\n",
                   ee->tail, ee->rq_post, ee->rq_tail);
        err_printf(m, "  CTL:   0x%08x\n", ee->ctl);
        err_printf(m, "  MODE:  0x%08x\n", ee->mode);
        err_printf(m, "  HWS:   0x%08x\n", ee->hws);
        err_printf(m, "  ACTHD: 0x%08x %08x\n",
                   (u32)(ee->acthd>>32), (u32)ee->acthd);
        err_printf(m, "  IPEIR: 0x%08x\n", ee->ipeir);
        err_printf(m, "  IPEHR: 0x%08x\n", ee->ipehr);

        error_print_instdone(m, ee);

        if (ee->batchbuffer) {
                u64 start = ee->batchbuffer->gtt_offset;
                u64 end = start + ee->batchbuffer->gtt_size;

                err_printf(m, "  batch: [0x%08x_%08x, 0x%08x_%08x]\n",
                           upper_32_bits(start), lower_32_bits(start),
                           upper_32_bits(end), lower_32_bits(end));
        }
        if (INTEL_GEN(m->i915) >= 4) {
                err_printf(m, "  BBADDR: 0x%08x_%08x\n",
                           (u32)(ee->bbaddr>>32), (u32)ee->bbaddr);
                err_printf(m, "  BB_STATE: 0x%08x\n", ee->bbstate);
                err_printf(m, "  INSTPS: 0x%08x\n", ee->instps);
        }
        err_printf(m, "  INSTPM: 0x%08x\n", ee->instpm);
        err_printf(m, "  FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr),
                   lower_32_bits(ee->faddr));
        if (INTEL_GEN(m->i915) >= 6) {
                err_printf(m, "  RC PSMI: 0x%08x\n", ee->rc_psmi);
                err_printf(m, "  FAULT_REG: 0x%08x\n", ee->fault_reg);
        }
        if (HAS_PPGTT(m->i915)) {
                err_printf(m, "  GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);

                if (INTEL_GEN(m->i915) >= 8) {
                        int i;
                        for (i = 0; i < 4; i++)
                                err_printf(m, "  PDP%d: 0x%016llx\n",
                                           i, ee->vm_info.pdp[i]);
                } else {
                        err_printf(m, "  PP_DIR_BASE: 0x%08x\n",
                                   ee->vm_info.pp_dir_base);
                }
        }
        err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
        err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
        err_printf(m, "  hangcheck timestamp: %dms (%lu%s)\n",
                   jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
                   ee->hangcheck_timestamp,
                   ee->hangcheck_timestamp == epoch ? "; epoch" : "");
        err_printf(m, "  engine reset count: %u\n", ee->reset_count);

        for (n = 0; n < ee->num_ports; n++) {
                err_printf(m, "  ELSP[%d]:", n);
                error_print_request(m, " ", &ee->execlist[n], epoch);
        }

        error_print_context(m, "  Active context: ", &ee->context);
}

void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{
        va_list args;

        va_start(args, f);
        i915_error_vprintf(e, f, args);
        va_end(args);
}

static void print_error_obj(struct drm_i915_error_state_buf *m,
                            struct intel_engine_cs *engine,
                            const char *name,
                            struct drm_i915_error_object *obj)
{
        char out[ASCII85_BUFSZ];
        int page;

        if (!obj)
                return;

        if (name) {
                err_printf(m, "%s --- %s = 0x%08x %08x\n",
                           engine ? engine->name : "global", name,
                           upper_32_bits(obj->gtt_offset),
                           lower_32_bits(obj->gtt_offset));
        }

        err_compression_marker(m);
        for (page = 0; page < obj->page_count; page++) {
                int i, len;

                len = PAGE_SIZE;
                if (page == obj->page_count - 1)
                        len -= obj->unused;
                len = ascii85_encode_len(len);

                for (i = 0; i < len; i++)
                        err_puts(m, ascii85_encode(obj->pages[page][i], out));
        }
        err_puts(m, "\n");
}
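
/*
 * Each captured page is thus emitted as ascii85_encode_len(len) groups of
 * five characters (or a single 'z' for an all-zero word), 32 bits of
 * payload per group; only the final page is trimmed by obj->unused, the
 * slack left behind by compress_flush(). The whole object lands on one
 * line, terminated by '\n'.
 */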

static void err_print_capabilities(struct drm_i915_error_state_buf *m,
                                   const struct intel_device_info *info,
                                   const struct intel_runtime_info *runtime,
                                   const struct intel_driver_caps *caps)
{
        struct drm_printer p = i915_error_printer(m);

        intel_device_info_dump_flags(info, &p);
        intel_driver_caps_print(caps, &p);
        intel_device_info_dump_topology(&runtime->sseu, &p);
}

static void err_print_params(struct drm_i915_error_state_buf *m,
                             const struct i915_params *params)
{
        struct drm_printer p = i915_error_printer(m);

        i915_params_dump(params, &p);
}

static void err_print_pciid(struct drm_i915_error_state_buf *m,
                            struct drm_i915_private *i915)
{
        struct pci_dev *pdev = i915->drm.pdev;

        err_printf(m, "PCI ID: 0x%04x\n", pdev->device);
        err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision);
        err_printf(m, "PCI Subsystem: %04x:%04x\n",
                   pdev->subsystem_vendor,
                   pdev->subsystem_device);
}

static void err_print_uc(struct drm_i915_error_state_buf *m,
                         const struct i915_error_uc *error_uc)
{
        struct drm_printer p = i915_error_printer(m);
        const struct i915_gpu_state *error =
                container_of(error_uc, typeof(*error), uc);

        if (!error->device_info.has_guc)
                return;

        intel_uc_fw_dump(&error_uc->guc_fw, &p);
        intel_uc_fw_dump(&error_uc->huc_fw, &p);
        print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log);
}

static void err_free_sgl(struct scatterlist *sgl)
{
        while (sgl) {
                struct scatterlist *sg;

                for (sg = sgl; !sg_is_chain(sg); sg++) {
                        kfree(sg_virt(sg));
                        if (sg_is_last(sg))
                                break;
                }

                sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg);
                free_page((unsigned long)sgl);
                sgl = sg;
        }
}

static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
                               struct i915_gpu_state *error)
{
        struct drm_i915_error_object *obj;
        struct timespec64 ts;
        int i, j;

        if (*error->error_msg)
                err_printf(m, "%s\n", error->error_msg);
        err_printf(m, "Kernel: %s %s\n",
                   init_utsname()->release,
                   init_utsname()->machine);
        ts = ktime_to_timespec64(error->time);
        err_printf(m, "Time: %lld s %ld us\n",
                   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
        ts = ktime_to_timespec64(error->boottime);
        err_printf(m, "Boottime: %lld s %ld us\n",
                   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
        ts = ktime_to_timespec64(error->uptime);
        err_printf(m, "Uptime: %lld s %ld us\n",
                   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
        err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ);
        err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n",
                   error->capture,
                   jiffies_to_msecs(jiffies - error->capture),
                   jiffies_to_msecs(error->capture - error->epoch));

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                if (!error->engine[i].context.pid)
                        continue;

                err_printf(m, "Active process (on ring %s): %s [%d]\n",
                           engine_name(m->i915, i),
                           error->engine[i].context.comm,
                           error->engine[i].context.pid);
        }
        err_printf(m, "Reset count: %u\n", error->reset_count);
        err_printf(m, "Suspend count: %u\n", error->suspend_count);
        err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
        err_printf(m, "Subplatform: 0x%x\n",
                   intel_subplatform(&error->runtime_info,
                                     error->device_info.platform));
        err_print_pciid(m, m->i915);

        err_printf(m, "IOMMU enabled?: %d\n", error->iommu);

        if (HAS_CSR(m->i915)) {
                struct intel_csr *csr = &m->i915->csr;

                err_printf(m, "DMC loaded: %s\n",
                           yesno(csr->dmc_payload != NULL));
                err_printf(m, "DMC fw version: %d.%d\n",
                           CSR_VERSION_MAJOR(csr->version),
                           CSR_VERSION_MINOR(csr->version));
        }

        err_printf(m, "GT awake: %s\n", yesno(error->awake));
        err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock));
        err_printf(m, "PM suspended: %s\n", yesno(error->suspended));
        err_printf(m, "EIR: 0x%08x\n", error->eir);
        err_printf(m, "IER: 0x%08x\n", error->ier);
        for (i = 0; i < error->ngtier; i++)
                err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]);
        err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
        err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
        err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
        err_printf(m, "CCID: 0x%08x\n", error->ccid);

        for (i = 0; i < error->nfence; i++)
                err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);

        if (INTEL_GEN(m->i915) >= 6) {
                err_printf(m, "ERROR: 0x%08x\n", error->error);

                if (INTEL_GEN(m->i915) >= 8)
                        err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
                                   error->fault_data1, error->fault_data0);

                err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg);
        }

        if (IS_GEN(m->i915, 7))
                err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                if (error->engine[i].engine_id != -1)
                        error_print_engine(m, &error->engine[i], error->epoch);
        }

        for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) {
                char buf[128];
                int len, first = 1;

                if (!error->active_vm[i])
                        break;

                len = scnprintf(buf, sizeof(buf), "Active (");
                for (j = 0; j < ARRAY_SIZE(error->engine); j++) {
                        if (error->engine[j].vm != error->active_vm[i])
                                continue;

                        len += scnprintf(buf + len, sizeof(buf) - len, "%s%s",
                                         first ? "" : ", ",
                                         m->i915->engine[j]->name);
                        first = 0;
                }
                scnprintf(buf + len, sizeof(buf) - len, ")");
                print_error_buffers(m, buf,
                                    error->active_bo[i],
                                    error->active_bo_count[i]);
        }

        print_error_buffers(m, "Pinned (global)",
                            error->pinned_bo,
                            error->pinned_bo_count);

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                const struct drm_i915_error_engine *ee = &error->engine[i];

                obj = ee->batchbuffer;
                if (obj) {
                        err_puts(m, m->i915->engine[i]->name);
                        if (ee->context.pid)
                                err_printf(m, " (submitted by %s [%d])",
                                           ee->context.comm,
                                           ee->context.pid);
                        err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
                                   upper_32_bits(obj->gtt_offset),
                                   lower_32_bits(obj->gtt_offset));
                        print_error_obj(m, m->i915->engine[i], NULL, obj);
                }

                for (j = 0; j < ee->user_bo_count; j++)
                        print_error_obj(m, m->i915->engine[i],
                                        "user", ee->user_bo[j]);

                if (ee->num_requests) {
                        err_printf(m, "%s --- %d requests\n",
                                   m->i915->engine[i]->name,
                                   ee->num_requests);
                        for (j = 0; j < ee->num_requests; j++)
                                error_print_request(m, " ",
                                                    &ee->requests[j],
                                                    error->epoch);
                }

                print_error_obj(m, m->i915->engine[i],
                                "ringbuffer", ee->ringbuffer);

                print_error_obj(m, m->i915->engine[i],
                                "HW Status", ee->hws_page);

                print_error_obj(m, m->i915->engine[i],
                                "HW context", ee->ctx);

                print_error_obj(m, m->i915->engine[i],
                                "WA context", ee->wa_ctx);

                print_error_obj(m, m->i915->engine[i],
                                "WA batchbuffer", ee->wa_batchbuffer);

                print_error_obj(m, m->i915->engine[i],
                                "NULL context", ee->default_state);
        }

        if (error->overlay)
                intel_overlay_print_error_state(m, error->overlay);

        if (error->display)
                intel_display_print_error_state(m, error->display);

        err_print_capabilities(m, &error->device_info, &error->runtime_info,
                               &error->driver_caps);
        err_print_params(m, &error->params);
        err_print_uc(m, &error->uc);
}

static int err_print_to_sgl(struct i915_gpu_state *error)
{
        struct drm_i915_error_state_buf m;

        if (IS_ERR(error))
                return PTR_ERR(error);

        if (READ_ONCE(error->sgl))
                return 0;

        memset(&m, 0, sizeof(m));
        m.i915 = error->i915;

        __err_print_to_sgl(&m, error);

        if (m.buf) {
                __sg_set_buf(m.cur++, m.buf, m.bytes, m.iter);
                m.bytes = 0;
                m.buf = NULL;
        }
        if (m.cur) {
                GEM_BUG_ON(m.end < m.cur);
                sg_mark_end(m.cur - 1);
        }
        GEM_BUG_ON(m.sgl && !m.cur);

        if (m.err) {
                err_free_sgl(m.sgl);
                return m.err;
        }

        if (cmpxchg(&error->sgl, NULL, m.sgl))
                err_free_sgl(m.sgl);

        return 0;
}

ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error,
                                      char *buf, loff_t off, size_t rem)
{
        struct scatterlist *sg;
        size_t count;
        loff_t pos;
        int err;

        if (!error || !rem)
                return 0;

        err = err_print_to_sgl(error);
        if (err)
                return err;

        sg = READ_ONCE(error->fit);
        if (!sg || off < sg->dma_address)
                sg = error->sgl;
        if (!sg)
                return 0;

        pos = sg->dma_address;
        count = 0;
        do {
                size_t len, start;

                if (sg_is_chain(sg)) {
                        sg = sg_chain_ptr(sg);
                        GEM_BUG_ON(sg_is_chain(sg));
                }

                len = sg->length;
                if (pos + len <= off) {
                        pos += len;
                        continue;
                }

                start = sg->offset;
                if (pos < off) {
                        GEM_BUG_ON(off - pos > len);
                        len -= off - pos;
                        start += off - pos;
                        pos = off;
                }

                len = min(len, rem);
                GEM_BUG_ON(!len || len > sg->length);

                memcpy(buf, page_address(sg_page(sg)) + start, len);

                count += len;
                pos += len;

                buf += len;
                rem -= len;
                if (!rem) {
                        WRITE_ONCE(error->fit, sg);
                        break;
                }
        } while (!sg_is_last(sg++));

        return count;
}
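
/*
 * A minimal consumer sketch (error handling elided): read the formatted
 * record in fixed-size chunks, advancing the offset by however many bytes
 * each call copied. Sequential reads like this hit the error->fit cache
 * above instead of rescanning the chain from the start on every call.
 *
 *	char chunk[SZ_4K];
 *	loff_t pos = 0;
 *	ssize_t n;
 *
 *	while ((n = i915_gpu_state_copy_to_buffer(error, chunk, pos,
 *						  sizeof(chunk))) > 0)
 *		pos += n;	// consume chunk[0..n) before the next call
 */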

static void i915_error_object_free(struct drm_i915_error_object *obj)
{
        int page;

        if (obj == NULL)
                return;

        for (page = 0; page < obj->page_count; page++)
                free_page((unsigned long)obj->pages[page]);

        kfree(obj);
}

static void cleanup_params(struct i915_gpu_state *error)
{
        i915_params_free(&error->params);
}

static void cleanup_uc_state(struct i915_gpu_state *error)
{
        struct i915_error_uc *error_uc = &error->uc;

        kfree(error_uc->guc_fw.path);
        kfree(error_uc->huc_fw.path);
        i915_error_object_free(error_uc->guc_log);
}

void __i915_gpu_state_free(struct kref *error_ref)
{
        struct i915_gpu_state *error =
                container_of(error_ref, typeof(*error), ref);
        long i, j;

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                struct drm_i915_error_engine *ee = &error->engine[i];

                for (j = 0; j < ee->user_bo_count; j++)
                        i915_error_object_free(ee->user_bo[j]);
                kfree(ee->user_bo);

                i915_error_object_free(ee->batchbuffer);
                i915_error_object_free(ee->wa_batchbuffer);
                i915_error_object_free(ee->ringbuffer);
                i915_error_object_free(ee->hws_page);
                i915_error_object_free(ee->ctx);
                i915_error_object_free(ee->wa_ctx);

                kfree(ee->requests);
        }

        for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
                kfree(error->active_bo[i]);
        kfree(error->pinned_bo);

        kfree(error->overlay);
        kfree(error->display);

        cleanup_params(error);
        cleanup_uc_state(error);

        err_free_sgl(error->sgl);
        kfree(error);
}

static struct drm_i915_error_object *
i915_error_object_create(struct drm_i915_private *i915,
                         struct i915_vma *vma)
{
        struct i915_ggtt *ggtt = &i915->ggtt;
        const u64 slot = ggtt->error_capture.start;
        struct drm_i915_error_object *dst;
        struct compress compress;
        unsigned long num_pages;
        struct sgt_iter iter;
        dma_addr_t dma;
        int ret;

        if (!vma || !vma->pages)
                return NULL;

        num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
        num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worst-case zlib growth */
        dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
                      GFP_ATOMIC | __GFP_NOWARN);
        if (!dst)
                return NULL;

        dst->gtt_offset = vma->node.start;
        dst->gtt_size = vma->node.size;
        dst->num_pages = num_pages;
        dst->page_count = 0;
        dst->unused = 0;

        if (!compress_init(&compress)) {
                kfree(dst);
                return NULL;
        }

        ret = -EINVAL;
        for_each_sgt_dma(dma, iter, vma->pages) {
                void __iomem *s;

                ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);

                s = io_mapping_map_atomic_wc(&ggtt->iomap, slot);
                ret = compress_page(&compress, (void __force *)s, dst);
                io_mapping_unmap_atomic(s);
                if (ret)
                        break;
        }

        if (ret || compress_flush(&compress, dst)) {
                while (dst->page_count--)
                        free_page((unsigned long)dst->pages[dst->page_count]);
                kfree(dst);
                dst = NULL;
        }

        compress_fini(&compress, dst);
        return dst;
}
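
/*
 * Note the capture path above never dereferences the object directly:
 * each backing page is bound, one at a time, into the reserved
 * error_capture slot in the GGTT and read back through a short-lived
 * atomic WC mapping, which keeps the routine usable from the atomic
 * reset/hang contexts in which errors are captured.
 */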

static void capture_bo(struct drm_i915_error_buffer *err,
                       struct i915_vma *vma)
{
        struct drm_i915_gem_object *obj = vma->obj;

        err->size = obj->base.size;
        err->name = obj->base.name;

        err->gtt_offset = vma->node.start;
        err->read_domains = obj->read_domains;
        err->write_domain = obj->write_domain;
        err->fence_reg = vma->fence ? vma->fence->id : -1;
        err->tiling = i915_gem_object_get_tiling(obj);
        err->dirty = obj->mm.dirty;
        err->purgeable = obj->mm.madv != I915_MADV_WILLNEED;
        err->userptr = obj->userptr.mm != NULL;
        err->cache_level = obj->cache_level;
}

static u32 capture_error_bo(struct drm_i915_error_buffer *err,
                            int count, struct list_head *head,
                            unsigned int flags)
#define ACTIVE_ONLY BIT(0)
#define PINNED_ONLY BIT(1)
{
        struct i915_vma *vma;
        int i = 0;

        list_for_each_entry(vma, head, vm_link) {
                if (!vma->obj)
                        continue;

                if (flags & ACTIVE_ONLY && !i915_vma_is_active(vma))
                        continue;

                if (flags & PINNED_ONLY && !i915_vma_is_pinned(vma))
                        continue;

                capture_bo(err++, vma);
                if (++i == count)
                        break;
        }

        return i;
}

/*
 * Generate a semi-unique error code. The code is not meant to have meaning;
 * its only purpose is to try to prevent false duplicated bug reports by
 * grossly estimating a GPU error state.
 *
 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
 * the hang if we could strip the GTT offset information from it.
 *
 * It's only a small step better than a random number in its current form.
 */
static u32 i915_error_generate_code(struct i915_gpu_state *error,
                                    intel_engine_mask_t engine_mask)
{
        /*
         * IPEHR would be an ideal way to detect errors, as it's the gross
         * measure of "the command that hung." However, it contains some
         * very common synchronization commands which almost always appear
         * in hangs that are strictly a client bug. Use instdone to
         * differentiate those somewhat.
         */
        if (engine_mask) {
                /* ffs() is 1-based: bit 0 set selects engine[0] */
                struct drm_i915_error_engine *ee =
                        &error->engine[ffs(engine_mask) - 1];

                return ee->ipehr ^ ee->instdone.instdone;
        }

        return 0;
}
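
/*
 * For example (illustrative values only): a hang where IPEHR reads
 * 0x7a000004 (a PIPE_CONTROL header) and INSTDONE reads 0xffffffff would
 * be reported as code 0x7a000004 ^ 0xffffffff == 0x85fffffb.
 */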

static void gem_record_fences(struct i915_gpu_state *error)
{
        struct drm_i915_private *dev_priv = error->i915;
        int i;

        if (INTEL_GEN(dev_priv) >= 6) {
                for (i = 0; i < dev_priv->num_fence_regs; i++)
                        error->fence[i] = I915_READ64(FENCE_REG_GEN6_LO(i));
        } else if (INTEL_GEN(dev_priv) >= 4) {
                for (i = 0; i < dev_priv->num_fence_regs; i++)
                        error->fence[i] = I915_READ64(FENCE_REG_965_LO(i));
        } else {
                for (i = 0; i < dev_priv->num_fence_regs; i++)
                        error->fence[i] = I915_READ(FENCE_REG(i));
        }
        error->nfence = i;
}

static void error_record_engine_registers(struct i915_gpu_state *error,
                                          struct intel_engine_cs *engine,
                                          struct drm_i915_error_engine *ee)
{
        struct drm_i915_private *dev_priv = engine->i915;

        if (INTEL_GEN(dev_priv) >= 6) {
                ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL);
                if (INTEL_GEN(dev_priv) >= 8)
                        ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG);
                else
                        ee->fault_reg = I915_READ(RING_FAULT_REG(engine));
        }

        if (INTEL_GEN(dev_priv) >= 4) {
                ee->faddr = ENGINE_READ(engine, RING_DMA_FADD);
                ee->ipeir = ENGINE_READ(engine, RING_IPEIR);
                ee->ipehr = ENGINE_READ(engine, RING_IPEHR);
                ee->instps = ENGINE_READ(engine, RING_INSTPS);
                ee->bbaddr = ENGINE_READ(engine, RING_BBADDR);
                if (INTEL_GEN(dev_priv) >= 8) {
                        ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32;
                        ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32;
                }
                ee->bbstate = ENGINE_READ(engine, RING_BBSTATE);
        } else {
                ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX);
                ee->ipeir = ENGINE_READ(engine, IPEIR);
                ee->ipehr = ENGINE_READ(engine, IPEHR);
        }

        intel_engine_get_instdone(engine, &ee->instdone);

        ee->instpm = ENGINE_READ(engine, RING_INSTPM);
        ee->acthd = intel_engine_get_active_head(engine);
        ee->start = ENGINE_READ(engine, RING_START);
        ee->head = ENGINE_READ(engine, RING_HEAD);
        ee->tail = ENGINE_READ(engine, RING_TAIL);
        ee->ctl = ENGINE_READ(engine, RING_CTL);
        if (INTEL_GEN(dev_priv) > 2)
                ee->mode = ENGINE_READ(engine, RING_MI_MODE);

        if (!HWS_NEEDS_PHYSICAL(dev_priv)) {
                i915_reg_t mmio;

                if (IS_GEN(dev_priv, 7)) {
                        switch (engine->id) {
                        default:
                                MISSING_CASE(engine->id);
                                /* fall through */
                        case RCS0:
                                mmio = RENDER_HWS_PGA_GEN7;
                                break;
                        case BCS0:
                                mmio = BLT_HWS_PGA_GEN7;
                                break;
                        case VCS0:
                                mmio = BSD_HWS_PGA_GEN7;
                                break;
                        case VECS0:
                                mmio = VEBOX_HWS_PGA_GEN7;
                                break;
                        }
                } else if (IS_GEN(engine->i915, 6)) {
                        mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
                } else {
                        /* XXX: gen8 returns to sanity */
                        mmio = RING_HWS_PGA(engine->mmio_base);
                }

                ee->hws = I915_READ(mmio);
        }

        ee->idle = intel_engine_is_idle(engine);
        if (!ee->idle)
                ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
        ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
                                                  engine);

        if (HAS_PPGTT(dev_priv)) {
                int i;

                ee->vm_info.gfx_mode = I915_READ(RING_MODE_GEN7(engine));

                if (IS_GEN(dev_priv, 6)) {
                        ee->vm_info.pp_dir_base =
                                ENGINE_READ(engine, RING_PP_DIR_BASE_READ);
                } else if (IS_GEN(dev_priv, 7)) {
                        ee->vm_info.pp_dir_base =
                                ENGINE_READ(engine, RING_PP_DIR_BASE);
                } else if (INTEL_GEN(dev_priv) >= 8) {
                        u32 base = engine->mmio_base;

                        for (i = 0; i < 4; i++) {
                                ee->vm_info.pdp[i] =
                                        I915_READ(GEN8_RING_PDP_UDW(base, i));
                                ee->vm_info.pdp[i] <<= 32;
                                ee->vm_info.pdp[i] |=
                                        I915_READ(GEN8_RING_PDP_LDW(base, i));
                        }
                }
        }
}

static void record_request(struct i915_request *request,
                           struct drm_i915_error_request *erq)
{
        struct i915_gem_context *ctx = request->gem_context;

        erq->flags = request->fence.flags;
        erq->context = request->fence.context;
        erq->seqno = request->fence.seqno;
        erq->sched_attr = request->sched.attr;
        erq->jiffies = request->emitted_jiffies;
        erq->start = i915_ggtt_offset(request->ring->vma);
        erq->head = request->head;
        erq->tail = request->tail;

        rcu_read_lock();
        erq->pid = ctx->pid ? pid_nr(ctx->pid) : 0;
        rcu_read_unlock();
}

static void engine_record_requests(struct intel_engine_cs *engine,
                                   struct i915_request *first,
                                   struct drm_i915_error_engine *ee)
{
        struct i915_request *request;
        int count;

        count = 0;
        request = first;
        list_for_each_entry_from(request, &engine->timeline.requests, link)
                count++;
        if (!count)
                return;

        ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
        if (!ee->requests)
                return;

        ee->num_requests = count;

        count = 0;
        request = first;
        list_for_each_entry_from(request, &engine->timeline.requests, link) {
                if (count >= ee->num_requests) {
                        /*
                         * If the ring request list was changed in
                         * between the point where the error request
                         * list was created and dimensioned and this
                         * point then just exit early to avoid crashes.
                         *
                         * We don't need to communicate that the
                         * request list changed state during error
                         * state capture and that the error state is
                         * slightly incorrect as a consequence since we
                         * are typically only interested in the request
                         * list state at the point of error state
                         * capture, not in any changes happening during
                         * the capture.
                         */
                        break;
                }

                record_request(request, &ee->requests[count++]);
        }
        ee->num_requests = count;
}

static void error_record_engine_execlists(struct intel_engine_cs *engine,
                                          struct drm_i915_error_engine *ee)
{
        const struct intel_engine_execlists * const execlists = &engine->execlists;
        unsigned int n;

        for (n = 0; n < execlists_num_ports(execlists); n++) {
                struct i915_request *rq = port_request(&execlists->port[n]);

                if (!rq)
                        break;

                record_request(rq, &ee->execlist[n]);
        }

        ee->num_ports = n;
}

static void record_context(struct drm_i915_error_context *e,
                           struct i915_gem_context *ctx)
{
        if (ctx->pid) {
                struct task_struct *task;

                rcu_read_lock();
                task = pid_task(ctx->pid, PIDTYPE_PID);
                if (task) {
                        strcpy(e->comm, task->comm);
                        e->pid = task->pid;
                }
                rcu_read_unlock();
        }

        e->hw_id = ctx->hw_id;
        e->sched_attr = ctx->sched;
        e->guilty = atomic_read(&ctx->guilty_count);
        e->active = atomic_read(&ctx->active_count);
}

static void request_record_user_bo(struct i915_request *request,
                                   struct drm_i915_error_engine *ee)
{
        struct i915_capture_list *c;
        struct drm_i915_error_object **bo;
        long count, max;

        max = 0;
        for (c = request->capture_list; c; c = c->next)
                max++;
        if (!max)
                return;

        bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
        if (!bo) {
                /* If we can't capture everything, try to capture something. */
                max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
                bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
        }
        if (!bo)
                return;

        count = 0;
        for (c = request->capture_list; c; c = c->next) {
                bo[count] = i915_error_object_create(request->i915, c->vma);
                if (!bo[count])
                        break;
                if (++count == max)
                        break;
        }

        ee->user_bo = bo;
        ee->user_bo_count = count;
}

static struct drm_i915_error_object *
capture_object(struct drm_i915_private *dev_priv,
               struct drm_i915_gem_object *obj)
{
        if (obj && i915_gem_object_has_pages(obj)) {
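                /*
                 * Wrap the object's pages in a stack-local "fake" vma so
                 * that i915_error_object_create() can walk them; a start of
                 * U64_MAX marks the copy as having no real GGTT address.
                 */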
                struct i915_vma fake = {
                        .node = { .start = U64_MAX, .size = obj->base.size },
                        .size = obj->base.size,
                        .pages = obj->mm.pages,
                        .obj = obj,
                };

                return i915_error_object_create(dev_priv, &fake);
        } else {
                return NULL;
        }
}
1398
1399 static void gem_record_rings(struct i915_gpu_state *error)
1400 {
1401         struct drm_i915_private *i915 = error->i915;
1402         struct i915_ggtt *ggtt = &i915->ggtt;
1403         int i;
1404
1405         for (i = 0; i < I915_NUM_ENGINES; i++) {
1406                 struct intel_engine_cs *engine = i915->engine[i];
1407                 struct drm_i915_error_engine *ee = &error->engine[i];
1408                 struct i915_request *request;
1409
1410                 ee->engine_id = -1;
1411
1412                 if (!engine)
1413                         continue;
1414
1415                 ee->engine_id = i;
1416
1417                 error_record_engine_registers(error, engine, ee);
1418                 error_record_engine_execlists(engine, ee);
1419
1420                 request = intel_engine_find_active_request(engine);
1421                 if (request) {
1422                         struct i915_gem_context *ctx = request->gem_context;
1423                         struct intel_ring *ring;
1424
1425                         ee->vm = ctx->ppgtt ? &ctx->ppgtt->vm : &ggtt->vm;
1426
1427                         record_context(&ee->context, ctx);
1428
1429                         /* We need to copy these to an anonymous buffer
1430                          * as the simplest method to avoid being overwritten
1431                          * by userspace.
1432                          */
1433                         ee->batchbuffer =
1434                                 i915_error_object_create(i915, request->batch);
1435
1436                         if (HAS_BROKEN_CS_TLB(i915))
1437                                 ee->wa_batchbuffer =
1438                                         i915_error_object_create(i915,
1439                                                                  i915->gt.scratch);
1440                         request_record_user_bo(request, ee);
1441
1442                         ee->ctx =
1443                                 i915_error_object_create(i915,
1444                                                          request->hw_context->state);
1445
1446                         error->simulated |=
1447                                 i915_gem_context_no_error_capture(ctx);
1448
1449                         ee->rq_head = request->head;
1450                         ee->rq_post = request->postfix;
1451                         ee->rq_tail = request->tail;
1452
1453                         ring = request->ring;
1454                         ee->cpu_ring_head = ring->head;
1455                         ee->cpu_ring_tail = ring->tail;
1456                         ee->ringbuffer =
1457                                 i915_error_object_create(i915, ring->vma);
1458
1459                         engine_record_requests(engine, request, ee);
1460                 }
1461
1462                 ee->hws_page =
1463                         i915_error_object_create(i915,
1464                                                  engine->status_page.vma);
1465
1466                 ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma);
1467
1468                 ee->default_state = capture_object(i915, engine->default_state);
1469         }
1470 }
1471
static void gem_capture_vm(struct i915_gpu_state *error,
                           struct i915_address_space *vm,
                           int idx)
{
        struct drm_i915_error_buffer *active_bo;
        struct i915_vma *vma;
        int count;

        count = 0;
        list_for_each_entry(vma, &vm->bound_list, vm_link)
                if (i915_vma_is_active(vma))
                        count++;

        active_bo = NULL;
        if (count)
                active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
        if (active_bo)
                count = capture_error_bo(active_bo,
                                         count, &vm->bound_list,
                                         ACTIVE_ONLY);
        else
                count = 0;

        error->active_vm[idx] = vm;
        error->active_bo[idx] = active_bo;
        error->active_bo_count[idx] = count;
}

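/*
 * Capture the active buffers for each unique vm in use by the hung
 * engines, skipping vm already captured for an earlier engine.
 */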
static void capture_active_buffers(struct i915_gpu_state *error)
{
        int cnt = 0, i, j;

        BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
        BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
        BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));

        /* Scan each engine looking for unique active contexts/vm */
        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                struct drm_i915_error_engine *ee = &error->engine[i];
                bool found;

                if (!ee->vm)
                        continue;

                found = false;
                for (j = 0; j < i && !found; j++)
                        found = error->engine[j].vm == ee->vm;
                if (!found)
                        gem_capture_vm(error, ee->vm, cnt++);
        }
}

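/* Record the buffer objects currently bound into the global GTT. */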
static void capture_pinned_buffers(struct i915_gpu_state *error)
{
        struct i915_address_space *vm = &error->i915->ggtt.vm;
        struct drm_i915_error_buffer *bo;
        struct i915_vma *vma;
        int count;

        count = 0;
        list_for_each_entry(vma, &vm->bound_list, vm_link)
                count++;

        bo = NULL;
        if (count)
                bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
        if (!bo)
                return;

        error->pinned_bo_count =
                capture_error_bo(bo, count, &vm->bound_list, PINNED_ONLY);
        error->pinned_bo = bo;
}

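/* Snapshot the GuC/HuC firmware details and the GuC log buffer. */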
static void capture_uc_state(struct i915_gpu_state *error)
{
        struct drm_i915_private *i915 = error->i915;
        struct i915_error_uc *error_uc = &error->uc;

        /* Capturing uC state won't be useful if there is no GuC */
        if (!error->device_info.has_guc)
                return;

        error_uc->guc_fw = i915->guc.fw;
        error_uc->huc_fw = i915->huc.fw;

        /* Non-default firmware paths will be specified by the modparam.
         * As modparams are generally accessible from userspace, make
         * explicit copies of the firmware paths.
         */
        error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC);
        error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC);
        error_uc->guc_log = i915_error_object_create(i915, i915->guc.log.vma);
}

/* Capture all registers which don't fit into another category. */
static void capture_reg_state(struct i915_gpu_state *error)
{
        struct drm_i915_private *dev_priv = error->i915;
        int i;

        /* General organization
         * 1. Registers specific to a single generation
         * 2. Registers which belong to multiple generations
         * 3. Feature specific registers
         * 4. Everything else
         * Please try to follow the order.
         */

        /* 1: Registers specific to a single generation */
        if (IS_VALLEYVIEW(dev_priv)) {
                error->gtier[0] = I915_READ(GTIER);
                error->ier = I915_READ(VLV_IER);
                error->forcewake = I915_READ_FW(FORCEWAKE_VLV);
        }

        if (IS_GEN(dev_priv, 7))
                error->err_int = I915_READ(GEN7_ERR_INT);

        if (INTEL_GEN(dev_priv) >= 8) {
                error->fault_data0 = I915_READ(GEN8_FAULT_TLB_DATA0);
                error->fault_data1 = I915_READ(GEN8_FAULT_TLB_DATA1);
        }

        if (IS_GEN(dev_priv, 6)) {
                error->forcewake = I915_READ_FW(FORCEWAKE);
                error->gab_ctl = I915_READ(GAB_CTL);
                error->gfx_mode = I915_READ(GFX_MODE);
        }

        /* 2: Registers which belong to multiple generations */
        if (INTEL_GEN(dev_priv) >= 7)
                error->forcewake = I915_READ_FW(FORCEWAKE_MT);

        if (INTEL_GEN(dev_priv) >= 6) {
                error->derrmr = I915_READ(DERRMR);
                error->error = I915_READ(ERROR_GEN6);
                error->done_reg = I915_READ(DONE_REG);
        }

        if (INTEL_GEN(dev_priv) >= 5)
                error->ccid = I915_READ(CCID(RENDER_RING_BASE));

        /* 3: Feature specific registers */
        if (IS_GEN_RANGE(dev_priv, 6, 7)) {
                error->gam_ecochk = I915_READ(GAM_ECOCHK);
                error->gac_eco = I915_READ(GAC_ECO_BITS);
        }

        /* 4: Everything else */
        if (INTEL_GEN(dev_priv) >= 11) {
                error->ier = I915_READ(GEN8_DE_MISC_IER);
                error->gtier[0] = I915_READ(GEN11_RENDER_COPY_INTR_ENABLE);
                error->gtier[1] = I915_READ(GEN11_VCS_VECS_INTR_ENABLE);
                error->gtier[2] = I915_READ(GEN11_GUC_SG_INTR_ENABLE);
                error->gtier[3] = I915_READ(GEN11_GPM_WGBOXPERF_INTR_ENABLE);
                error->gtier[4] = I915_READ(GEN11_CRYPTO_RSVD_INTR_ENABLE);
                error->gtier[5] = I915_READ(GEN11_GUNIT_CSME_INTR_ENABLE);
                error->ngtier = 6;
        } else if (INTEL_GEN(dev_priv) >= 8) {
                error->ier = I915_READ(GEN8_DE_MISC_IER);
                for (i = 0; i < 4; i++)
                        error->gtier[i] = I915_READ(GEN8_GT_IER(i));
                error->ngtier = 4;
        } else if (HAS_PCH_SPLIT(dev_priv)) {
                error->ier = I915_READ(DEIER);
                error->gtier[0] = I915_READ(GTIER);
                error->ngtier = 1;
        } else if (IS_GEN(dev_priv, 2)) {
                error->ier = I915_READ16(GEN2_IER);
        } else if (!IS_VALLEYVIEW(dev_priv)) {
                error->ier = I915_READ(GEN2_IER);
        }
        error->eir = I915_READ(EIR);
        error->pgtbl_er = I915_READ(PGTBL_ER);
}

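/*
 * Compose the one-line "GPU HANG" summary, naming the first executing
 * process (if any) and appending the caller-supplied message.
 */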
static const char *
error_msg(struct i915_gpu_state *error,
          intel_engine_mask_t engines, const char *msg)
{
        int len;
        int i;

        for (i = 0; i < ARRAY_SIZE(error->engine); i++)
                if (!error->engine[i].context.pid)
                        engines &= ~BIT(i);

        len = scnprintf(error->error_msg, sizeof(error->error_msg),
                        "GPU HANG: ecode %d:%x:0x%08x",
                        INTEL_GEN(error->i915), engines,
                        i915_error_generate_code(error, engines));
        if (engines) {
                /* Just show the first executing process; more is confusing */
                i = __ffs(engines);
                len += scnprintf(error->error_msg + len,
                                 sizeof(error->error_msg) - len,
                                 ", in %s [%d]",
                                 error->engine[i].context.comm,
                                 error->engine[i].context.pid);
        }
        if (msg)
                len += scnprintf(error->error_msg + len,
                                 sizeof(error->error_msg) - len,
                                 ", %s", msg);

        return error->error_msg;
}

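/*
 * Record general device state at the time of the hang: power management
 * status, reset/suspend counts and the device/runtime/driver info.
 */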
static void capture_gen_state(struct i915_gpu_state *error)
{
        struct drm_i915_private *i915 = error->i915;

        error->awake = i915->gt.awake;
        error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);
        error->suspended = i915->runtime_pm.suspended;

        error->iommu = -1;
#ifdef CONFIG_INTEL_IOMMU
        error->iommu = intel_iommu_gfx_mapped;
#endif
        error->reset_count = i915_reset_count(&i915->gpu_error);
        error->suspend_count = i915->suspend_count;

        memcpy(&error->device_info,
               INTEL_INFO(i915),
               sizeof(error->device_info));
        memcpy(&error->runtime_info,
               RUNTIME_INFO(i915),
               sizeof(error->runtime_info));
        error->driver_caps = i915->caps;
}

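/* Preserve a copy of the module parameters in effect at the time of the hang. */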
static void capture_params(struct i915_gpu_state *error)
{
        i915_params_copy(&error->params, &i915_modparams);
}

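/*
 * Find the earliest timestamp of the hang, either the oldest hangcheck
 * timestamp on any engine or the capture time itself.
 */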
static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
{
        unsigned long epoch = error->capture;
        int i;

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                const struct drm_i915_error_engine *ee = &error->engine[i];

                if (ee->hangcheck_timestamp &&
                    time_before(ee->hangcheck_timestamp, epoch))
                        epoch = ee->hangcheck_timestamp;
        }

        return epoch;
}

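/* Clear the PTEs of the reserved GGTT slot used while copying out objects. */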
static void capture_finish(struct i915_gpu_state *error)
{
        struct i915_ggtt *ggtt = &error->i915->ggtt;
        const u64 slot = ggtt->error_capture.start;

        ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
}

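/*
 * Top-level capture routine; run via stop_machine() so that the saved
 * state is a consistent snapshot of the moment of the hang.
 */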
static int capture(void *data)
{
        struct i915_gpu_state *error = data;

        error->time = ktime_get_real();
        error->boottime = ktime_get_boottime();
        error->uptime = ktime_sub(ktime_get(),
                                  error->i915->gt.last_init_time);
        error->capture = jiffies;

        capture_params(error);
        capture_gen_state(error);
        capture_uc_state(error);
        capture_reg_state(error);
        gem_record_fences(error);
        gem_record_rings(error);
        capture_active_buffers(error);
        capture_pinned_buffers(error);

        error->overlay = intel_overlay_capture_error_state(error->i915);
        error->display = intel_display_capture_error_state(error->i915);

        error->epoch = capture_find_epoch(error);

        capture_finish(error);
        return 0;
}

#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))

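/* Allocate an error state and fill it with a snapshot taken under stop_machine(). */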
struct i915_gpu_state *
i915_capture_gpu_state(struct drm_i915_private *i915)
{
        struct i915_gpu_state *error;

        /* Check if GPU capture has been disabled */
        error = READ_ONCE(i915->gpu_error.first_error);
        if (IS_ERR(error))
                return error;

        error = kzalloc(sizeof(*error), GFP_ATOMIC);
        if (!error) {
                i915_disable_error_state(i915, -ENOMEM);
                return ERR_PTR(-ENOMEM);
        }

        kref_init(&error->ref);
        error->i915 = i915;

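        /* stop_machine() holds all other CPUs while capture() runs */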
        stop_machine(capture, error, NULL);

        return error;
}

/**
 * i915_capture_error_state - capture an error record for later analysis
 * @i915: i915 device
 * @engine_mask: the mask of engines triggering the hang
 * @msg: a message to insert into the error capture header
 *
 * Should be called when an error is detected (either a hang or an error
 * interrupt) to capture error state from the time of the error.  Fills
 * out a structure which becomes available in debugfs for user level tools
 * to pick up.
 */
void i915_capture_error_state(struct drm_i915_private *i915,
                              intel_engine_mask_t engine_mask,
                              const char *msg)
{
        static bool warned;
        struct i915_gpu_state *error;
        unsigned long flags;

        if (!i915_modparams.error_capture)
                return;

        if (READ_ONCE(i915->gpu_error.first_error))
                return;

        error = i915_capture_gpu_state(i915);
        if (IS_ERR(error))
                return;

        dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));

        if (!error->simulated) {
                spin_lock_irqsave(&i915->gpu_error.lock, flags);
                if (!i915->gpu_error.first_error) {
                        i915->gpu_error.first_error = error;
                        error = NULL;
                }
                spin_unlock_irqrestore(&i915->gpu_error.lock, flags);
        }

        if (error) {
                __i915_gpu_state_free(&error->ref);
                return;
        }

        if (!warned &&
            ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) {
                DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
                DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
                DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
                DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
                DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
                         i915->drm.primary->index);
                warned = true;
        }
}

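/* Return a new reference to the first recorded error state, if any. */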
struct i915_gpu_state *
i915_first_error_state(struct drm_i915_private *i915)
{
        struct i915_gpu_state *error;

        spin_lock_irq(&i915->gpu_error.lock);
        error = i915->gpu_error.first_error;
        if (!IS_ERR_OR_NULL(error))
                i915_gpu_state_get(error);
        spin_unlock_irq(&i915->gpu_error.lock);

        return error;
}

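/*
 * Discard the recorded error state so that a subsequent hang can be
 * captured, while preserving the "capture disabled" marker if set.
 */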
void i915_reset_error_state(struct drm_i915_private *i915)
{
        struct i915_gpu_state *error;

        spin_lock_irq(&i915->gpu_error.lock);
        error = i915->gpu_error.first_error;
        if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */
                i915->gpu_error.first_error = NULL;
        spin_unlock_irq(&i915->gpu_error.lock);

        if (!IS_ERR_OR_NULL(error))
                i915_gpu_state_put(error);
}

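/*
 * Disable further error capture by parking an ERR_PTR in place of the
 * first error; i915_capture_gpu_state() checks for it and bails early.
 */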
void i915_disable_error_state(struct drm_i915_private *i915, int err)
{
        spin_lock_irq(&i915->gpu_error.lock);
        if (!i915->gpu_error.first_error)
                i915->gpu_error.first_error = ERR_PTR(err);
        spin_unlock_irq(&i915->gpu_error.lock);
}