/*
 * Copyright (c) 2008 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *    Keith Packard <keithp@keithp.com>
 *    Mika Kuoppala <mika.kuoppala@intel.com>
 *
 */

#include <linux/ascii85.h>
#include <linux/nmi.h>
#include <linux/scatterlist.h>
#include <linux/stop_machine.h>
#include <linux/utsname.h>
#include <linux/zlib.h>

#include <drm/drm_print.h>

#include "gem/i915_gem_context.h"

#include "i915_drv.h"
#include "i915_gpu_error.h"
#include "i915_scatterlist.h"
#include "intel_atomic.h"
#include "intel_csr.h"
#include "intel_overlay.h"

static inline const struct intel_engine_cs *
engine_lookup(const struct drm_i915_private *i915, unsigned int id)
{
        if (id >= I915_NUM_ENGINES)
                return NULL;

        return i915->engine[id];
}

static inline const char *
__engine_name(const struct intel_engine_cs *engine)
{
        return engine ? engine->name : "";
}

static const char *
engine_name(const struct drm_i915_private *i915, unsigned int id)
{
        return __engine_name(engine_lookup(i915, id));
}

static const char *tiling_flag(int tiling)
{
        switch (tiling) {
        default:
        case I915_TILING_NONE: return "";
        case I915_TILING_X: return " X";
        case I915_TILING_Y: return " Y";
        }
}

static const char *dirty_flag(int dirty)
{
        return dirty ? " dirty" : "";
}

static const char *purgeable_flag(int purgeable)
{
        return purgeable ? " purgeable" : "";
}

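/*
 * Note that sg->dma_address is repurposed here: it records the logical
 * offset of this buffer within the whole error dump (see
 * i915_gpu_state_copy_to_buffer()), not a real DMA address.
 */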
static void __sg_set_buf(struct scatterlist *sg,
                         void *addr, unsigned int len, loff_t it)
{
        sg->page_link = (unsigned long)virt_to_page(addr);
        sg->offset = offset_in_page(addr);
        sg->length = len;
        sg->dma_address = it;
}

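/*
 * Grow the printf buffer: pack the current buffer into the sg table,
 * chaining on a fresh table page whenever the current one fills up, then
 * allocate a replacement buffer. We opportunistically try for a large
 * (64K) allocation first and fall back to a page-aligned one, so that
 * capture can still make progress under memory pressure.
 */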
static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
{
        if (!len)
                return false;

        if (e->bytes + len + 1 <= e->size)
                return true;

        if (e->bytes) {
                __sg_set_buf(e->cur++, e->buf, e->bytes, e->iter);
                e->iter += e->bytes;
                e->buf = NULL;
                e->bytes = 0;
        }

        if (e->cur == e->end) {
                struct scatterlist *sgl;

                sgl = (typeof(sgl))__get_free_page(GFP_KERNEL);
                if (!sgl) {
                        e->err = -ENOMEM;
                        return false;
                }

                if (e->cur) {
                        e->cur->offset = 0;
                        e->cur->length = 0;
                        e->cur->page_link =
                                (unsigned long)sgl | SG_CHAIN;
                } else {
                        e->sgl = sgl;
                }

                e->cur = sgl;
                e->end = sgl + SG_MAX_SINGLE_ALLOC - 1;
        }

        e->size = ALIGN(len + 1, SZ_64K);
        e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
        if (!e->buf) {
                e->size = PAGE_ALIGN(len + 1);
                e->buf = kmalloc(e->size, GFP_KERNEL);
        }
        if (!e->buf) {
                e->err = -ENOMEM;
                return false;
        }

        return true;
}

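/*
 * Measure the formatted length first (vsnprintf() with a NULL buffer),
 * grow the buffer to fit, then format for real. Any error is latched in
 * e->err and silently short-circuits all subsequent printing.
 */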
__printf(2, 0)
static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
                               const char *fmt, va_list args)
{
        va_list ap;
        int len;

        if (e->err)
                return;

        va_copy(ap, args);
        len = vsnprintf(NULL, 0, fmt, ap);
        va_end(ap);
        if (len <= 0) {
                e->err = len;
                return;
        }

        if (!__i915_error_grow(e, len))
                return;

        GEM_BUG_ON(e->bytes >= e->size);
        len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args);
        if (len < 0) {
                e->err = len;
                return;
        }
        e->bytes += len;
}

static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str)
{
        unsigned len;

        if (e->err || !str)
                return;

        len = strlen(str);
        if (!__i915_error_grow(e, len))
                return;

        GEM_BUG_ON(e->bytes + len > e->size);
        memcpy(e->buf + e->bytes, str, len);
        e->bytes += len;
}

#define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
#define err_puts(e, s) i915_error_puts(e, s)

static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
{
        i915_error_vprintf(p->arg, vaf->fmt, *vaf->va);
}

static inline struct drm_printer
i915_error_printer(struct drm_i915_error_state_buf *e)
{
        struct drm_printer p = {
                .printfn = __i915_printfn_error,
                .arg = e,
        };
        return p;
}

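/*
 * Captured page contents are streamed into the error object either
 * zlib-deflated (CONFIG_DRM_I915_COMPRESS_ERROR) or as raw copies. The
 * choice is recorded in the dump by err_compression_marker(): ':' for
 * zlib, '~' for uncompressed.
 */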
#ifdef CONFIG_DRM_I915_COMPRESS_ERROR

struct compress {
        struct z_stream_s zstream;
        void *tmp;
};

static bool compress_init(struct compress *c)
{
        struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream));

        zstream->workspace =
                kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
                        GFP_ATOMIC | __GFP_NOWARN);
        if (!zstream->workspace)
                return false;

        if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
                kfree(zstream->workspace);
                return false;
        }

        c->tmp = NULL;
        if (i915_has_memcpy_from_wc())
                c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);

        return true;
}

static void *compress_next_page(struct drm_i915_error_object *dst)
{
        unsigned long page;

        if (dst->page_count >= dst->num_pages)
                return ERR_PTR(-ENOSPC);

        page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
        if (!page)
                return ERR_PTR(-ENOMEM);

        return dst->pages[dst->page_count++] = (void *)page;
}

static int compress_page(struct compress *c,
                         void *src,
                         struct drm_i915_error_object *dst)
{
        struct z_stream_s *zstream = &c->zstream;

        zstream->next_in = src;
        if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
                zstream->next_in = c->tmp;
        zstream->avail_in = PAGE_SIZE;

        do {
                if (zstream->avail_out == 0) {
                        zstream->next_out = compress_next_page(dst);
                        if (IS_ERR(zstream->next_out))
                                return PTR_ERR(zstream->next_out);

                        zstream->avail_out = PAGE_SIZE;
                }

                if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
                        return -EIO;

                touch_nmi_watchdog();
        } while (zstream->avail_in);

        /* Fallback to uncompressed if we increase size? */
        if (0 && zstream->total_out > zstream->total_in)
                return -E2BIG;

        return 0;
}

static int compress_flush(struct compress *c,
                          struct drm_i915_error_object *dst)
{
        struct z_stream_s *zstream = &c->zstream;

        do {
                switch (zlib_deflate(zstream, Z_FINISH)) {
                case Z_OK: /* more space requested */
                        zstream->next_out = compress_next_page(dst);
                        if (IS_ERR(zstream->next_out))
                                return PTR_ERR(zstream->next_out);

                        zstream->avail_out = PAGE_SIZE;
                        break;

                case Z_STREAM_END:
                        goto end;

                default: /* any error */
                        return -EIO;
                }
        } while (1);

end:
        memset(zstream->next_out, 0, zstream->avail_out);
        dst->unused = zstream->avail_out;
        return 0;
}

static void compress_fini(struct compress *c,
                          struct drm_i915_error_object *dst)
{
        struct z_stream_s *zstream = &c->zstream;

        zlib_deflateEnd(zstream);
        kfree(zstream->workspace);
        if (c->tmp)
                free_page((unsigned long)c->tmp);
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
        err_puts(m, ":");
}

#else

struct compress {
};

static bool compress_init(struct compress *c)
{
        return true;
}

static int compress_page(struct compress *c,
                         void *src,
                         struct drm_i915_error_object *dst)
{
        unsigned long page;
        void *ptr;

        page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
        if (!page)
                return -ENOMEM;

        ptr = (void *)page;
        if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
                memcpy(ptr, src, PAGE_SIZE);
        dst->pages[dst->page_count++] = ptr;

        return 0;
}

static int compress_flush(struct compress *c,
                          struct drm_i915_error_object *dst)
{
        return 0;
}

static void compress_fini(struct compress *c,
                          struct drm_i915_error_object *dst)
{
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
        err_puts(m, "~");
}

#endif

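/*
 * Each buffer is printed as a single line: GGTT offset, size, read
 * domains, write domain, then flag suffixes. This layout is what the
 * userspace error-state decoder (e.g. intel_error_decode from
 * intel-gpu-tools) expects.
 */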
static void print_error_buffers(struct drm_i915_error_state_buf *m,
                                const char *name,
                                struct drm_i915_error_buffer *err,
                                int count)
{
        err_printf(m, "%s [%d]:\n", name, count);

        while (count--) {
                err_printf(m, "    %08x_%08x %8u %02x %02x",
                           upper_32_bits(err->gtt_offset),
                           lower_32_bits(err->gtt_offset),
                           err->size,
                           err->read_domains,
                           err->write_domain);
                err_puts(m, tiling_flag(err->tiling));
                err_puts(m, dirty_flag(err->dirty));
                err_puts(m, purgeable_flag(err->purgeable));
                err_puts(m, err->userptr ? " userptr" : "");
                err_puts(m, i915_cache_level_str(m->i915, err->cache_level));

                if (err->name)
                        err_printf(m, " (name: %d)", err->name);
                if (err->fence_reg != I915_FENCE_REG_NONE)
                        err_printf(m, " (fence: %d)", err->fence_reg);

                err_puts(m, "\n");
                err++;
        }
}

static void error_print_instdone(struct drm_i915_error_state_buf *m,
                                 const struct drm_i915_error_engine *ee)
{
        int slice;
        int subslice;

        err_printf(m, "  INSTDONE: 0x%08x\n",
                   ee->instdone.instdone);

        if (ee->engine_id != RCS0 || INTEL_GEN(m->i915) <= 3)
                return;

        err_printf(m, "  SC_INSTDONE: 0x%08x\n",
                   ee->instdone.slice_common);

        if (INTEL_GEN(m->i915) <= 6)
                return;

        for_each_instdone_slice_subslice(m->i915, slice, subslice)
                err_printf(m, "  SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
                           slice, subslice,
                           ee->instdone.sampler[slice][subslice]);

        for_each_instdone_slice_subslice(m->i915, slice, subslice)
                err_printf(m, "  ROW_INSTDONE[%d][%d]: 0x%08x\n",
                           slice, subslice,
                           ee->instdone.row[slice][subslice]);
}

static void error_print_request(struct drm_i915_error_state_buf *m,
                                const char *prefix,
                                const struct drm_i915_error_request *erq,
                                const unsigned long epoch)
{
        if (!erq->seqno)
                return;

        err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
                   prefix, erq->pid, erq->context, erq->seqno,
                   test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
                            &erq->flags) ? "!" : "",
                   test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
                            &erq->flags) ? "+" : "",
                   erq->sched_attr.priority,
                   jiffies_to_msecs(erq->jiffies - epoch),
                   erq->start, erq->head, erq->tail);
}

static void error_print_context(struct drm_i915_error_state_buf *m,
                                const char *header,
                                const struct drm_i915_error_context *ctx)
{
        err_printf(m, "%s%s[%d] hw_id %d, prio %d, guilty %d active %d\n",
                   header, ctx->comm, ctx->pid, ctx->hw_id,
                   ctx->sched_attr.priority, ctx->guilty, ctx->active);
}

static void error_print_engine(struct drm_i915_error_state_buf *m,
                               const struct drm_i915_error_engine *ee,
                               const unsigned long epoch)
{
        int n;

        err_printf(m, "%s command stream:\n",
                   engine_name(m->i915, ee->engine_id));
        err_printf(m, "  IDLE?: %s\n", yesno(ee->idle));
        err_printf(m, "  START: 0x%08x\n", ee->start);
        err_printf(m, "  HEAD:  0x%08x [0x%08x]\n", ee->head, ee->rq_head);
        err_printf(m, "  TAIL:  0x%08x [0x%08x, 0x%08x]\n",
                   ee->tail, ee->rq_post, ee->rq_tail);
        err_printf(m, "  CTL:   0x%08x\n", ee->ctl);
        err_printf(m, "  MODE:  0x%08x\n", ee->mode);
        err_printf(m, "  HWS:   0x%08x\n", ee->hws);
        err_printf(m, "  ACTHD: 0x%08x %08x\n",
                   (u32)(ee->acthd>>32), (u32)ee->acthd);
        err_printf(m, "  IPEIR: 0x%08x\n", ee->ipeir);
        err_printf(m, "  IPEHR: 0x%08x\n", ee->ipehr);

        error_print_instdone(m, ee);

        if (ee->batchbuffer) {
                u64 start = ee->batchbuffer->gtt_offset;
                u64 end = start + ee->batchbuffer->gtt_size;

                err_printf(m, "  batch: [0x%08x_%08x, 0x%08x_%08x]\n",
                           upper_32_bits(start), lower_32_bits(start),
                           upper_32_bits(end), lower_32_bits(end));
        }
        if (INTEL_GEN(m->i915) >= 4) {
                err_printf(m, "  BBADDR: 0x%08x_%08x\n",
                           (u32)(ee->bbaddr>>32), (u32)ee->bbaddr);
                err_printf(m, "  BB_STATE: 0x%08x\n", ee->bbstate);
                err_printf(m, "  INSTPS: 0x%08x\n", ee->instps);
        }
        err_printf(m, "  INSTPM: 0x%08x\n", ee->instpm);
        err_printf(m, "  FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr),
                   lower_32_bits(ee->faddr));
        if (INTEL_GEN(m->i915) >= 6) {
                err_printf(m, "  RC PSMI: 0x%08x\n", ee->rc_psmi);
                err_printf(m, "  FAULT_REG: 0x%08x\n", ee->fault_reg);
        }
        if (HAS_PPGTT(m->i915)) {
                err_printf(m, "  GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);

                if (INTEL_GEN(m->i915) >= 8) {
                        int i;
                        for (i = 0; i < 4; i++)
                                err_printf(m, "  PDP%d: 0x%016llx\n",
                                           i, ee->vm_info.pdp[i]);
                } else {
                        err_printf(m, "  PP_DIR_BASE: 0x%08x\n",
                                   ee->vm_info.pp_dir_base);
                }
        }
        err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
        err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
        err_printf(m, "  hangcheck timestamp: %dms (%lu%s)\n",
                   jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
                   ee->hangcheck_timestamp,
                   ee->hangcheck_timestamp == epoch ? "; epoch" : "");
        err_printf(m, "  engine reset count: %u\n", ee->reset_count);

        for (n = 0; n < ee->num_ports; n++) {
                err_printf(m, "  ELSP[%d]:", n);
                error_print_request(m, " ", &ee->execlist[n], epoch);
        }

        error_print_context(m, "  Active context: ", &ee->context);
}

void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{
        va_list args;

        va_start(args, f);
        i915_error_vprintf(e, f, args);
        va_end(args);
}

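/*
 * Object contents are emitted as a stream of ascii85 words, one page at
 * a time, after the compression marker. The trailing partial page is
 * trimmed by dst->unused bytes before encoding.
 */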
static void print_error_obj(struct drm_i915_error_state_buf *m,
                            struct intel_engine_cs *engine,
                            const char *name,
                            struct drm_i915_error_object *obj)
{
        char out[ASCII85_BUFSZ];
        int page;

        if (!obj)
                return;

        if (name) {
                err_printf(m, "%s --- %s = 0x%08x %08x\n",
                           engine ? engine->name : "global", name,
                           upper_32_bits(obj->gtt_offset),
                           lower_32_bits(obj->gtt_offset));
        }

        err_compression_marker(m);
        for (page = 0; page < obj->page_count; page++) {
                int i, len;

                len = PAGE_SIZE;
                if (page == obj->page_count - 1)
                        len -= obj->unused;
                len = ascii85_encode_len(len);

                for (i = 0; i < len; i++)
                        err_puts(m, ascii85_encode(obj->pages[page][i], out));
        }
        err_puts(m, "\n");
}

static void err_print_capabilities(struct drm_i915_error_state_buf *m,
                                   const struct intel_device_info *info,
                                   const struct intel_runtime_info *runtime,
                                   const struct intel_driver_caps *caps)
{
        struct drm_printer p = i915_error_printer(m);

        intel_device_info_dump_flags(info, &p);
        intel_driver_caps_print(caps, &p);
        intel_device_info_dump_topology(&runtime->sseu, &p);
}

static void err_print_params(struct drm_i915_error_state_buf *m,
                             const struct i915_params *params)
{
        struct drm_printer p = i915_error_printer(m);

        i915_params_dump(params, &p);
}

static void err_print_pciid(struct drm_i915_error_state_buf *m,
                            struct drm_i915_private *i915)
{
        struct pci_dev *pdev = i915->drm.pdev;

        err_printf(m, "PCI ID: 0x%04x\n", pdev->device);
        err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision);
        err_printf(m, "PCI Subsystem: %04x:%04x\n",
                   pdev->subsystem_vendor,
                   pdev->subsystem_device);
}

static void err_print_uc(struct drm_i915_error_state_buf *m,
                         const struct i915_error_uc *error_uc)
{
        struct drm_printer p = i915_error_printer(m);
        const struct i915_gpu_state *error =
                container_of(error_uc, typeof(*error), uc);

        if (!error->device_info.has_guc)
                return;

        intel_uc_fw_dump(&error_uc->guc_fw, &p);
        intel_uc_fw_dump(&error_uc->huc_fw, &p);
        print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log);
}

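/*
 * Walk the chained sg tables, freeing each printf buffer and then the
 * page backing the table itself.
 */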
static void err_free_sgl(struct scatterlist *sgl)
{
        while (sgl) {
                struct scatterlist *sg;

                for (sg = sgl; !sg_is_chain(sg); sg++) {
                        kfree(sg_virt(sg));
                        if (sg_is_last(sg))
                                break;
                }

                sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg);
                free_page((unsigned long)sgl);
                sgl = sg;
        }
}

static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
                               struct i915_gpu_state *error)
{
        struct drm_i915_error_object *obj;
        struct timespec64 ts;
        int i, j;

        if (*error->error_msg)
                err_printf(m, "%s\n", error->error_msg);
        err_printf(m, "Kernel: %s %s\n",
                   init_utsname()->release,
                   init_utsname()->machine);
        ts = ktime_to_timespec64(error->time);
        err_printf(m, "Time: %lld s %ld us\n",
                   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
        ts = ktime_to_timespec64(error->boottime);
        err_printf(m, "Boottime: %lld s %ld us\n",
                   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
        ts = ktime_to_timespec64(error->uptime);
        err_printf(m, "Uptime: %lld s %ld us\n",
                   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
        err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ);
        err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n",
                   error->capture,
                   jiffies_to_msecs(jiffies - error->capture),
                   jiffies_to_msecs(error->capture - error->epoch));

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                if (!error->engine[i].context.pid)
                        continue;

                err_printf(m, "Active process (on ring %s): %s [%d]\n",
                           engine_name(m->i915, i),
                           error->engine[i].context.comm,
                           error->engine[i].context.pid);
        }
        err_printf(m, "Reset count: %u\n", error->reset_count);
        err_printf(m, "Suspend count: %u\n", error->suspend_count);
        err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
        err_printf(m, "Subplatform: 0x%x\n",
                   intel_subplatform(&error->runtime_info,
                                     error->device_info.platform));
        err_print_pciid(m, m->i915);

        err_printf(m, "IOMMU enabled?: %d\n", error->iommu);

        if (HAS_CSR(m->i915)) {
                struct intel_csr *csr = &m->i915->csr;

                err_printf(m, "DMC loaded: %s\n",
                           yesno(csr->dmc_payload != NULL));
                err_printf(m, "DMC fw version: %d.%d\n",
                           CSR_VERSION_MAJOR(csr->version),
                           CSR_VERSION_MINOR(csr->version));
        }

        err_printf(m, "GT awake: %s\n", yesno(error->awake));
        err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock));
        err_printf(m, "PM suspended: %s\n", yesno(error->suspended));
        err_printf(m, "EIR: 0x%08x\n", error->eir);
        err_printf(m, "IER: 0x%08x\n", error->ier);
        for (i = 0; i < error->ngtier; i++)
                err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]);
        err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
        err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
        err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
        err_printf(m, "CCID: 0x%08x\n", error->ccid);

        for (i = 0; i < error->nfence; i++)
                err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);

        if (INTEL_GEN(m->i915) >= 6) {
                err_printf(m, "ERROR: 0x%08x\n", error->error);

                if (INTEL_GEN(m->i915) >= 8)
                        err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
                                   error->fault_data1, error->fault_data0);

                err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg);
        }

        if (IS_GEN(m->i915, 7))
                err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                if (error->engine[i].engine_id != -1)
                        error_print_engine(m, &error->engine[i], error->epoch);
        }

        for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) {
                char buf[128];
                int len, first = 1;

                if (!error->active_vm[i])
                        break;

                len = scnprintf(buf, sizeof(buf), "Active (");
                for (j = 0; j < ARRAY_SIZE(error->engine); j++) {
                        if (error->engine[j].vm != error->active_vm[i])
                                continue;

                        len += scnprintf(buf + len, sizeof(buf) - len, "%s%s",
                                         first ? "" : ", ",
                                         m->i915->engine[j]->name);
                        first = 0;
                }
                scnprintf(buf + len, sizeof(buf) - len, ")");
                print_error_buffers(m, buf,
                                    error->active_bo[i],
                                    error->active_bo_count[i]);
        }

        print_error_buffers(m, "Pinned (global)",
                            error->pinned_bo,
                            error->pinned_bo_count);

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                const struct drm_i915_error_engine *ee = &error->engine[i];

                obj = ee->batchbuffer;
                if (obj) {
                        err_puts(m, m->i915->engine[i]->name);
                        if (ee->context.pid)
                                err_printf(m, " (submitted by %s [%d])",
                                           ee->context.comm,
                                           ee->context.pid);
                        err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
                                   upper_32_bits(obj->gtt_offset),
                                   lower_32_bits(obj->gtt_offset));
                        print_error_obj(m, m->i915->engine[i], NULL, obj);
                }

                for (j = 0; j < ee->user_bo_count; j++)
                        print_error_obj(m, m->i915->engine[i],
                                        "user", ee->user_bo[j]);

                if (ee->num_requests) {
                        err_printf(m, "%s --- %d requests\n",
                                   m->i915->engine[i]->name,
                                   ee->num_requests);
                        for (j = 0; j < ee->num_requests; j++)
                                error_print_request(m, " ",
                                                    &ee->requests[j],
                                                    error->epoch);
                }

                print_error_obj(m, m->i915->engine[i],
                                "ringbuffer", ee->ringbuffer);

                print_error_obj(m, m->i915->engine[i],
                                "HW Status", ee->hws_page);

                print_error_obj(m, m->i915->engine[i],
                                "HW context", ee->ctx);

                print_error_obj(m, m->i915->engine[i],
                                "WA context", ee->wa_ctx);

                print_error_obj(m, m->i915->engine[i],
                                "WA batchbuffer", ee->wa_batchbuffer);

                print_error_obj(m, m->i915->engine[i],
                                "NULL context", ee->default_state);
        }

        if (error->overlay)
                intel_overlay_print_error_state(m, error->overlay);

        if (error->display)
                intel_display_print_error_state(m, error->display);

        err_print_capabilities(m, &error->device_info, &error->runtime_info,
                               &error->driver_caps);
        err_print_params(m, &error->params);
        err_print_uc(m, &error->uc);
}

static int err_print_to_sgl(struct i915_gpu_state *error)
{
        struct drm_i915_error_state_buf m;

        if (IS_ERR(error))
                return PTR_ERR(error);

        if (READ_ONCE(error->sgl))
                return 0;

        memset(&m, 0, sizeof(m));
        m.i915 = error->i915;

        __err_print_to_sgl(&m, error);

        if (m.buf) {
                __sg_set_buf(m.cur++, m.buf, m.bytes, m.iter);
                m.bytes = 0;
                m.buf = NULL;
        }
        if (m.cur) {
                GEM_BUG_ON(m.end < m.cur);
                sg_mark_end(m.cur - 1);
        }
        GEM_BUG_ON(m.sgl && !m.cur);

        if (m.err) {
                err_free_sgl(m.sgl);
                return m.err;
        }

        if (cmpxchg(&error->sgl, NULL, m.sgl))
                err_free_sgl(m.sgl);

        return 0;
}

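/*
 * Copy up to @rem bytes of the error dump into @buf, starting at offset
 * @off. The last scatterlist element read is cached in error->fit so a
 * sequential reader can resume without rewalking the chain, e.g. (sketch):
 *
 *      ssize_t n;
 *      while ((n = i915_gpu_state_copy_to_buffer(error, buf, off, len)) > 0)
 *              off += n; /* with @buf advanced/consumed by the caller */
 */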
ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error,
                                      char *buf, loff_t off, size_t rem)
{
        struct scatterlist *sg;
        size_t count;
        loff_t pos;
        int err;

        if (!error || !rem)
                return 0;

        err = err_print_to_sgl(error);
        if (err)
                return err;

        sg = READ_ONCE(error->fit);
        if (!sg || off < sg->dma_address)
                sg = error->sgl;
        if (!sg)
                return 0;

        pos = sg->dma_address;
        count = 0;
        do {
                size_t len, start;

                if (sg_is_chain(sg)) {
                        sg = sg_chain_ptr(sg);
                        GEM_BUG_ON(sg_is_chain(sg));
                }

                len = sg->length;
                if (pos + len <= off) {
                        pos += len;
                        continue;
                }

                start = sg->offset;
                if (pos < off) {
                        GEM_BUG_ON(off - pos > len);
                        len -= off - pos;
                        start += off - pos;
                        pos = off;
                }

                len = min(len, rem);
                GEM_BUG_ON(!len || len > sg->length);

                memcpy(buf, page_address(sg_page(sg)) + start, len);

                count += len;
                pos += len;

                buf += len;
                rem -= len;
                if (!rem) {
                        WRITE_ONCE(error->fit, sg);
                        break;
                }
        } while (!sg_is_last(sg++));

        return count;
}

static void i915_error_object_free(struct drm_i915_error_object *obj)
{
        int page;

        if (obj == NULL)
                return;

        for (page = 0; page < obj->page_count; page++)
                free_page((unsigned long)obj->pages[page]);

        kfree(obj);
}

static void cleanup_params(struct i915_gpu_state *error)
{
        i915_params_free(&error->params);
}

static void cleanup_uc_state(struct i915_gpu_state *error)
{
        struct i915_error_uc *error_uc = &error->uc;

        kfree(error_uc->guc_fw.path);
        kfree(error_uc->huc_fw.path);
        i915_error_object_free(error_uc->guc_log);
}

void __i915_gpu_state_free(struct kref *error_ref)
{
        struct i915_gpu_state *error =
                container_of(error_ref, typeof(*error), ref);
        long i, j;

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                struct drm_i915_error_engine *ee = &error->engine[i];

                for (j = 0; j < ee->user_bo_count; j++)
                        i915_error_object_free(ee->user_bo[j]);
                kfree(ee->user_bo);

                i915_error_object_free(ee->batchbuffer);
                i915_error_object_free(ee->wa_batchbuffer);
                i915_error_object_free(ee->ringbuffer);
                i915_error_object_free(ee->hws_page);
                i915_error_object_free(ee->ctx);
                i915_error_object_free(ee->wa_ctx);

                kfree(ee->requests);
        }

        for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
                kfree(error->active_bo[i]);
        kfree(error->pinned_bo);

        kfree(error->overlay);
        kfree(error->display);

        cleanup_params(error);
        cleanup_uc_state(error);

        err_free_sgl(error->sgl);
        kfree(error);
}

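/*
 * Snapshot the contents of a vma. Each page is bound, one at a time,
 * into the GGTT slot reserved for error capture and read back through
 * an uncached mapping, so the capture does not depend on the object
 * being CPU-mappable.
 */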
static struct drm_i915_error_object *
i915_error_object_create(struct drm_i915_private *i915,
                         struct i915_vma *vma)
{
        struct i915_ggtt *ggtt = &i915->ggtt;
        const u64 slot = ggtt->error_capture.start;
        struct drm_i915_error_object *dst;
        struct compress compress;
        unsigned long num_pages;
        struct sgt_iter iter;
        dma_addr_t dma;
        int ret;

        if (!vma || !vma->pages)
                return NULL;

        num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
        num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
        dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
                      GFP_ATOMIC | __GFP_NOWARN);
        if (!dst)
                return NULL;

        dst->gtt_offset = vma->node.start;
        dst->gtt_size = vma->node.size;
        dst->num_pages = num_pages;
        dst->page_count = 0;
        dst->unused = 0;

        if (!compress_init(&compress)) {
                kfree(dst);
                return NULL;
        }

        ret = -EINVAL;
        for_each_sgt_dma(dma, iter, vma->pages) {
                void __iomem *s;

                ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);

                s = io_mapping_map_atomic_wc(&ggtt->iomap, slot);
                ret = compress_page(&compress, (void __force *)s, dst);
                io_mapping_unmap_atomic(s);
                if (ret)
                        break;
        }

        if (ret || compress_flush(&compress, dst)) {
                while (dst->page_count--)
                        free_page((unsigned long)dst->pages[dst->page_count]);
                kfree(dst);
                dst = NULL;
        }

        compress_fini(&compress, dst);
        return dst;
}

static void capture_bo(struct drm_i915_error_buffer *err,
                       struct i915_vma *vma)
{
        struct drm_i915_gem_object *obj = vma->obj;

        err->size = obj->base.size;
        err->name = obj->base.name;

        err->gtt_offset = vma->node.start;
        err->read_domains = obj->read_domains;
        err->write_domain = obj->write_domain;
        err->fence_reg = vma->fence ? vma->fence->id : -1;
        err->tiling = i915_gem_object_get_tiling(obj);
        err->dirty = obj->mm.dirty;
        err->purgeable = obj->mm.madv != I915_MADV_WILLNEED;
        err->userptr = obj->userptr.mm != NULL;
        err->cache_level = obj->cache_level;
}

static u32 capture_error_bo(struct drm_i915_error_buffer *err,
                            int count, struct list_head *head,
                            unsigned int flags)
#define ACTIVE_ONLY BIT(0)
#define PINNED_ONLY BIT(1)
{
        struct i915_vma *vma;
        int i = 0;

        list_for_each_entry(vma, head, vm_link) {
                if (!vma->obj)
                        continue;

                if (flags & ACTIVE_ONLY && !i915_vma_is_active(vma))
                        continue;

                if (flags & PINNED_ONLY && !i915_vma_is_pinned(vma))
                        continue;

                capture_bo(err++, vma);
                if (++i == count)
                        break;
        }

        return i;
}

/*
 * Generate a semi-unique error code. The code is not meant to have
 * meaning; its only purpose is to try to prevent falsely duplicated bug
 * reports by grossly estimating a GPU error state.
 *
 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
 * the hang if we could strip the GTT offset information from it.
 *
 * It's only a small step better than a random number in its current form.
 */
static u32 i915_error_generate_code(struct i915_gpu_state *error,
                                    intel_engine_mask_t engine_mask)
{
        /*
         * IPEHR would be an ideal way to detect errors, as it's the gross
         * measure of "the command that hung." However, it includes some
         * very common synchronization commands which almost always appear
         * in cases that are strictly a client bug. Use instdone to
         * differentiate those somewhat.
         */
        if (engine_mask) {
                struct drm_i915_error_engine *ee =
                        &error->engine[ffs(engine_mask) - 1]; /* ffs() is 1-based */

                return ee->ipehr ^ ee->instdone.instdone;
        }

        return 0;
}

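/*
 * The fence registers changed layout across generations: adjacent
 * LO/HI pairs readable as a single 64b value on gen4+ (FENCE_REG_GEN6_LO,
 * FENCE_REG_965_LO), a single 32b FENCE_REG per fence on gen2/3.
 */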
static void gem_record_fences(struct i915_gpu_state *error)
{
        struct drm_i915_private *dev_priv = error->i915;
        struct intel_uncore *uncore = &dev_priv->uncore;
        int i;

        if (INTEL_GEN(dev_priv) >= 6) {
                for (i = 0; i < dev_priv->ggtt.num_fences; i++)
                        error->fence[i] =
                                intel_uncore_read64(uncore,
                                                    FENCE_REG_GEN6_LO(i));
        } else if (INTEL_GEN(dev_priv) >= 4) {
                for (i = 0; i < dev_priv->ggtt.num_fences; i++)
                        error->fence[i] =
                                intel_uncore_read64(uncore,
                                                    FENCE_REG_965_LO(i));
        } else {
                for (i = 0; i < dev_priv->ggtt.num_fences; i++)
                        error->fence[i] =
                                intel_uncore_read(uncore, FENCE_REG(i));
        }
        error->nfence = i;
}

static void error_record_engine_registers(struct i915_gpu_state *error,
                                          struct intel_engine_cs *engine,
                                          struct drm_i915_error_engine *ee)
{
        struct drm_i915_private *dev_priv = engine->i915;

        if (INTEL_GEN(dev_priv) >= 6) {
                ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL);
                if (INTEL_GEN(dev_priv) >= 8)
                        ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG);
                else
                        ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine);
        }

        if (INTEL_GEN(dev_priv) >= 4) {
                ee->faddr = ENGINE_READ(engine, RING_DMA_FADD);
                ee->ipeir = ENGINE_READ(engine, RING_IPEIR);
                ee->ipehr = ENGINE_READ(engine, RING_IPEHR);
                ee->instps = ENGINE_READ(engine, RING_INSTPS);
                ee->bbaddr = ENGINE_READ(engine, RING_BBADDR);
                if (INTEL_GEN(dev_priv) >= 8) {
                        ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32;
                        ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32;
                }
                ee->bbstate = ENGINE_READ(engine, RING_BBSTATE);
        } else {
                ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX);
                ee->ipeir = ENGINE_READ(engine, IPEIR);
                ee->ipehr = ENGINE_READ(engine, IPEHR);
        }

        intel_engine_get_instdone(engine, &ee->instdone);

        ee->instpm = ENGINE_READ(engine, RING_INSTPM);
        ee->acthd = intel_engine_get_active_head(engine);
        ee->start = ENGINE_READ(engine, RING_START);
        ee->head = ENGINE_READ(engine, RING_HEAD);
        ee->tail = ENGINE_READ(engine, RING_TAIL);
        ee->ctl = ENGINE_READ(engine, RING_CTL);
        if (INTEL_GEN(dev_priv) > 2)
                ee->mode = ENGINE_READ(engine, RING_MI_MODE);

        if (!HWS_NEEDS_PHYSICAL(dev_priv)) {
                i915_reg_t mmio;

                if (IS_GEN(dev_priv, 7)) {
                        switch (engine->id) {
                        default:
                                MISSING_CASE(engine->id);
                        case RCS0:
                                mmio = RENDER_HWS_PGA_GEN7;
                                break;
                        case BCS0:
                                mmio = BLT_HWS_PGA_GEN7;
                                break;
                        case VCS0:
                                mmio = BSD_HWS_PGA_GEN7;
                                break;
                        case VECS0:
                                mmio = VEBOX_HWS_PGA_GEN7;
                                break;
                        }
                } else if (IS_GEN(engine->i915, 6)) {
                        mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
                } else {
                        /* XXX: gen8 returns to sanity */
                        mmio = RING_HWS_PGA(engine->mmio_base);
                }

                ee->hws = I915_READ(mmio);
        }

        ee->idle = intel_engine_is_idle(engine);
        if (!ee->idle)
                ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
        ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
                                                  engine);

        if (HAS_PPGTT(dev_priv)) {
                int i;

                ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7);

                if (IS_GEN(dev_priv, 6)) {
                        ee->vm_info.pp_dir_base =
                                ENGINE_READ(engine, RING_PP_DIR_BASE_READ);
                } else if (IS_GEN(dev_priv, 7)) {
                        ee->vm_info.pp_dir_base =
                                ENGINE_READ(engine, RING_PP_DIR_BASE);
                } else if (INTEL_GEN(dev_priv) >= 8) {
                        u32 base = engine->mmio_base;

                        for (i = 0; i < 4; i++) {
                                ee->vm_info.pdp[i] =
                                        I915_READ(GEN8_RING_PDP_UDW(base, i));
                                ee->vm_info.pdp[i] <<= 32;
                                ee->vm_info.pdp[i] |=
                                        I915_READ(GEN8_RING_PDP_LDW(base, i));
                        }
                }
        }
}

static void record_request(struct i915_request *request,
                           struct drm_i915_error_request *erq)
{
        struct i915_gem_context *ctx = request->gem_context;

        erq->flags = request->fence.flags;
        erq->context = request->fence.context;
        erq->seqno = request->fence.seqno;
        erq->sched_attr = request->sched.attr;
        erq->jiffies = request->emitted_jiffies;
        erq->start = i915_ggtt_offset(request->ring->vma);
        erq->head = request->head;
        erq->tail = request->tail;

        rcu_read_lock();
        erq->pid = ctx->pid ? pid_nr(ctx->pid) : 0;
        rcu_read_unlock();
}

static void engine_record_requests(struct intel_engine_cs *engine,
                                   struct i915_request *first,
                                   struct drm_i915_error_engine *ee)
{
        struct i915_request *request;
        int count;

        count = 0;
        request = first;
        list_for_each_entry_from(request, &engine->timeline.requests, link)
                count++;
        if (!count)
                return;

        ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
        if (!ee->requests)
                return;

        ee->num_requests = count;

        count = 0;
        request = first;
        list_for_each_entry_from(request, &engine->timeline.requests, link) {
                if (count >= ee->num_requests) {
                        /*
                         * If the ring request list was changed in
                         * between the point where the error request
                         * list was created and dimensioned and this
                         * point then just exit early to avoid crashes.
                         *
                         * We don't need to communicate that the
                         * request list changed state during error
                         * state capture and that the error state is
                         * slightly incorrect as a consequence since we
                         * are typically only interested in the request
                         * list state at the point of error state
                         * capture, not in any changes happening during
                         * the capture.
                         */
                        break;
                }

                record_request(request, &ee->requests[count++]);
        }
        ee->num_requests = count;
}

static void error_record_engine_execlists(struct intel_engine_cs *engine,
                                          struct drm_i915_error_engine *ee)
{
        const struct intel_engine_execlists * const execlists = &engine->execlists;
        unsigned int n;

        for (n = 0; n < execlists_num_ports(execlists); n++) {
                struct i915_request *rq = port_request(&execlists->port[n]);

                if (!rq)
                        break;

                record_request(rq, &ee->execlist[n]);
        }

        ee->num_ports = n;
}

static void record_context(struct drm_i915_error_context *e,
                           struct i915_gem_context *ctx)
{
        if (ctx->pid) {
                struct task_struct *task;

                rcu_read_lock();
                task = pid_task(ctx->pid, PIDTYPE_PID);
                if (task) {
                        strcpy(e->comm, task->comm);
                        e->pid = task->pid;
                }
                rcu_read_unlock();
        }

        e->hw_id = ctx->hw_id;
        e->sched_attr = ctx->sched;
        e->guilty = atomic_read(&ctx->guilty_count);
        e->active = atomic_read(&ctx->active_count);
}

static void request_record_user_bo(struct i915_request *request,
                                   struct drm_i915_error_engine *ee)
{
        struct i915_capture_list *c;
        struct drm_i915_error_object **bo;
        long count, max;

        max = 0;
        for (c = request->capture_list; c; c = c->next)
                max++;
        if (!max)
                return;

        bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
        if (!bo) {
                /* If we can't capture everything, try to capture something. */
                max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
                bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
        }
        if (!bo)
                return;

        count = 0;
        for (c = request->capture_list; c; c = c->next) {
                bo[count] = i915_error_object_create(request->i915, c->vma);
                if (!bo[count])
                        break;
                if (++count == max)
                        break;
        }

        ee->user_bo = bo;
        ee->user_bo_count = count;
}

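/*
 * Wrap a bare GEM object in a temporary "fake" vma so that it can be
 * snapshotted with i915_error_object_create(). node.start is set to
 * U64_MAX as a recognisably invalid GGTT offset.
 */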
1390 static struct drm_i915_error_object *
1391 capture_object(struct drm_i915_private *dev_priv,
1392                struct drm_i915_gem_object *obj)
1393 {
1394         if (obj && i915_gem_object_has_pages(obj)) {
1395                 struct i915_vma fake = {
1396                         .node = { .start = U64_MAX, .size = obj->base.size },
1397                         .size = obj->base.size,
1398                         .pages = obj->mm.pages,
1399                         .obj = obj,
1400                 };
1401
1402                 return i915_error_object_create(dev_priv, &fake);
1403         } else {
1404                 return NULL;
1405         }
1406 }
1407
1408 static void gem_record_rings(struct i915_gpu_state *error)
1409 {
1410         struct drm_i915_private *i915 = error->i915;
1411         struct i915_ggtt *ggtt = &i915->ggtt;
1412         int i;
1413
1414         for (i = 0; i < I915_NUM_ENGINES; i++) {
1415                 struct intel_engine_cs *engine = i915->engine[i];
1416                 struct drm_i915_error_engine *ee = &error->engine[i];
1417                 struct i915_request *request;
1418
1419                 ee->engine_id = -1;
1420
1421                 if (!engine)
1422                         continue;
1423
1424                 ee->engine_id = i;
1425
1426                 error_record_engine_registers(error, engine, ee);
1427                 error_record_engine_execlists(engine, ee);
1428
1429                 request = intel_engine_find_active_request(engine);
1430                 if (request) {
1431                         struct i915_gem_context *ctx = request->gem_context;
1432                         struct intel_ring *ring;
1433
1434                         ee->vm = ctx->vm ?: &ggtt->vm;
1435
1436                         record_context(&ee->context, ctx);
1437
1438                         /* We need to copy these to an anonymous buffer
1439                          * as the simplest method to avoid being overwritten
1440                          * by userspace.
1441                          */
1442                         ee->batchbuffer =
1443                                 i915_error_object_create(i915, request->batch);
1444
1445                         if (HAS_BROKEN_CS_TLB(i915))
1446                                 ee->wa_batchbuffer =
1447                                         i915_error_object_create(i915,
1448                                                                  i915->gt.scratch);
1449                         request_record_user_bo(request, ee);
1450
1451                         ee->ctx =
1452                                 i915_error_object_create(i915,
1453                                                          request->hw_context->state);
1454
1455                         error->simulated |=
1456                                 i915_gem_context_no_error_capture(ctx);
1457
1458                         ee->rq_head = request->head;
1459                         ee->rq_post = request->postfix;
1460                         ee->rq_tail = request->tail;
1461
1462                         ring = request->ring;
1463                         ee->cpu_ring_head = ring->head;
1464                         ee->cpu_ring_tail = ring->tail;
1465                         ee->ringbuffer =
1466                                 i915_error_object_create(i915, ring->vma);
1467
1468                         engine_record_requests(engine, request, ee);
1469                 }
1470
1471                 ee->hws_page =
1472                         i915_error_object_create(i915,
1473                                                  engine->status_page.vma);
1474
1475                 ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma);
1476
1477                 ee->default_state = capture_object(i915, engine->default_state);
1478         }
1479 }
1480
static void gem_capture_vm(struct i915_gpu_state *error,
			   struct i915_address_space *vm,
			   int idx)
{
	struct drm_i915_error_buffer *active_bo;
	struct i915_vma *vma;
	int count;

	count = 0;
	list_for_each_entry(vma, &vm->bound_list, vm_link)
		if (i915_vma_is_active(vma))
			count++;

	active_bo = NULL;
	if (count)
		active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
	if (active_bo)
		count = capture_error_bo(active_bo,
					 count, &vm->bound_list,
					 ACTIVE_ONLY);
	else
		count = 0;

	error->active_vm[idx] = vm;
	error->active_bo[idx] = active_bo;
	error->active_bo_count[idx] = count;
}

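/*
 * Each engine may be running in its own address space; capture the set
 * of active buffers once per unique vm rather than once per engine.
 */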
static void capture_active_buffers(struct i915_gpu_state *error)
{
	int cnt = 0, i, j;

	BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));

	/* Scan each engine looking for unique active contexts/vm */
	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		struct drm_i915_error_engine *ee = &error->engine[i];
		bool found;

		if (!ee->vm)
			continue;

		found = false;
		for (j = 0; j < i && !found; j++)
			found = error->engine[j].vm == ee->vm;
		if (!found)
			gem_capture_vm(error, ee->vm, cnt++);
	}
}

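/*
 * Record buffer objects bound into the global GTT; PINNED_ONLY restricts
 * the capture to vmas that are currently pinned.
 */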
static void capture_pinned_buffers(struct i915_gpu_state *error)
{
	struct i915_address_space *vm = &error->i915->ggtt.vm;
	struct drm_i915_error_buffer *bo;
	struct i915_vma *vma;
	int count;

	count = 0;
	list_for_each_entry(vma, &vm->bound_list, vm_link)
		count++;

	bo = NULL;
	if (count)
		bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
	if (!bo)
		return;

	error->pinned_bo_count =
		capture_error_bo(bo, count, &vm->bound_list, PINNED_ONLY);
	error->pinned_bo = bo;
}

static void capture_uc_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;
	struct i915_error_uc *error_uc = &error->uc;

	/* Capturing uC state won't be useful if there is no GuC */
	if (!error->device_info.has_guc)
		return;

	error_uc->guc_fw = i915->guc.fw;
	error_uc->huc_fw = i915->huc.fw;

	/*
	 * Non-default firmware paths will be specified by the modparam.
	 * As modparams are generally accessible from userspace, make
	 * explicit copies of the firmware paths.
	 */
	error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC);
	error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC);
	error_uc->guc_log = i915_error_object_create(i915, i915->guc.log.vma);
}

/* Capture all registers which don't fit into another category. */
static void capture_reg_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;
	struct intel_uncore *uncore = &i915->uncore;
	int i;

	/*
	 * General organization
	 * 1. Registers specific to a single generation
	 * 2. Registers which belong to multiple generations
	 * 3. Feature specific registers
	 * 4. Everything else
	 * Please try to follow the order.
	 */

	/* 1: Registers specific to a single generation */
	if (IS_VALLEYVIEW(i915)) {
		error->gtier[0] = intel_uncore_read(uncore, GTIER);
		error->ier = intel_uncore_read(uncore, VLV_IER);
		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV);
	}

	if (IS_GEN(i915, 7))
		error->err_int = intel_uncore_read(uncore, GEN7_ERR_INT);

	if (INTEL_GEN(i915) >= 8) {
		error->fault_data0 = intel_uncore_read(uncore,
						       GEN8_FAULT_TLB_DATA0);
		error->fault_data1 = intel_uncore_read(uncore,
						       GEN8_FAULT_TLB_DATA1);
	}

	if (IS_GEN(i915, 6)) {
		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE);
		error->gab_ctl = intel_uncore_read(uncore, GAB_CTL);
		error->gfx_mode = intel_uncore_read(uncore, GFX_MODE);
	}

	/* 2: Registers which belong to multiple generations */
	if (INTEL_GEN(i915) >= 7)
		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT);

	if (INTEL_GEN(i915) >= 6) {
		error->derrmr = intel_uncore_read(uncore, DERRMR);
		error->error = intel_uncore_read(uncore, ERROR_GEN6);
		error->done_reg = intel_uncore_read(uncore, DONE_REG);
	}

	if (INTEL_GEN(i915) >= 5)
		error->ccid = intel_uncore_read(uncore, CCID(RENDER_RING_BASE));

	/* 3: Feature specific registers */
	if (IS_GEN_RANGE(i915, 6, 7)) {
		error->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK);
		error->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS);
	}

	/* 4: Everything else */
	if (INTEL_GEN(i915) >= 11) {
		error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
		error->gtier[0] =
			intel_uncore_read(uncore,
					  GEN11_RENDER_COPY_INTR_ENABLE);
		error->gtier[1] =
			intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE);
		error->gtier[2] =
			intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE);
		error->gtier[3] =
			intel_uncore_read(uncore,
					  GEN11_GPM_WGBOXPERF_INTR_ENABLE);
		error->gtier[4] =
			intel_uncore_read(uncore,
					  GEN11_CRYPTO_RSVD_INTR_ENABLE);
		error->gtier[5] =
			intel_uncore_read(uncore,
					  GEN11_GUNIT_CSME_INTR_ENABLE);
		error->ngtier = 6;
	} else if (INTEL_GEN(i915) >= 8) {
		error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
		for (i = 0; i < 4; i++)
			error->gtier[i] = intel_uncore_read(uncore,
							    GEN8_GT_IER(i));
		error->ngtier = 4;
	} else if (HAS_PCH_SPLIT(i915)) {
		error->ier = intel_uncore_read(uncore, DEIER);
		error->gtier[0] = intel_uncore_read(uncore, GTIER);
		error->ngtier = 1;
	} else if (IS_GEN(i915, 2)) {
		error->ier = intel_uncore_read16(uncore, GEN2_IER);
	} else if (!IS_VALLEYVIEW(i915)) {
		error->ier = intel_uncore_read(uncore, GEN2_IER);
	}
	error->eir = intel_uncore_read(uncore, EIR);
	error->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER);
}

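/*
 * Compose the one-line summary written to the kernel log. The ecode is
 * "gen:engine-mask:hash", e.g. (illustrative values only)
 *
 *	GPU HANG: ecode 9:1:0x84df7f2c, in gears [1234]
 */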
static const char *
error_msg(struct i915_gpu_state *error,
	  intel_engine_mask_t engines, const char *msg)
{
	int len;
	int i;

	for (i = 0; i < ARRAY_SIZE(error->engine); i++)
		if (!error->engine[i].context.pid)
			engines &= ~BIT(i);

	len = scnprintf(error->error_msg, sizeof(error->error_msg),
			"GPU HANG: ecode %d:%x:0x%08x",
			INTEL_GEN(error->i915), engines,
			i915_error_generate_code(error, engines));
	if (engines) {
		/* Just show the first executing process, more is confusing */
		i = __ffs(engines);
		len += scnprintf(error->error_msg + len,
				 sizeof(error->error_msg) - len,
				 ", in %s [%d]",
				 error->engine[i].context.comm,
				 error->engine[i].context.pid);
	}
	if (msg)
		len += scnprintf(error->error_msg + len,
				 sizeof(error->error_msg) - len,
				 ", %s", msg);

	return error->error_msg;
}

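/*
 * Record the device-wide software state: power management state, reset
 * statistics and a copy of the static and runtime device info.
 */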
static void capture_gen_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;

	error->awake = i915->gt.awake;
	error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);
	error->suspended = i915->runtime_pm.suspended;

	error->iommu = -1;
#ifdef CONFIG_INTEL_IOMMU
	error->iommu = intel_iommu_gfx_mapped;
#endif
	error->reset_count = i915_reset_count(&i915->gpu_error);
	error->suspend_count = i915->suspend_count;

	memcpy(&error->device_info,
	       INTEL_INFO(i915),
	       sizeof(error->device_info));
	memcpy(&error->runtime_info,
	       RUNTIME_INFO(i915),
	       sizeof(error->runtime_info));
	error->driver_caps = i915->caps;
}

static void capture_params(struct i915_gpu_state *error)
{
	i915_params_copy(&error->params, &i915_modparams);
}

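/*
 * Pick the earliest hangcheck timestamp seen on any engine, falling back
 * to the capture time itself, as the reference point against which the
 * timestamps in the error state are reported.
 */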
static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
{
	unsigned long epoch = error->capture;
	int i;

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		const struct drm_i915_error_engine *ee = &error->engine[i];

		if (ee->hangcheck_timestamp &&
		    time_before(ee->hangcheck_timestamp, epoch))
			epoch = ee->hangcheck_timestamp;
	}

	return epoch;
}

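/*
 * The capture used a reserved slot in the global GTT to read back object
 * contents; scrub that mapping now that the capture is complete.
 */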
static void capture_finish(struct i915_gpu_state *error)
{
	struct i915_ggtt *ggtt = &error->i915->ggtt;
	const u64 slot = ggtt->error_capture.start;

	ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
}

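/*
 * capture() is invoked via stop_machine(), so it runs with every other
 * CPU spinning and interrupts disabled: nothing along this path may
 * sleep, which is why the allocations above all use GFP_ATOMIC.
 */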
static int capture(void *data)
{
	struct i915_gpu_state *error = data;

	error->time = ktime_get_real();
	error->boottime = ktime_get_boottime();
	error->uptime = ktime_sub(ktime_get(),
				  error->i915->gt.last_init_time);
	error->capture = jiffies;

	capture_params(error);
	capture_gen_state(error);
	capture_uc_state(error);
	capture_reg_state(error);
	gem_record_fences(error);
	gem_record_rings(error);
	capture_active_buffers(error);
	capture_pinned_buffers(error);

	error->overlay = intel_overlay_capture_error_state(error->i915);
	error->display = intel_display_capture_error_state(error->i915);

	error->epoch = capture_find_epoch(error);

	capture_finish(error);
	return 0;
}

#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))

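/*
 * i915->gpu_error.first_error doubles as the "capture disabled" flag: it
 * holds an ERR_PTR() once error capture has been turned off (e.g. after
 * an allocation failure), so check it before doing any work.
 */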
struct i915_gpu_state *
i915_capture_gpu_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	/* Check if GPU capture has been disabled */
	error = READ_ONCE(i915->gpu_error.first_error);
	if (IS_ERR(error))
		return error;

	error = kzalloc(sizeof(*error), GFP_ATOMIC);
	if (!error) {
		i915_disable_error_state(i915, -ENOMEM);
		return ERR_PTR(-ENOMEM);
	}

	kref_init(&error->ref);
	error->i915 = i915;

	stop_machine(capture, error, NULL);

	return error;
}

/**
 * i915_capture_error_state - capture an error record for later analysis
 * @i915: i915 device
 * @engine_mask: the mask of engines triggering the hang
 * @msg: a message to insert into the error capture header
 *
 * Should be called when an error is detected (either a hang or an error
 * interrupt) to capture error state from the time of the error. Fills
 * out a structure which becomes available in debugfs for user level tools
 * to pick up.
 */
void i915_capture_error_state(struct drm_i915_private *i915,
			      intel_engine_mask_t engine_mask,
			      const char *msg)
{
	static bool warned;
	struct i915_gpu_state *error;
	unsigned long flags;

	if (!i915_modparams.error_capture)
		return;

	if (READ_ONCE(i915->gpu_error.first_error))
		return;

	error = i915_capture_gpu_state(i915);
	if (IS_ERR(error))
		return;

	dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));

	if (!error->simulated) {
		spin_lock_irqsave(&i915->gpu_error.lock, flags);
		if (!i915->gpu_error.first_error) {
			i915->gpu_error.first_error = error;
			error = NULL;
		}
		spin_unlock_irqrestore(&i915->gpu_error.lock, flags);
	}

	if (error) {
		__i915_gpu_state_free(&error->ref);
		return;
	}

	if (!warned &&
	    ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) {
		DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
		DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
		DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
		DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
		DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
			 i915->drm.primary->index);
		warned = true;
	}
}

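/*
 * Illustrative call site (a sketch, not code from this file): a reset
 * path that has identified the hung engines would report them before
 * performing the reset, along the lines of
 *
 *	i915_capture_error_state(i915, hung_mask, "hang on rcs0");
 *
 * The capture is skipped if error capture is disabled via modparam or an
 * earlier, unread error state is still pending.
 */

/*
 * Return a reference to the first captured-and-unread error state, or
 * NULL/ERR_PTR() if nothing has been captured / capture is disabled.
 */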
struct i915_gpu_state *
i915_first_error_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
	if (!IS_ERR_OR_NULL(error))
		i915_gpu_state_get(error);
	spin_unlock_irq(&i915->gpu_error.lock);

	return error;
}

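/*
 * Drop the currently stored error state so that a subsequent hang can be
 * captured. ERR_PTR(-ENODEV) is sticky: if capture was disabled outright
 * it stays disabled.
 */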
void i915_reset_error_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
	if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */
		i915->gpu_error.first_error = NULL;
	spin_unlock_irq(&i915->gpu_error.lock);

	if (!IS_ERR_OR_NULL(error))
		i915_gpu_state_put(error);
}

void i915_disable_error_state(struct drm_i915_private *i915, int err)
{
	spin_lock_irq(&i915->gpu_error.lock);
	if (!i915->gpu_error.first_error)
		i915->gpu_error.first_error = ERR_PTR(err);
	spin_unlock_irq(&i915->gpu_error.lock);
}