/*
 * Copyright (c) 2008 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *    Keith Packard <keithp@keithp.com>
 *    Mika Kuoppala <mika.kuoppala@intel.com>
 */

#include <linux/ascii85.h>
#include <linux/nmi.h>
#include <linux/scatterlist.h>
#include <linux/stop_machine.h>
#include <linux/utsname.h>
#include <linux/zlib.h>

#include <drm/drm_print.h>

#include "gem/i915_gem_context.h"

#include "i915_drv.h"
#include "i915_gpu_error.h"
#include "i915_scatterlist.h"
#include "intel_atomic.h"
#include "intel_csr.h"
#include "intel_overlay.h"

static inline const struct intel_engine_cs *
engine_lookup(const struct drm_i915_private *i915, unsigned int id)
{
        if (id >= I915_NUM_ENGINES)
                return NULL;

        return i915->engine[id];
}

static inline const char *
__engine_name(const struct intel_engine_cs *engine)
{
        return engine ? engine->name : "";
}

static const char *
engine_name(const struct drm_i915_private *i915, unsigned int id)
{
        return __engine_name(engine_lookup(i915, id));
}

static const char *tiling_flag(int tiling)
{
        switch (tiling) {
        default:
        case I915_TILING_NONE: return "";
        case I915_TILING_X: return " X";
        case I915_TILING_Y: return " Y";
        }
}

static const char *dirty_flag(int dirty)
{
        return dirty ? " dirty" : "";
}

static const char *purgeable_flag(int purgeable)
{
        return purgeable ? " purgeable" : "";
}

static void __sg_set_buf(struct scatterlist *sg,
                         void *addr, unsigned int len, loff_t it)
{
        sg->page_link = (unsigned long)virt_to_page(addr);
        sg->offset = offset_in_page(addr);
        sg->length = len;
        sg->dma_address = it;
}

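/*
 * The error state text is accumulated in ordinary kmalloc'd buffers that
 * are stitched into a scatterlist as each one fills, so an arbitrarily
 * large dump never needs a single large contiguous allocation. On
 * overflow, __i915_error_grow() retires the current buffer into the sg
 * table and allocates a replacement, preferring a large allocation but
 * falling back to page granularity under memory pressure.
 */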
static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
{
        if (!len)
                return false;

        if (e->bytes + len + 1 <= e->size)
                return true;

        if (e->bytes) {
                __sg_set_buf(e->cur++, e->buf, e->bytes, e->iter);
                e->iter += e->bytes;
                e->buf = NULL;
                e->bytes = 0;
        }

        if (e->cur == e->end) {
                struct scatterlist *sgl;

                sgl = (typeof(sgl))__get_free_page(GFP_KERNEL);
                if (!sgl) {
                        e->err = -ENOMEM;
                        return false;
                }

                if (e->cur) {
                        e->cur->offset = 0;
                        e->cur->length = 0;
                        e->cur->page_link =
                                (unsigned long)sgl | SG_CHAIN;
                } else {
                        e->sgl = sgl;
                }

                e->cur = sgl;
                e->end = sgl + SG_MAX_SINGLE_ALLOC - 1;
        }

        e->size = ALIGN(len + 1, SZ_64K);
        e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
        if (!e->buf) {
                e->size = PAGE_ALIGN(len + 1);
                e->buf = kmalloc(e->size, GFP_KERNEL);
        }
        if (!e->buf) {
                e->err = -ENOMEM;
                return false;
        }

        return true;
}

static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
                               const char *fmt, va_list args)
{
        va_list ap;
        int len;

        if (e->err)
                return;

        va_copy(ap, args);
        len = vsnprintf(NULL, 0, fmt, ap);
        va_end(ap);
        if (len <= 0) {
                e->err = len;
                return;
        }

        if (!__i915_error_grow(e, len))
                return;

        GEM_BUG_ON(e->bytes >= e->size);
        len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args);
        if (len < 0) {
                e->err = len;
                return;
        }
        e->bytes += len;
}

static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str)
{
        unsigned len;

        if (e->err || !str)
                return;

        len = strlen(str);
        if (!__i915_error_grow(e, len))
                return;

        GEM_BUG_ON(e->bytes + len > e->size);
        memcpy(e->buf + e->bytes, str, len);
        e->bytes += len;
}

#define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
#define err_puts(e, s) i915_error_puts(e, s)

static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
{
        i915_error_vprintf(p->arg, vaf->fmt, *vaf->va);
}

static inline struct drm_printer
i915_error_printer(struct drm_i915_error_state_buf *e)
{
        struct drm_printer p = {
                .printfn = __i915_printfn_error,
                .arg = e,
        };
        return p;
}

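/*
 * Two interchangeable page-capture backends follow: with
 * CONFIG_DRM_I915_COMPRESS_ERROR enabled, captured pages are deflated
 * with zlib (all allocations are GFP_ATOMIC, as capture runs in atomic
 * context); otherwise each page is copied verbatim.
 * err_compression_marker() distinguishes the two encodings in the dump
 * (":" for compressed, "~" for raw).
 */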
#ifdef CONFIG_DRM_I915_COMPRESS_ERROR

struct compress {
        struct z_stream_s zstream;
        void *tmp;
};

static bool compress_init(struct compress *c)
{
        struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream));

        zstream->workspace =
                kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
                        GFP_ATOMIC | __GFP_NOWARN);
        if (!zstream->workspace)
                return false;

        if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
                kfree(zstream->workspace);
                return false;
        }

        c->tmp = NULL;
        if (i915_has_memcpy_from_wc())
                c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);

        return true;
}

static void *compress_next_page(struct drm_i915_error_object *dst)
{
        unsigned long page;

        if (dst->page_count >= dst->num_pages)
                return ERR_PTR(-ENOSPC);

        page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
        if (!page)
                return ERR_PTR(-ENOMEM);

        return dst->pages[dst->page_count++] = (void *)page;
}

static int compress_page(struct compress *c,
                         void *src,
                         struct drm_i915_error_object *dst)
{
        struct z_stream_s *zstream = &c->zstream;

        zstream->next_in = src;
        if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
                zstream->next_in = c->tmp;
        zstream->avail_in = PAGE_SIZE;

        do {
                if (zstream->avail_out == 0) {
                        zstream->next_out = compress_next_page(dst);
                        if (IS_ERR(zstream->next_out))
                                return PTR_ERR(zstream->next_out);

                        zstream->avail_out = PAGE_SIZE;
                }

                if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
                        return -EIO;

                touch_nmi_watchdog();
        } while (zstream->avail_in);

        /* Fallback to uncompressed if we increase size? */
        if (0 && zstream->total_out > zstream->total_in)
                return -E2BIG;

        return 0;
}

static int compress_flush(struct compress *c,
                          struct drm_i915_error_object *dst)
{
        struct z_stream_s *zstream = &c->zstream;

        do {
                switch (zlib_deflate(zstream, Z_FINISH)) {
                case Z_OK: /* more space requested */
                        zstream->next_out = compress_next_page(dst);
                        if (IS_ERR(zstream->next_out))
                                return PTR_ERR(zstream->next_out);

                        zstream->avail_out = PAGE_SIZE;
                        break;

                case Z_STREAM_END:
                        goto end;

                default: /* any error */
                        return -EIO;
                }
        } while (1);

end:
        memset(zstream->next_out, 0, zstream->avail_out);
        dst->unused = zstream->avail_out;
        return 0;
}

static void compress_fini(struct compress *c,
                          struct drm_i915_error_object *dst)
{
        struct z_stream_s *zstream = &c->zstream;

        zlib_deflateEnd(zstream);
        kfree(zstream->workspace);

        if (c->tmp)
                free_page((unsigned long)c->tmp);
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
        err_puts(m, ":");
}

#else

struct compress {
};

static bool compress_init(struct compress *c)
{
        return true;
}

static int compress_page(struct compress *c,
                         void *src,
                         struct drm_i915_error_object *dst)
{
        unsigned long page;
        void *ptr;

        page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
        if (!page)
                return -ENOMEM;

        ptr = (void *)page;
        if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
                memcpy(ptr, src, PAGE_SIZE);
        dst->pages[dst->page_count++] = ptr;

        return 0;
}

static int compress_flush(struct compress *c,
                          struct drm_i915_error_object *dst)
{
        return 0;
}

static void compress_fini(struct compress *c,
                          struct drm_i915_error_object *dst)
{
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
        err_puts(m, "~");
}

#endif

static void print_error_buffers(struct drm_i915_error_state_buf *m,
                                const char *name,
                                struct drm_i915_error_buffer *err,
                                int count)
{
        err_printf(m, "%s [%d]:\n", name, count);

        while (count--) {
                err_printf(m, "    %08x_%08x %8u %02x %02x",
                           upper_32_bits(err->gtt_offset),
                           lower_32_bits(err->gtt_offset),
                           err->size,
                           err->read_domains,
                           err->write_domain);
                err_puts(m, tiling_flag(err->tiling));
                err_puts(m, dirty_flag(err->dirty));
                err_puts(m, purgeable_flag(err->purgeable));
                err_puts(m, err->userptr ? " userptr" : "");
                err_puts(m, i915_cache_level_str(m->i915, err->cache_level));

                if (err->name)
                        err_printf(m, " (name: %d)", err->name);
                if (err->fence_reg != I915_FENCE_REG_NONE)
                        err_printf(m, " (fence: %d)", err->fence_reg);

                err_puts(m, "\n");
                err++;
        }
}

static void error_print_instdone(struct drm_i915_error_state_buf *m,
                                 const struct drm_i915_error_engine *ee)
{
        int slice;
        int subslice;

        err_printf(m, "  INSTDONE: 0x%08x\n",
                   ee->instdone.instdone);

        if (ee->engine_id != RCS0 || INTEL_GEN(m->i915) <= 3)
                return;

        err_printf(m, "  SC_INSTDONE: 0x%08x\n",
                   ee->instdone.slice_common);

        if (INTEL_GEN(m->i915) <= 6)
                return;

        for_each_instdone_slice_subslice(m->i915, slice, subslice)
                err_printf(m, "  SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
                           slice, subslice,
                           ee->instdone.sampler[slice][subslice]);

        for_each_instdone_slice_subslice(m->i915, slice, subslice)
                err_printf(m, "  ROW_INSTDONE[%d][%d]: 0x%08x\n",
                           slice, subslice,
                           ee->instdone.row[slice][subslice]);
}

static void error_print_request(struct drm_i915_error_state_buf *m,
                                const char *prefix,
                                const struct drm_i915_error_request *erq,
                                const unsigned long epoch)
{
        if (!erq->seqno)
                return;

        err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
                   prefix, erq->pid, erq->context, erq->seqno,
                   test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
                            &erq->flags) ? "!" : "",
                   test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
                            &erq->flags) ? "+" : "",
                   erq->sched_attr.priority,
                   jiffies_to_msecs(erq->jiffies - epoch),
                   erq->start, erq->head, erq->tail);
}

static void error_print_context(struct drm_i915_error_state_buf *m,
                                const char *header,
                                const struct drm_i915_error_context *ctx)
{
        err_printf(m, "%s%s[%d] hw_id %d, prio %d, guilty %d active %d\n",
                   header, ctx->comm, ctx->pid, ctx->hw_id,
                   ctx->sched_attr.priority, ctx->guilty, ctx->active);
}

static void error_print_engine(struct drm_i915_error_state_buf *m,
                               const struct drm_i915_error_engine *ee,
                               const unsigned long epoch)
{
        int n;

        err_printf(m, "%s command stream:\n",
                   engine_name(m->i915, ee->engine_id));
        err_printf(m, "  IDLE?: %s\n", yesno(ee->idle));
        err_printf(m, "  START: 0x%08x\n", ee->start);
        err_printf(m, "  HEAD:  0x%08x [0x%08x]\n", ee->head, ee->rq_head);
        err_printf(m, "  TAIL:  0x%08x [0x%08x, 0x%08x]\n",
                   ee->tail, ee->rq_post, ee->rq_tail);
        err_printf(m, "  CTL:   0x%08x\n", ee->ctl);
        err_printf(m, "  MODE:  0x%08x\n", ee->mode);
        err_printf(m, "  HWS:   0x%08x\n", ee->hws);
        err_printf(m, "  ACTHD: 0x%08x %08x\n",
                   (u32)(ee->acthd>>32), (u32)ee->acthd);
        err_printf(m, "  IPEIR: 0x%08x\n", ee->ipeir);
        err_printf(m, "  IPEHR: 0x%08x\n", ee->ipehr);

        error_print_instdone(m, ee);

        if (ee->batchbuffer) {
                u64 start = ee->batchbuffer->gtt_offset;
                u64 end = start + ee->batchbuffer->gtt_size;

                err_printf(m, "  batch: [0x%08x_%08x, 0x%08x_%08x]\n",
                           upper_32_bits(start), lower_32_bits(start),
                           upper_32_bits(end), lower_32_bits(end));
        }
        if (INTEL_GEN(m->i915) >= 4) {
                err_printf(m, "  BBADDR: 0x%08x_%08x\n",
                           (u32)(ee->bbaddr>>32), (u32)ee->bbaddr);
                err_printf(m, "  BB_STATE: 0x%08x\n", ee->bbstate);
                err_printf(m, "  INSTPS: 0x%08x\n", ee->instps);
        }
        err_printf(m, "  INSTPM: 0x%08x\n", ee->instpm);
        err_printf(m, "  FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr),
                   lower_32_bits(ee->faddr));
        if (INTEL_GEN(m->i915) >= 6) {
                err_printf(m, "  RC PSMI: 0x%08x\n", ee->rc_psmi);
                err_printf(m, "  FAULT_REG: 0x%08x\n", ee->fault_reg);
        }
        if (HAS_PPGTT(m->i915)) {
                err_printf(m, "  GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);

                if (INTEL_GEN(m->i915) >= 8) {
                        int i;
                        for (i = 0; i < 4; i++)
                                err_printf(m, "  PDP%d: 0x%016llx\n",
                                           i, ee->vm_info.pdp[i]);
                } else {
                        err_printf(m, "  PP_DIR_BASE: 0x%08x\n",
                                   ee->vm_info.pp_dir_base);
                }
        }
        err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
        err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
        err_printf(m, "  hangcheck timestamp: %dms (%lu%s)\n",
                   jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
                   ee->hangcheck_timestamp,
                   ee->hangcheck_timestamp == epoch ? "; epoch" : "");
        err_printf(m, "  engine reset count: %u\n", ee->reset_count);

        for (n = 0; n < ee->num_ports; n++) {
                err_printf(m, "  ELSP[%d]:", n);
                error_print_request(m, " ", &ee->execlist[n], epoch);
        }

        error_print_context(m, "  Active context: ", &ee->context);
}

void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{
        va_list args;

        va_start(args, f);
        i915_error_vprintf(e, f, args);
        va_end(args);
}

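/*
 * Object contents are emitted as ascii85, one encoded word per 32 bits of
 * page data, preceded by the compression marker so the decoder knows
 * whether the stream needs to be inflated first.
 */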
static void print_error_obj(struct drm_i915_error_state_buf *m,
                            struct intel_engine_cs *engine,
                            const char *name,
                            struct drm_i915_error_object *obj)
{
        char out[ASCII85_BUFSZ];
        int page;

        if (!obj)
                return;

        if (name) {
                err_printf(m, "%s --- %s = 0x%08x %08x\n",
                           engine ? engine->name : "global", name,
                           upper_32_bits(obj->gtt_offset),
                           lower_32_bits(obj->gtt_offset));
        }

        err_compression_marker(m);
        for (page = 0; page < obj->page_count; page++) {
                int i, len;

                len = PAGE_SIZE;
                if (page == obj->page_count - 1)
                        len -= obj->unused;
                len = ascii85_encode_len(len);

                for (i = 0; i < len; i++)
                        err_puts(m, ascii85_encode(obj->pages[page][i], out));
        }
        err_puts(m, "\n");
}

static void err_print_capabilities(struct drm_i915_error_state_buf *m,
                                   const struct intel_device_info *info,
                                   const struct intel_runtime_info *runtime,
                                   const struct intel_driver_caps *caps)
{
        struct drm_printer p = i915_error_printer(m);

        intel_device_info_dump_flags(info, &p);
        intel_driver_caps_print(caps, &p);
        intel_device_info_dump_topology(&runtime->sseu, &p);
}

static void err_print_params(struct drm_i915_error_state_buf *m,
                             const struct i915_params *params)
{
        struct drm_printer p = i915_error_printer(m);

        i915_params_dump(params, &p);
}

static void err_print_pciid(struct drm_i915_error_state_buf *m,
                            struct drm_i915_private *i915)
{
        struct pci_dev *pdev = i915->drm.pdev;

        err_printf(m, "PCI ID: 0x%04x\n", pdev->device);
        err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision);
        err_printf(m, "PCI Subsystem: %04x:%04x\n",
                   pdev->subsystem_vendor,
                   pdev->subsystem_device);
}

static void err_print_uc(struct drm_i915_error_state_buf *m,
                         const struct i915_error_uc *error_uc)
{
        struct drm_printer p = i915_error_printer(m);
        const struct i915_gpu_state *error =
                container_of(error_uc, typeof(*error), uc);

        if (!error->device_info.has_guc)
                return;

        intel_uc_fw_dump(&error_uc->guc_fw, &p);
        intel_uc_fw_dump(&error_uc->huc_fw, &p);
        print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log);
}

static void err_free_sgl(struct scatterlist *sgl)
{
        while (sgl) {
                struct scatterlist *sg;

                for (sg = sgl; !sg_is_chain(sg); sg++) {
                        kfree(sg_virt(sg));
                        if (sg_is_last(sg))
                                break;
                }

                sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg);
                free_page((unsigned long)sgl);
                sgl = sg;
        }
}

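/*
 * Top-level pretty-printer: serialise every captured piece of state
 * (headers, registers, per-engine state, buffer lists and object
 * contents) into the sg-backed error_state_buf.
 */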
static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
                               struct i915_gpu_state *error)
{
        struct drm_i915_error_object *obj;
        struct timespec64 ts;
        int i, j;

        if (*error->error_msg)
                err_printf(m, "%s\n", error->error_msg);
        err_printf(m, "Kernel: %s %s\n",
                   init_utsname()->release,
                   init_utsname()->machine);
        ts = ktime_to_timespec64(error->time);
        err_printf(m, "Time: %lld s %ld us\n",
                   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
        ts = ktime_to_timespec64(error->boottime);
        err_printf(m, "Boottime: %lld s %ld us\n",
                   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
        ts = ktime_to_timespec64(error->uptime);
        err_printf(m, "Uptime: %lld s %ld us\n",
                   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
        err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ);
        err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n",
                   error->capture,
                   jiffies_to_msecs(jiffies - error->capture),
                   jiffies_to_msecs(error->capture - error->epoch));

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                if (!error->engine[i].context.pid)
                        continue;

                err_printf(m, "Active process (on ring %s): %s [%d]\n",
                           engine_name(m->i915, i),
                           error->engine[i].context.comm,
                           error->engine[i].context.pid);
        }
        err_printf(m, "Reset count: %u\n", error->reset_count);
        err_printf(m, "Suspend count: %u\n", error->suspend_count);
        err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
        err_printf(m, "Subplatform: 0x%x\n",
                   intel_subplatform(&error->runtime_info,
                                     error->device_info.platform));
        err_print_pciid(m, m->i915);

        err_printf(m, "IOMMU enabled?: %d\n", error->iommu);

        if (HAS_CSR(m->i915)) {
                struct intel_csr *csr = &m->i915->csr;

                err_printf(m, "DMC loaded: %s\n",
                           yesno(csr->dmc_payload != NULL));
                err_printf(m, "DMC fw version: %d.%d\n",
                           CSR_VERSION_MAJOR(csr->version),
                           CSR_VERSION_MINOR(csr->version));
        }

        err_printf(m, "GT awake: %s\n", yesno(error->awake));
        err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock));
        err_printf(m, "PM suspended: %s\n", yesno(error->suspended));
        err_printf(m, "EIR: 0x%08x\n", error->eir);
        err_printf(m, "IER: 0x%08x\n", error->ier);
        for (i = 0; i < error->ngtier; i++)
                err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]);
        err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
        err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
        err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
        err_printf(m, "CCID: 0x%08x\n", error->ccid);

        for (i = 0; i < error->nfence; i++)
                err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);

        if (INTEL_GEN(m->i915) >= 6) {
                err_printf(m, "ERROR: 0x%08x\n", error->error);

                if (INTEL_GEN(m->i915) >= 8)
                        err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
                                   error->fault_data1, error->fault_data0);

                err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg);
        }

        if (IS_GEN(m->i915, 7))
                err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                if (error->engine[i].engine_id != -1)
                        error_print_engine(m, &error->engine[i], error->epoch);
        }

        for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) {
                char buf[128];
                int len, first = 1;

                if (!error->active_vm[i])
                        break;

                len = scnprintf(buf, sizeof(buf), "Active (");
                for (j = 0; j < ARRAY_SIZE(error->engine); j++) {
                        if (error->engine[j].vm != error->active_vm[i])
                                continue;

                        len += scnprintf(buf + len, sizeof(buf), "%s%s",
                                         first ? "" : ", ",
                                         m->i915->engine[j]->name);
                        first = 0;
                }
                scnprintf(buf + len, sizeof(buf), ")");
                print_error_buffers(m, buf,
                                    error->active_bo[i],
                                    error->active_bo_count[i]);
        }

        print_error_buffers(m, "Pinned (global)",
                            error->pinned_bo,
                            error->pinned_bo_count);

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                const struct drm_i915_error_engine *ee = &error->engine[i];

                obj = ee->batchbuffer;
                if (obj) {
                        err_puts(m, m->i915->engine[i]->name);
                        if (ee->context.pid)
                                err_printf(m, " (submitted by %s [%d])",
                                           ee->context.comm,
                                           ee->context.pid);
                        err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
                                   upper_32_bits(obj->gtt_offset),
                                   lower_32_bits(obj->gtt_offset));
                        print_error_obj(m, m->i915->engine[i], NULL, obj);
                }

                for (j = 0; j < ee->user_bo_count; j++)
                        print_error_obj(m, m->i915->engine[i],
                                        "user", ee->user_bo[j]);

                if (ee->num_requests) {
                        err_printf(m, "%s --- %d requests\n",
                                   m->i915->engine[i]->name,
                                   ee->num_requests);
                        for (j = 0; j < ee->num_requests; j++)
                                error_print_request(m, " ",
                                                    &ee->requests[j],
                                                    error->epoch);
                }

                print_error_obj(m, m->i915->engine[i],
                                "ringbuffer", ee->ringbuffer);

                print_error_obj(m, m->i915->engine[i],
                                "HW Status", ee->hws_page);

                print_error_obj(m, m->i915->engine[i],
                                "HW context", ee->ctx);

                print_error_obj(m, m->i915->engine[i],
                                "WA context", ee->wa_ctx);

                print_error_obj(m, m->i915->engine[i],
                                "WA batchbuffer", ee->wa_batchbuffer);

                print_error_obj(m, m->i915->engine[i],
                                "NULL context", ee->default_state);
        }

        if (error->overlay)
                intel_overlay_print_error_state(m, error->overlay);

        if (error->display)
                intel_display_print_error_state(m, error->display);

        err_print_capabilities(m, &error->device_info, &error->runtime_info,
                               &error->driver_caps);
        err_print_params(m, &error->params);
        err_print_uc(m, &error->uc);
}

static int err_print_to_sgl(struct i915_gpu_state *error)
{
        struct drm_i915_error_state_buf m;

        if (IS_ERR(error))
                return PTR_ERR(error);

        if (READ_ONCE(error->sgl))
                return 0;

        memset(&m, 0, sizeof(m));
        m.i915 = error->i915;

        __err_print_to_sgl(&m, error);

        if (m.buf) {
                __sg_set_buf(m.cur++, m.buf, m.bytes, m.iter);
                m.bytes = 0;
                m.buf = NULL;
        }
        if (m.cur) {
                GEM_BUG_ON(m.end < m.cur);
                sg_mark_end(m.cur - 1);
        }
        GEM_BUG_ON(m.sgl && !m.cur);

        if (m.err) {
                err_free_sgl(m.sgl);
                return m.err;
        }

        if (cmpxchg(&error->sgl, NULL, m.sgl))
                err_free_sgl(m.sgl);

        return 0;
}

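/*
 * Copy a chunk of the pre-rendered error state into a caller-sized
 * buffer, walking the scatterlist from the position cached in error->fit
 * so that sequential reads of /sys/class/drm/cardN/error do not rescan
 * the whole chain from the start each time.
 */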
ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error,
                                      char *buf, loff_t off, size_t rem)
{
        struct scatterlist *sg;
        size_t count;
        loff_t pos;
        int err;

        if (!error || !rem)
                return 0;

        err = err_print_to_sgl(error);
        if (err)
                return err;

        sg = READ_ONCE(error->fit);
        if (!sg || off < sg->dma_address)
                sg = error->sgl;
        if (!sg)
                return 0;

        pos = sg->dma_address;
        count = 0;
        do {
                size_t len, start;

                if (sg_is_chain(sg)) {
                        sg = sg_chain_ptr(sg);
                        GEM_BUG_ON(sg_is_chain(sg));
                }

                len = sg->length;
                if (pos + len <= off) {
                        pos += len;
                        continue;
                }

                start = sg->offset;
                if (pos < off) {
                        GEM_BUG_ON(off - pos > len);
                        len -= off - pos;
                        start += off - pos;
                        pos = off;
                }

                len = min(len, rem);
                GEM_BUG_ON(!len || len > sg->length);

                memcpy(buf, page_address(sg_page(sg)) + start, len);

                count += len;
                pos += len;

                buf += len;
                rem -= len;
                if (!rem) {
                        WRITE_ONCE(error->fit, sg);
                        break;
                }
        } while (!sg_is_last(sg++));

        return count;
}

static void i915_error_object_free(struct drm_i915_error_object *obj)
{
        int page;

        if (obj == NULL)
                return;

        for (page = 0; page < obj->page_count; page++)
                free_page((unsigned long)obj->pages[page]);

        kfree(obj);
}

static void cleanup_params(struct i915_gpu_state *error)
{
        i915_params_free(&error->params);
}

static void cleanup_uc_state(struct i915_gpu_state *error)
{
        struct i915_error_uc *error_uc = &error->uc;

        kfree(error_uc->guc_fw.path);
        kfree(error_uc->huc_fw.path);
        i915_error_object_free(error_uc->guc_log);
}

void __i915_gpu_state_free(struct kref *error_ref)
{
        struct i915_gpu_state *error =
                container_of(error_ref, typeof(*error), ref);
        long i, j;

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                struct drm_i915_error_engine *ee = &error->engine[i];

                for (j = 0; j < ee->user_bo_count; j++)
                        i915_error_object_free(ee->user_bo[j]);
                kfree(ee->user_bo);

                i915_error_object_free(ee->batchbuffer);
                i915_error_object_free(ee->wa_batchbuffer);
                i915_error_object_free(ee->ringbuffer);
                i915_error_object_free(ee->hws_page);
                i915_error_object_free(ee->ctx);
                i915_error_object_free(ee->wa_ctx);

                kfree(ee->requests);
        }

        for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
                kfree(error->active_bo[i]);
        kfree(error->pinned_bo);

        kfree(error->overlay);
        kfree(error->display);

        cleanup_params(error);
        cleanup_uc_state(error);

        err_free_sgl(error->sgl);
        kfree(error);
}

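/*
 * Snapshot the contents of a vma: each backing page is bound, one at a
 * time, into the reserved GGTT error-capture slot and copied (optionally
 * compressed) through an atomic WC mapping. num_pages is inflated by
 * 10/8 to leave headroom for worst-case zlib expansion.
 */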
static struct drm_i915_error_object *
i915_error_object_create(struct drm_i915_private *i915,
                         struct i915_vma *vma)
{
        struct i915_ggtt *ggtt = &i915->ggtt;
        const u64 slot = ggtt->error_capture.start;
        struct drm_i915_error_object *dst;
        struct compress compress;
        unsigned long num_pages;
        struct sgt_iter iter;
        dma_addr_t dma;
        int ret;

        if (!vma || !vma->pages)
                return NULL;

        num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
        num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
        dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
                      GFP_ATOMIC | __GFP_NOWARN);
        if (!dst)
                return NULL;

        dst->gtt_offset = vma->node.start;
        dst->gtt_size = vma->node.size;
        dst->num_pages = num_pages;
        dst->page_count = 0;
        dst->unused = 0;

        if (!compress_init(&compress)) {
                kfree(dst);
                return NULL;
        }

        ret = -EINVAL;
        for_each_sgt_dma(dma, iter, vma->pages) {
                void __iomem *s;

                ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);

                s = io_mapping_map_atomic_wc(&ggtt->iomap, slot);
                ret = compress_page(&compress, (void  __force *)s, dst);
                io_mapping_unmap_atomic(s);
                if (ret)
                        break;
        }

        if (ret || compress_flush(&compress, dst)) {
                while (dst->page_count--)
                        free_page((unsigned long)dst->pages[dst->page_count]);
                kfree(dst);
                dst = NULL;
        }

        compress_fini(&compress, dst);
        return dst;
}

static void capture_bo(struct drm_i915_error_buffer *err,
                       struct i915_vma *vma)
{
        struct drm_i915_gem_object *obj = vma->obj;

        err->size = obj->base.size;
        err->name = obj->base.name;

        err->gtt_offset = vma->node.start;
        err->read_domains = obj->read_domains;
        err->write_domain = obj->write_domain;
        err->fence_reg = vma->fence ? vma->fence->id : -1;
        err->tiling = i915_gem_object_get_tiling(obj);
        err->dirty = obj->mm.dirty;
        err->purgeable = obj->mm.madv != I915_MADV_WILLNEED;
        err->userptr = obj->userptr.mm != NULL;
        err->cache_level = obj->cache_level;
}

static u32 capture_error_bo(struct drm_i915_error_buffer *err,
                            int count, struct list_head *head,
                            unsigned int flags)
#define ACTIVE_ONLY BIT(0)
#define PINNED_ONLY BIT(1)
{
        struct i915_vma *vma;
        int i = 0;

        list_for_each_entry(vma, head, vm_link) {
                if (!vma->obj)
                        continue;

                if (flags & ACTIVE_ONLY && !i915_vma_is_active(vma))
                        continue;

                if (flags & PINNED_ONLY && !i915_vma_is_pinned(vma))
                        continue;

                capture_bo(err++, vma);
                if (++i == count)
                        break;
        }

        return i;
}

/*
 * Generate a semi-unique error code. The code is not meant to have meaning;
 * its only purpose is to try to prevent false duplicated bug reports by
 * grossly estimating a GPU error state.
 *
 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
 * the hang if we could strip the GTT offset information from it.
 *
 * It's only a small step better than a random number in its current form.
 */
static u32 i915_error_generate_code(struct i915_gpu_state *error,
                                    intel_engine_mask_t engine_mask)
{
        /*
         * IPEHR would be an ideal way to detect errors, as it's the gross
         * measure of "the command that hung". However, it contains some very
         * common synchronization commands which almost always appear in the
         * case of strictly a client bug. Use instdone to differentiate those
         * somewhat.
         */
        if (engine_mask) {
                struct drm_i915_error_engine *ee =
                        &error->engine[ffs(engine_mask) - 1]; /* ffs() is 1-based */

                return ee->ipehr ^ ee->instdone.instdone;
        }

        return 0;
}

static void gem_record_fences(struct i915_gpu_state *error)
{
        struct drm_i915_private *dev_priv = error->i915;
        struct intel_uncore *uncore = &dev_priv->uncore;
        int i;

        if (INTEL_GEN(dev_priv) >= 6) {
                for (i = 0; i < dev_priv->ggtt.num_fences; i++)
                        error->fence[i] =
                                intel_uncore_read64(uncore,
                                                    FENCE_REG_GEN6_LO(i));
        } else if (INTEL_GEN(dev_priv) >= 4) {
                for (i = 0; i < dev_priv->ggtt.num_fences; i++)
                        error->fence[i] =
                                intel_uncore_read64(uncore,
                                                    FENCE_REG_965_LO(i));
        } else {
                for (i = 0; i < dev_priv->ggtt.num_fences; i++)
                        error->fence[i] =
                                intel_uncore_read(uncore, FENCE_REG(i));
        }
        error->nfence = i;
}

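/* Sample the per-engine MMIO state at the instant of the hang. */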
static void error_record_engine_registers(struct i915_gpu_state *error,
                                          struct intel_engine_cs *engine,
                                          struct drm_i915_error_engine *ee)
{
        struct drm_i915_private *dev_priv = engine->i915;

        if (INTEL_GEN(dev_priv) >= 6) {
                ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL);
                if (INTEL_GEN(dev_priv) >= 8)
                        ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG);
                else
                        ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine);
        }

        if (INTEL_GEN(dev_priv) >= 4) {
                ee->faddr = ENGINE_READ(engine, RING_DMA_FADD);
                ee->ipeir = ENGINE_READ(engine, RING_IPEIR);
                ee->ipehr = ENGINE_READ(engine, RING_IPEHR);
                ee->instps = ENGINE_READ(engine, RING_INSTPS);
                ee->bbaddr = ENGINE_READ(engine, RING_BBADDR);
                if (INTEL_GEN(dev_priv) >= 8) {
                        ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32;
                        ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32;
                }
                ee->bbstate = ENGINE_READ(engine, RING_BBSTATE);
        } else {
                ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX);
                ee->ipeir = ENGINE_READ(engine, IPEIR);
                ee->ipehr = ENGINE_READ(engine, IPEHR);
        }

        intel_engine_get_instdone(engine, &ee->instdone);

        ee->instpm = ENGINE_READ(engine, RING_INSTPM);
        ee->acthd = intel_engine_get_active_head(engine);
        ee->start = ENGINE_READ(engine, RING_START);
        ee->head = ENGINE_READ(engine, RING_HEAD);
        ee->tail = ENGINE_READ(engine, RING_TAIL);
        ee->ctl = ENGINE_READ(engine, RING_CTL);
        if (INTEL_GEN(dev_priv) > 2)
                ee->mode = ENGINE_READ(engine, RING_MI_MODE);

        if (!HWS_NEEDS_PHYSICAL(dev_priv)) {
                i915_reg_t mmio;

                if (IS_GEN(dev_priv, 7)) {
                        switch (engine->id) {
                        default:
                                MISSING_CASE(engine->id);
                        case RCS0:
                                mmio = RENDER_HWS_PGA_GEN7;
                                break;
                        case BCS0:
                                mmio = BLT_HWS_PGA_GEN7;
                                break;
                        case VCS0:
                                mmio = BSD_HWS_PGA_GEN7;
                                break;
                        case VECS0:
                                mmio = VEBOX_HWS_PGA_GEN7;
                                break;
                        }
                } else if (IS_GEN(engine->i915, 6)) {
                        mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
                } else {
                        /* XXX: gen8 returns to sanity */
                        mmio = RING_HWS_PGA(engine->mmio_base);
                }

                ee->hws = I915_READ(mmio);
        }

        ee->idle = intel_engine_is_idle(engine);

        ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
        ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
                                                  engine);

        if (HAS_PPGTT(dev_priv)) {
                int i;

                ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7);

                if (IS_GEN(dev_priv, 6)) {
                        ee->vm_info.pp_dir_base =
                                ENGINE_READ(engine, RING_PP_DIR_BASE_READ);
                } else if (IS_GEN(dev_priv, 7)) {
                        ee->vm_info.pp_dir_base =
                                ENGINE_READ(engine, RING_PP_DIR_BASE);
                } else if (INTEL_GEN(dev_priv) >= 8) {
                        u32 base = engine->mmio_base;

                        for (i = 0; i < 4; i++) {
                                ee->vm_info.pdp[i] =
                                        I915_READ(GEN8_RING_PDP_UDW(base, i));
                                ee->vm_info.pdp[i] <<= 32;
                                ee->vm_info.pdp[i] |=
                                        I915_READ(GEN8_RING_PDP_LDW(base, i));
                        }
                }
        }
}

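/*
 * Requests are recorded from the engine timeline: first a bare count to
 * size the array, then a second pass to fill it, tolerating the list
 * changing underneath us (see engine_record_requests()).
 */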
static void record_request(struct i915_request *request,
                           struct drm_i915_error_request *erq)
{
        struct i915_gem_context *ctx = request->gem_context;

        erq->flags = request->fence.flags;
        erq->context = request->fence.context;
        erq->seqno = request->fence.seqno;
        erq->sched_attr = request->sched.attr;
        erq->jiffies = request->emitted_jiffies;
        erq->start = i915_ggtt_offset(request->ring->vma);
        erq->head = request->head;
        erq->tail = request->tail;

        rcu_read_lock();
        erq->pid = ctx->pid ? pid_nr(ctx->pid) : 0;
        rcu_read_unlock();
}

static void engine_record_requests(struct intel_engine_cs *engine,
                                   struct i915_request *first,
                                   struct drm_i915_error_engine *ee)
{
        struct i915_request *request;
        int count;

        count = 0;
        request = first;
        list_for_each_entry_from(request, &engine->timeline.requests, link)
                count++;
        if (!count)
                return;

        ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
        if (!ee->requests)
                return;

        ee->num_requests = count;

        count = 0;
        request = first;
        list_for_each_entry_from(request, &engine->timeline.requests, link) {
                if (count >= ee->num_requests) {
                        /*
                         * If the ring request list was changed in
                         * between the point where the error request
                         * list was created and dimensioned and this
                         * point, then just exit early to avoid crashes.
                         *
                         * We don't need to communicate that the
                         * request list changed state during error
                         * state capture and that the error state is
                         * slightly incorrect as a consequence, since we
                         * are typically only interested in the request
                         * list state at the point of error state
                         * capture, not in any changes happening during
                         * the capture.
                         */
                        break;
                }

                record_request(request, &ee->requests[count++]);
        }
        ee->num_requests = count;
}

static void error_record_engine_execlists(struct intel_engine_cs *engine,
                                          struct drm_i915_error_engine *ee)
{
        const struct intel_engine_execlists * const execlists = &engine->execlists;
        unsigned int n;

        for (n = 0; n < execlists_num_ports(execlists); n++) {
                struct i915_request *rq = port_request(&execlists->port[n]);

                if (!rq)
                        break;

                record_request(rq, &ee->execlist[n]);
        }

        ee->num_ports = n;
}

static void record_context(struct drm_i915_error_context *e,
                           struct i915_gem_context *ctx)
{
        if (ctx->pid) {
                struct task_struct *task;

                rcu_read_lock();
                task = pid_task(ctx->pid, PIDTYPE_PID);
                if (task) {
                        strcpy(e->comm, task->comm);
                        e->pid = task->pid;
                }
                rcu_read_unlock();
        }

        e->hw_id = ctx->hw_id;
        e->sched_attr = ctx->sched;
        e->guilty = atomic_read(&ctx->guilty_count);
        e->active = atomic_read(&ctx->active_count);
}

static void request_record_user_bo(struct i915_request *request,
                                   struct drm_i915_error_engine *ee)
{
        struct i915_capture_list *c;
        struct drm_i915_error_object **bo;
        long count, max;

        max = 0;
        for (c = request->capture_list; c; c = c->next)
                max++;
        if (!max)
                return;

        bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
        if (!bo) {
                /* If we can't capture everything, try to capture something. */
                max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
                bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
        }
        if (!bo)
                return;

        count = 0;
        for (c = request->capture_list; c; c = c->next) {
                bo[count] = i915_error_object_create(request->i915, c->vma);
                if (!bo[count])
                        break;
                if (++count == max)
                        break;
        }

        ee->user_bo = bo;
        ee->user_bo_count = count;
}

static struct drm_i915_error_object *
capture_object(struct drm_i915_private *dev_priv,
               struct drm_i915_gem_object *obj)
{
        if (obj && i915_gem_object_has_pages(obj)) {
                struct i915_vma fake = {
                        .node = { .start = U64_MAX, .size = obj->base.size },
                        .size = obj->base.size,
                        .pages = obj->mm.pages,
                        .obj = obj,
                };

                return i915_error_object_create(dev_priv, &fake);
        } else {
                return NULL;
        }
}

static void gem_record_rings(struct i915_gpu_state *error)
{
        struct drm_i915_private *i915 = error->i915;
        struct i915_ggtt *ggtt = &i915->ggtt;
        int i;

        for (i = 0; i < I915_NUM_ENGINES; i++) {
                struct intel_engine_cs *engine = i915->engine[i];
                struct drm_i915_error_engine *ee = &error->engine[i];
                struct i915_request *request;

                ee->engine_id = -1;

                if (!engine)
                        continue;

                ee->engine_id = i;

                error_record_engine_registers(error, engine, ee);
                error_record_engine_execlists(engine, ee);

                request = intel_engine_find_active_request(engine);
                if (request) {
                        struct i915_gem_context *ctx = request->gem_context;
                        struct intel_ring *ring;

                        ee->vm = ctx->vm ?: &ggtt->vm;

                        record_context(&ee->context, ctx);

                        /* We need to copy these to an anonymous buffer
                         * as the simplest method to avoid being overwritten
                         * by userspace.
                         */
                        ee->batchbuffer =
                                i915_error_object_create(i915, request->batch);

                        if (HAS_BROKEN_CS_TLB(i915))
                                ee->wa_batchbuffer =
                                        i915_error_object_create(i915,
                                                                 i915->gt.scratch);
                        request_record_user_bo(request, ee);

                        ee->ctx =
                                i915_error_object_create(i915,
                                                         request->hw_context->state);

                        error->simulated |=
                                i915_gem_context_no_error_capture(ctx);

                        ee->rq_head = request->head;
                        ee->rq_post = request->postfix;
                        ee->rq_tail = request->tail;

                        ring = request->ring;
                        ee->cpu_ring_head = ring->head;
                        ee->cpu_ring_tail = ring->tail;
                        ee->ringbuffer =
                                i915_error_object_create(i915, ring->vma);

                        engine_record_requests(engine, request, ee);
                }

                ee->hws_page =
                        i915_error_object_create(i915,
                                                 engine->status_page.vma);

                ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma);

                ee->default_state = capture_object(i915, engine->default_state);
        }
}

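/*
 * Capture summary descriptions (not contents) of the buffer objects in
 * each active address space, plus everything bound into the global GTT.
 */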
static void gem_capture_vm(struct i915_gpu_state *error,
                           struct i915_address_space *vm,
                           int idx)
{
        struct drm_i915_error_buffer *active_bo;
        struct i915_vma *vma;
        int count;

        count = 0;
        list_for_each_entry(vma, &vm->bound_list, vm_link)
                if (i915_vma_is_active(vma))
                        count++;

        active_bo = NULL;
        if (count)
                active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
        if (active_bo)
                count = capture_error_bo(active_bo,
                                         count, &vm->bound_list,
                                         ACTIVE_ONLY);
        else
                count = 0;

        error->active_vm[idx] = vm;
        error->active_bo[idx] = active_bo;
        error->active_bo_count[idx] = count;
}

static void capture_active_buffers(struct i915_gpu_state *error)
{
        int i, j, cnt = 0;

        BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
        BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
        BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));

        /* Scan each engine looking for unique active contexts/vm */
        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                struct drm_i915_error_engine *ee = &error->engine[i];
                bool found;

                if (!ee->vm)
                        continue;

                found = false;
                for (j = 0; j < i && !found; j++)
                        found = error->engine[j].vm == ee->vm;
                if (!found)
                        gem_capture_vm(error, ee->vm, cnt++);
        }
}

static void capture_pinned_buffers(struct i915_gpu_state *error)
{
        struct i915_address_space *vm = &error->i915->ggtt.vm;
        struct drm_i915_error_buffer *bo;
        struct i915_vma *vma;
        int count;

        count = 0;
        list_for_each_entry(vma, &vm->bound_list, vm_link)
                count++;

        bo = NULL;
        if (count)
                bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
        if (!bo)
                return;

        error->pinned_bo_count =
                capture_error_bo(bo, count, &vm->bound_list, PINNED_ONLY);
        error->pinned_bo = bo;
}

static void capture_uc_state(struct i915_gpu_state *error)
{
        struct drm_i915_private *i915 = error->i915;
        struct i915_error_uc *error_uc = &error->uc;

        /* Capturing uC state won't be useful if there is no GuC */
        if (!error->device_info.has_guc)
                return;

        error_uc->guc_fw = i915->guc.fw;
        error_uc->huc_fw = i915->huc.fw;

        /* Non-default firmware paths will be specified by the modparam.
         * As modparams are generally accessible from userspace, make
         * explicit copies of the firmware paths.
         */
        error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC);
        error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC);
        error_uc->guc_log = i915_error_object_create(i915, i915->guc.log.vma);
}

/* Capture all registers which don't fit into another category. */
static void capture_reg_state(struct i915_gpu_state *error)
{
        struct drm_i915_private *i915 = error->i915;
        struct intel_uncore *uncore = &i915->uncore;
        int i;

        /* General organization
         * 1. Registers specific to a single generation
         * 2. Registers which belong to multiple generations
         * 3. Feature specific registers.
         * 4. Everything else
         * Please try to follow the order.
         */

        /* 1: Registers specific to a single generation */
        if (IS_VALLEYVIEW(i915)) {
                error->gtier[0] = intel_uncore_read(uncore, GTIER);
                error->ier = intel_uncore_read(uncore, VLV_IER);
                error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV);
        }

        if (IS_GEN(i915, 7))
                error->err_int = intel_uncore_read(uncore, GEN7_ERR_INT);

        if (INTEL_GEN(i915) >= 8) {
                error->fault_data0 = intel_uncore_read(uncore,
                                                       GEN8_FAULT_TLB_DATA0);
                error->fault_data1 = intel_uncore_read(uncore,
                                                       GEN8_FAULT_TLB_DATA1);
        }

        if (IS_GEN(i915, 6)) {
                error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE);
                error->gab_ctl = intel_uncore_read(uncore, GAB_CTL);
                error->gfx_mode = intel_uncore_read(uncore, GFX_MODE);
        }

        /* 2: Registers which belong to multiple generations */
        if (INTEL_GEN(i915) >= 7)
                error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT);

        if (INTEL_GEN(i915) >= 6) {
                error->derrmr = intel_uncore_read(uncore, DERRMR);
                error->error = intel_uncore_read(uncore, ERROR_GEN6);
                error->done_reg = intel_uncore_read(uncore, DONE_REG);
        }

        if (INTEL_GEN(i915) >= 5)
                error->ccid = intel_uncore_read(uncore, CCID(RENDER_RING_BASE));

        /* 3: Feature specific registers */
        if (IS_GEN_RANGE(i915, 6, 7)) {
                error->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK);
                error->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS);
        }

        /* 4: Everything else */
        if (INTEL_GEN(i915) >= 11) {
                error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
                error->gtier[0] =
                        intel_uncore_read(uncore,
                                          GEN11_RENDER_COPY_INTR_ENABLE);
                error->gtier[1] =
                        intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE);
                error->gtier[2] =
                        intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE);
                error->gtier[3] =
                        intel_uncore_read(uncore,
                                          GEN11_GPM_WGBOXPERF_INTR_ENABLE);
                error->gtier[4] =
                        intel_uncore_read(uncore,
                                          GEN11_CRYPTO_RSVD_INTR_ENABLE);
                error->gtier[5] =
                        intel_uncore_read(uncore,
                                          GEN11_GUNIT_CSME_INTR_ENABLE);
                error->ngtier = 6;
        } else if (INTEL_GEN(i915) >= 8) {
                error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
                for (i = 0; i < 4; i++)
                        error->gtier[i] = intel_uncore_read(uncore,
                                                            GEN8_GT_IER(i));
                error->ngtier = 4;
        } else if (HAS_PCH_SPLIT(i915)) {
                error->ier = intel_uncore_read(uncore, DEIER);
                error->gtier[0] = intel_uncore_read(uncore, GTIER);
                error->ngtier = 1;
        } else if (IS_GEN(i915, 2)) {
                error->ier = intel_uncore_read16(uncore, GEN2_IER);
        } else if (!IS_VALLEYVIEW(i915)) {
                error->ier = intel_uncore_read(uncore, GEN2_IER);
        }
        error->eir = intel_uncore_read(uncore, EIR);
        error->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER);
}

static const char *
error_msg(struct i915_gpu_state *error,
          intel_engine_mask_t engines, const char *msg)
{
        int len;
        int i;

        for (i = 0; i < ARRAY_SIZE(error->engine); i++)
                if (!error->engine[i].context.pid)
                        engines &= ~BIT(i);

        len = scnprintf(error->error_msg, sizeof(error->error_msg),
                        "GPU HANG: ecode %d:%x:0x%08x",
                        INTEL_GEN(error->i915), engines,
                        i915_error_generate_code(error, engines));
        if (engines) {
                /* Just show the first executing process, more is confusing */
                i = __ffs(engines);
                len += scnprintf(error->error_msg + len,
                                 sizeof(error->error_msg) - len,
                                 ", in %s [%d]",
                                 error->engine[i].context.comm,
                                 error->engine[i].context.pid);
        }
        if (msg)
                len += scnprintf(error->error_msg + len,
                                 sizeof(error->error_msg) - len,
                                 ", %s", msg);

        return error->error_msg;
}

static void capture_gen_state(struct i915_gpu_state *error)
{
        struct drm_i915_private *i915 = error->i915;

        error->awake = i915->gt.awake;
        error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);
        error->suspended = i915->runtime_pm.suspended;

        error->iommu = -1;
#ifdef CONFIG_INTEL_IOMMU
        error->iommu = intel_iommu_gfx_mapped;
#endif
        error->reset_count = i915_reset_count(&i915->gpu_error);
        error->suspend_count = i915->suspend_count;

        memcpy(&error->device_info,
               INTEL_INFO(i915),
               sizeof(error->device_info));
        memcpy(&error->runtime_info,
               RUNTIME_INFO(i915),
               sizeof(error->runtime_info));
        error->driver_caps = i915->caps;
}

static void capture_params(struct i915_gpu_state *error)
{
        i915_params_copy(&error->params, &i915_modparams);
}

static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
{
        unsigned long epoch = error->capture;
        int i;

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                const struct drm_i915_error_engine *ee = &error->engine[i];

                if (ee->hangcheck_timestamp &&
                    time_before(ee->hangcheck_timestamp, epoch))
                        epoch = ee->hangcheck_timestamp;
        }

        return epoch;
}

static void capture_finish(struct i915_gpu_state *error)
{
        struct i915_ggtt *ggtt = &error->i915->ggtt;
        const u64 slot = ggtt->error_capture.start;

        ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
}

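/*
 * capture() runs under stop_machine() so that the saved state is a
 * coherent snapshot: no CPU can be touching the GPU state while it
 * executes, at the cost of briefly stalling the whole machine.
 */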
static int capture(void *data)
{
        struct i915_gpu_state *error = data;

        error->time = ktime_get_real();
        error->boottime = ktime_get_boottime();
        error->uptime = ktime_sub(ktime_get(),
                                  error->i915->gt.last_init_time);
        error->capture = jiffies;

        capture_params(error);
        capture_gen_state(error);
        capture_uc_state(error);
        capture_reg_state(error);
        gem_record_fences(error);
        gem_record_rings(error);
        capture_active_buffers(error);
        capture_pinned_buffers(error);

        error->overlay = intel_overlay_capture_error_state(error->i915);
        error->display = intel_display_capture_error_state(error->i915);

        error->epoch = capture_find_epoch(error);

        capture_finish(error);

        return 0;
}

#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))

struct i915_gpu_state *
i915_capture_gpu_state(struct drm_i915_private *i915)
{
        struct i915_gpu_state *error;

        /* Check if GPU capture has been disabled */
        error = READ_ONCE(i915->gpu_error.first_error);
        if (IS_ERR(error))
                return error;

        error = kzalloc(sizeof(*error), GFP_ATOMIC);
        if (!error) {
                i915_disable_error_state(i915, -ENOMEM);
                return ERR_PTR(-ENOMEM);
        }

        kref_init(&error->ref);
        error->i915 = i915;

        stop_machine(capture, error, NULL);

        return error;
}

/**
 * i915_capture_error_state - capture an error record for later analysis
 * @i915: i915 device
 * @engine_mask: the mask of engines triggering the hang
 * @msg: a message to insert into the error capture header
 *
 * Should be called when an error is detected (either a hang or an error
 * interrupt) to capture error state from the time of the error. Fills
 * out a structure which becomes available in debugfs for user level tools
 * to pick up.
 */
void i915_capture_error_state(struct drm_i915_private *i915,
                              intel_engine_mask_t engine_mask,
                              const char *msg)
{
        static bool warned;
        struct i915_gpu_state *error;
        unsigned long flags;

        if (!i915_modparams.error_capture)
                return;

        if (READ_ONCE(i915->gpu_error.first_error))
                return;

        error = i915_capture_gpu_state(i915);
        if (IS_ERR(error))
                return;

        dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));

        if (!error->simulated) {
                spin_lock_irqsave(&i915->gpu_error.lock, flags);
                if (!i915->gpu_error.first_error) {
                        i915->gpu_error.first_error = error;
                        error = NULL;
                }
                spin_unlock_irqrestore(&i915->gpu_error.lock, flags);
        }

        if (error) {
                __i915_gpu_state_free(&error->ref);
                return;
        }

        if (!warned &&
            ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) {
                DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
                DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
                DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
                DRM_INFO("The GPU crash dump is required to analyze GPU hangs, so please always attach it.\n");
                DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
                         i915->drm.primary->index);
                warned = true;
        }
}

struct i915_gpu_state *
i915_first_error_state(struct drm_i915_private *i915)
{
        struct i915_gpu_state *error;

        spin_lock_irq(&i915->gpu_error.lock);
        error = i915->gpu_error.first_error;
        if (!IS_ERR_OR_NULL(error))
                i915_gpu_state_get(error);
        spin_unlock_irq(&i915->gpu_error.lock);

        return error;
}

void i915_reset_error_state(struct drm_i915_private *i915)
{
        struct i915_gpu_state *error;

        spin_lock_irq(&i915->gpu_error.lock);
        error = i915->gpu_error.first_error;
        if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */
                i915->gpu_error.first_error = NULL;
        spin_unlock_irq(&i915->gpu_error.lock);

        if (!IS_ERR_OR_NULL(error))
                i915_gpu_state_put(error);
}

void i915_disable_error_state(struct drm_i915_private *i915, int err)
{
        spin_lock_irq(&i915->gpu_error.lock);
        if (!i915->gpu_error.first_error)
                i915->gpu_error.first_error = ERR_PTR(err);
        spin_unlock_irq(&i915->gpu_error.lock);
}