diff --git a/drivers/gpu/drm/xe/Kconfig b/drivers/gpu/drm/xe/Kconfig index 62f54e6d62d9..0a4854a59c90 100644 --- a/drivers/gpu/drm/xe/Kconfig +++ b/drivers/gpu/drm/xe/Kconfig @@ -23,6 +23,7 @@ config DRM_XE select DRM_TTM_HELPER select DRM_SCHED select MMU_NOTIFIER + select WANT_DEV_COREDUMP help Experimental driver for Intel Xe series GPUs diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index 71c604ecff53..5d277d060eba 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -37,6 +37,7 @@ xe-y += xe_bb.o \ xe_bo.o \ xe_bo_evict.o \ xe_debugfs.o \ + xe_devcoredump.o \ xe_device.o \ xe_dma_buf.o \ xe_engine.o \ diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c new file mode 100644 index 000000000000..561db73a3c8c --- /dev/null +++ b/drivers/gpu/drm/xe/xe_devcoredump.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include "xe_devcoredump.h" +#include "xe_devcoredump_types.h" + +#include +#include + +#include "xe_engine.h" +#include "xe_gt.h" + +/** + * DOC: Xe device coredump + * + * Devices overview: + * Xe uses dev_coredump infrastructure for exposing the crash errors in a + * standardized way. + * devcoredump exposes a temporary device under /sys/class/devcoredump/ + * which is linked with our card device directly. + * The core dump can be accessed either from + * /sys/class/drm/card/device/devcoredump/ or from + * /sys/class/devcoredump/devcd where + * /sys/class/devcoredump/devcd/failing_device is a link to + * /sys/class/drm/card/device/. + * + * Snapshot at hang: + * The 'data' file is printed with a drm_printer pointer at devcoredump read + * time. For this reason, we need to take snapshots from when the hang has + * happened, and not only when the user is reading the file. Otherwise the + * information is outdated since the resets might have happened in between. + * + * 'First' failure snapshot: + * In general, the first hang is the most critical one since the following hangs + * can be a consequence of the initial hang. For this reason we only take the + * snapshot of the 'first' failure and ignore subsequent calls of this function, + * at least while the coredump device is alive. Dev_coredump has a delayed work + * queue that will eventually delete the device and free all the dump + * information. + */ + +#ifdef CONFIG_DEV_COREDUMP + +static struct xe_device *coredump_to_xe(const struct xe_devcoredump *coredump) +{ + return container_of(coredump, struct xe_device, devcoredump); +} + +static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, + size_t count, void *data, size_t datalen) +{ + struct xe_devcoredump *coredump = data; + struct xe_devcoredump_snapshot *ss; + struct drm_printer p; + struct drm_print_iterator iter; + struct timespec64 ts; + + iter.data = buffer; + iter.offset = 0; + iter.start = offset; + iter.remain = count; + + ss = &coredump->snapshot; + p = drm_coredump_printer(&iter); + + drm_printf(&p, "**** Xe Device Coredump ****\n"); + drm_printf(&p, "kernel: " UTS_RELEASE "\n"); + drm_printf(&p, "module: " KBUILD_MODNAME "\n"); + + ts = ktime_to_timespec64(ss->snapshot_time); + drm_printf(&p, "Snapshot time: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec); + ts = ktime_to_timespec64(ss->boot_time); + drm_printf(&p, "Uptime: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec); + + return count - iter.remain; +} + +static void xe_devcoredump_free(void *data) +{ + struct xe_devcoredump *coredump = data; + + coredump->captured = false; + drm_info(&coredump_to_xe(coredump)->drm, + "Xe device coredump has been deleted.\n"); +} + +static void devcoredump_snapshot(struct xe_devcoredump *coredump, + struct xe_engine *e) +{ + struct xe_devcoredump_snapshot *ss = &coredump->snapshot; + + ss->snapshot_time = ktime_get_real(); + ss->boot_time = ktime_get_boottime(); +} + +/** + * xe_devcoredump - Take the required snapshots and initialize coredump device. + * @e: The faulty xe_engine, where the issue was detected. + * + * This function should be called at the crash time within the serialized + * gt_reset. It is skipped if we still have the core dump device available + * with the information of the 'first' snapshot. + */ +void xe_devcoredump(struct xe_engine *e) +{ + struct xe_device *xe = gt_to_xe(e->gt); + struct xe_devcoredump *coredump = &xe->devcoredump; + + if (coredump->captured) { + drm_dbg(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n"); + return; + } + + coredump->captured = true; + devcoredump_snapshot(coredump, e); + + drm_info(&xe->drm, "Xe device coredump has been created\n"); + drm_info(&xe->drm, "Check your /sys/class/drm/card%d/device/devcoredump/data\n", + xe->drm.primary->index); + + dev_coredumpm(xe->drm.dev, THIS_MODULE, coredump, 0, GFP_KERNEL, + xe_devcoredump_read, xe_devcoredump_free); +} +#endif diff --git a/drivers/gpu/drm/xe/xe_devcoredump.h b/drivers/gpu/drm/xe/xe_devcoredump.h new file mode 100644 index 000000000000..854882129227 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_devcoredump.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_DEVCOREDUMP_H_ +#define _XE_DEVCOREDUMP_H_ + +struct xe_device; +struct xe_engine; + +#ifdef CONFIG_DEV_COREDUMP +void xe_devcoredump(struct xe_engine *e); +#else +static inline void xe_devcoredump(struct xe_engine *e) +{ +} +#endif + +#endif diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h new file mode 100644 index 000000000000..52bd27ca1036 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_DEVCOREDUMP_TYPES_H_ +#define _XE_DEVCOREDUMP_TYPES_H_ + +#include +#include + +struct xe_device; + +/** + * struct xe_devcoredump_snapshot - Crash snapshot + * + * This struct contains all the useful information quickly captured at the time + * of the crash. So, any subsequent reads of the coredump points to a data that + * shows the state of the GPU of when the issue has happened. + */ +struct xe_devcoredump_snapshot { + /** @snapshot_time: Time of this capture. */ + ktime_t snapshot_time; + /** @boot_time: Relative boot time so the uptime can be calculated. */ + ktime_t boot_time; +}; + +/** + * struct xe_devcoredump - Xe devcoredump main structure + * + * This struct represents the live and active dev_coredump node. + * It is created/populated at the time of a crash/error. Then it + * is read later when user access the device coredump data file + * for reading the information. + */ +struct xe_devcoredump { + /** @xe: Xe device. */ + struct xe_device *xe; + /** @captured: The snapshot of the first hang has already been taken. */ + bool captured; + /** @snapshot: Snapshot is captured at time of the first crash */ + struct xe_devcoredump_snapshot snapshot; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index f3cf5a4e5ab2..91edbe4a3730 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -12,6 +12,7 @@ #include #include +#include "xe_devcoredump_types.h" #include "xe_gt_types.h" #include "xe_platform_types.h" #include "xe_step_types.h" @@ -49,6 +50,9 @@ struct xe_device { /** @drm: drm device */ struct drm_device drm; + /** @devcoredump: device coredump */ + struct xe_devcoredump devcoredump; + /** @info: device info */ struct intel_device_info { /** @graphics_name: graphics IP name */ diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index d0b48c885fda..55b51ff791b8 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -14,6 +14,7 @@ #include #include "regs/xe_lrc_layout.h" +#include "xe_devcoredump.h" #include "xe_device.h" #include "xe_engine.h" #include "xe_force_wake.h" @@ -800,6 +801,7 @@ guc_engine_timedout_job(struct drm_sched_job *drm_job) drm_notice(&xe->drm, "Timedout job: seqno=%u, guc_id=%d, flags=0x%lx", xe_sched_job_seqno(job), e->guc->id, e->flags); simple_error_capture(e); + xe_devcoredump(e); } else { drm_dbg(&xe->drm, "Timedout signaled job: seqno=%u, guc_id=%d, flags=0x%lx", xe_sched_job_seqno(job), e->guc->id, e->flags);