drm/amdgpu: add RAS POISON_CONSUMPTION event id support
Add amdgpu RAS POISON_CONSUMPTION event id support.
Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
91ba536ead
commit
12b435a40c
@ -2076,10 +2076,17 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
|
||||
struct amdgpu_ras_block_object *block_obj =
|
||||
amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
|
||||
u64 event_id;
|
||||
int ret;
|
||||
|
||||
if (!block_obj || !con)
|
||||
return;
|
||||
|
||||
ret = amdgpu_ras_mark_ras_event(adev, type);
|
||||
if (ret)
|
||||
return;
|
||||
|
||||
/* both query_poison_status and handle_poison_consumption are optional,
|
||||
* but at least one of them should be implemented if we need poison
|
||||
* consumption handler
|
||||
@ -2104,8 +2111,10 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
|
||||
* For RMA case, amdgpu_umc_poison_handler will handle gpu reset.
|
||||
*/
|
||||
if (poison_stat && !con->is_rma) {
|
||||
dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
|
||||
block_obj->ras_comm.name);
|
||||
event_id = amdgpu_ras_acquire_event_id(adev, type);
|
||||
RAS_EVENT_LOG(adev, event_id,
|
||||
"GPU reset for %s RAS poison consumption is issued!\n",
|
||||
block_obj->ras_comm.name);
|
||||
amdgpu_ras_reset_gpu(adev);
|
||||
}
|
||||
|
||||
@ -2498,7 +2507,7 @@ static enum ras_event_type amdgpu_ras_get_fatal_error_event(struct amdgpu_device
|
||||
if (amdgpu_ras_intr_triggered())
|
||||
return RAS_EVENT_TYPE_FATAL;
|
||||
else
|
||||
return RAS_EVENT_TYPE_INVALID;
|
||||
return RAS_EVENT_TYPE_POISON_CONSUMPTION;
|
||||
}
|
||||
|
||||
static void amdgpu_ras_do_recovery(struct work_struct *work)
|
||||
@ -3986,6 +3995,7 @@ u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type
|
||||
switch (type) {
|
||||
case RAS_EVENT_TYPE_FATAL:
|
||||
case RAS_EVENT_TYPE_POISON_CREATION:
|
||||
case RAS_EVENT_TYPE_POISON_CONSUMPTION:
|
||||
event_mgr = __get_ras_event_mgr(adev);
|
||||
if (!event_mgr)
|
||||
return RAS_EVENT_INVALID_ID;
|
||||
|
@ -436,6 +436,7 @@ enum ras_event_type {
|
||||
RAS_EVENT_TYPE_INVALID = 0,
|
||||
RAS_EVENT_TYPE_FATAL,
|
||||
RAS_EVENT_TYPE_POISON_CREATION,
|
||||
RAS_EVENT_TYPE_POISON_CONSUMPTION,
|
||||
RAS_EVENT_TYPE_COUNT,
|
||||
};
|
||||
|
||||
|
@ -27,6 +27,7 @@
|
||||
#include "soc15_int.h"
|
||||
#include "kfd_device_queue_manager.h"
|
||||
#include "kfd_smi_events.h"
|
||||
#include "amdgpu_ras.h"
|
||||
|
||||
/*
|
||||
* GFX9 SQ Interrupts
|
||||
@ -144,9 +145,11 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
|
||||
uint16_t pasid, uint16_t client_id)
|
||||
{
|
||||
enum amdgpu_ras_block block = 0;
|
||||
int old_poison;
|
||||
uint32_t reset = 0;
|
||||
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
|
||||
enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
|
||||
u64 event_id;
|
||||
int old_poison, ret;
|
||||
|
||||
if (!p)
|
||||
return;
|
||||
@ -193,10 +196,16 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
|
||||
return;
|
||||
}
|
||||
|
||||
ret = amdgpu_ras_mark_ras_event(dev->adev, type);
|
||||
if (ret)
|
||||
return;
|
||||
|
||||
kfd_signal_poison_consumed_event(dev, pasid);
|
||||
|
||||
dev_warn(dev->adev->dev,
|
||||
"poison is consumed by client %d, kick off gpu reset flow\n", client_id);
|
||||
event_id = amdgpu_ras_acquire_event_id(dev->adev, type);
|
||||
|
||||
RAS_EVENT_LOG(dev->adev, event_id,
|
||||
"poison is consumed by client %d, kick off gpu reset flow\n", client_id);
|
||||
|
||||
amdgpu_amdkfd_ras_pasid_poison_consumption_handler(dev->adev,
|
||||
block, pasid, NULL, NULL, reset);
|
||||
|
Loading…
x
Reference in New Issue
Block a user