habanalabs/gaudi2: add device unavailable notification
The device-unavailable notification informs the user that debug information cannot be retrieved from the device. When a critical device error occurs and the f/w performs the device reset, a device-unavailable notification shall be sent to the user process.
Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
parent
16448d6444
commit
5731b6e6f0
@ -8576,7 +8576,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
|
|||||||
{
|
{
|
||||||
u32 ctl, reset_flags = HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY;
|
u32 ctl, reset_flags = HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY;
|
||||||
struct gaudi2_device *gaudi2 = hdev->asic_specific;
|
struct gaudi2_device *gaudi2 = hdev->asic_specific;
|
||||||
bool reset_required = false, skip_reset = false;
|
bool reset_required = false, skip_reset = false, is_critical = false;
|
||||||
int index, sbte_index;
|
int index, sbte_index;
|
||||||
u64 event_mask = 0;
|
u64 event_mask = 0;
|
||||||
u16 event_type;
|
u16 event_type;
|
||||||
@ -8602,6 +8602,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
|
|||||||
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
|
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
|
||||||
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
|
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
|
||||||
reset_required = gaudi2_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
|
reset_required = gaudi2_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
|
||||||
|
is_critical = eq_entry->ecc_data.is_critical;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case GAUDI2_EVENT_TPC0_QM ... GAUDI2_EVENT_PDMA1_QM:
|
case GAUDI2_EVENT_TPC0_QM ... GAUDI2_EVENT_PDMA1_QM:
|
||||||
@ -8976,9 +8977,16 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
reset_device:
|
reset_device:
|
||||||
if (hdev->hard_reset_on_fw_events) {
|
if (hdev->asic_prop.fw_security_enabled && is_critical) {
|
||||||
|
reset_flags = HL_DRV_RESET_HARD | HL_DRV_RESET_BYPASS_REQ_TO_FW;
|
||||||
|
|
||||||
|
/* notify that the device is unavailable while the reset is triggered by the f/w */
|
||||||
|
event_mask |= (HL_NOTIFIER_EVENT_DEVICE_RESET |
|
||||||
|
HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE);
|
||||||
hl_device_reset(hdev, reset_flags);
|
hl_device_reset(hdev, reset_flags);
|
||||||
|
} else if (hdev->hard_reset_on_fw_events) {
|
||||||
event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
|
event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
|
||||||
|
hl_device_reset(hdev, reset_flags);
|
||||||
} else {
|
} else {
|
||||||
if (!gaudi2_irq_map_table[event_type].msg)
|
if (!gaudi2_irq_map_table[event_type].msg)
|
||||||
hl_fw_unmask_irq(hdev, event_type);
|
hl_fw_unmask_irq(hdev, event_type);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user