mirror of
https://github.com/torvalds/linux.git
synced 2026-06-03 12:03:54 +02:00
habanalabs: reset device in case of sync error
As the F/W is the first to detect an out-of-sync event, a new event is added to notify the driver of such an event, in which case the driver performs a hard reset. Signed-off-by: Ohad Sharabi <osharabi@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
parent
17b59dd339
commit
5d6a198f9d
|
|
@ -7097,6 +7097,15 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
|
|||
}
|
||||
}
|
||||
|
||||
static void gaudi_print_out_of_sync_info(struct hl_device *hdev,
|
||||
struct cpucp_pkt_sync_err *sync_err)
|
||||
{
|
||||
struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI_QUEUE_ID_CPU_PQ];
|
||||
|
||||
dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n",
|
||||
sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci));
|
||||
}
|
||||
|
||||
static int gaudi_soft_reset_late_init(struct hl_device *hdev)
|
||||
{
|
||||
struct gaudi_device *gaudi = hdev->asic_specific;
|
||||
|
|
@ -7552,6 +7561,15 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
|
|||
event_type, cause);
|
||||
break;
|
||||
|
||||
case GAUDI_EVENT_PKT_QUEUE_OUT_SYNC:
|
||||
gaudi_print_irq_info(hdev, event_type, false);
|
||||
gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
|
||||
if (hdev->hard_reset_on_fw_events)
|
||||
hl_device_reset(hdev, true, false);
|
||||
else
|
||||
hl_fw_unmask_irq(hdev, event_type);
|
||||
break;
|
||||
|
||||
default:
|
||||
dev_err(hdev->dev, "Received invalid H/W interrupt %d\n",
|
||||
event_type);
|
||||
|
|
|
|||
|
|
@ -4401,6 +4401,8 @@ static const char *_goya_get_event_desc(u16 event_type)
|
|||
return "THERMAL_ENV_S";
|
||||
case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E:
|
||||
return "THERMAL_ENV_E";
|
||||
case GOYA_ASYNC_EVENT_PKT_QUEUE_OUT_SYNC:
|
||||
return "QUEUE_OUT_OF_SYNC";
|
||||
default:
|
||||
return "N/A";
|
||||
}
|
||||
|
|
@ -4483,6 +4485,9 @@ static void goya_get_event_desc(u16 event_type, char *desc, size_t size)
|
|||
index = event_type - GOYA_ASYNC_EVENT_ID_DMA_BM_CH0;
|
||||
snprintf(desc, size, _goya_get_event_desc(event_type), index);
|
||||
break;
|
||||
case GOYA_ASYNC_EVENT_PKT_QUEUE_OUT_SYNC:
|
||||
snprintf(desc, size, _goya_get_event_desc(event_type));
|
||||
break;
|
||||
default:
|
||||
snprintf(desc, size, _goya_get_event_desc(event_type));
|
||||
break;
|
||||
|
|
@ -4534,6 +4539,15 @@ static void goya_print_mmu_error_info(struct hl_device *hdev)
|
|||
}
|
||||
}
|
||||
|
||||
static void goya_print_out_of_sync_info(struct hl_device *hdev,
|
||||
struct cpucp_pkt_sync_err *sync_err)
|
||||
{
|
||||
struct hl_hw_queue *q = &hdev->kernel_queues[GOYA_QUEUE_ID_CPU_PQ];
|
||||
|
||||
dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n",
|
||||
sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci));
|
||||
}
|
||||
|
||||
static void goya_print_irq_info(struct hl_device *hdev, u16 event_type,
|
||||
bool razwi)
|
||||
{
|
||||
|
|
@ -4754,6 +4768,15 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
|
|||
goya_unmask_irq(hdev, event_type);
|
||||
break;
|
||||
|
||||
case GOYA_ASYNC_EVENT_PKT_QUEUE_OUT_SYNC:
|
||||
goya_print_irq_info(hdev, event_type, false);
|
||||
goya_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
|
||||
if (hdev->hard_reset_on_fw_events)
|
||||
hl_device_reset(hdev, true, false);
|
||||
else
|
||||
hl_fw_unmask_irq(hdev, event_type);
|
||||
break;
|
||||
|
||||
default:
|
||||
dev_err(hdev->dev, "Received invalid H/W interrupt %d\n",
|
||||
event_type);
|
||||
|
|
|
|||
|
|
@ -28,6 +28,14 @@
|
|||
#define CPUCP_PKT_HBM_ECC_INFO_HBM_CH_SHIFT 6
|
||||
#define CPUCP_PKT_HBM_ECC_INFO_HBM_CH_MASK 0x000007C0
|
||||
|
||||
/*
|
||||
* info of the pkt queue pointers at the first out-of-sync occurrence
|
||||
*/
|
||||
struct cpucp_pkt_sync_err {
|
||||
__le32 pi; /* F/W-side pi, little-endian; presumably the queue producer index - confirm */
|
||||
__le32 ci; /* F/W-side ci, little-endian; presumably the queue consumer index - confirm */
|
||||
};
|
||||
|
||||
struct hl_eq_hbm_ecc_data {
|
||||
/* SERR counter */
|
||||
__le32 sec_cnt;
|
||||
|
|
@ -77,6 +85,7 @@ struct hl_eq_entry {
|
|||
struct hl_eq_ecc_data ecc_data;
|
||||
struct hl_eq_hbm_ecc_data hbm_ecc_data;
|
||||
struct hl_eq_sm_sei_data sm_sei_data;
|
||||
struct cpucp_pkt_sync_err pkt_sync_err;
|
||||
__le64 data[7];
|
||||
};
|
||||
};
|
||||
|
|
|
|||
|
|
@ -303,6 +303,7 @@ enum gaudi_async_event_id {
|
|||
GAUDI_EVENT_NIC3_QP1 = 619,
|
||||
GAUDI_EVENT_NIC4_QP0 = 620,
|
||||
GAUDI_EVENT_NIC4_QP1 = 621,
|
||||
GAUDI_EVENT_PKT_QUEUE_OUT_SYNC = 647,
|
||||
GAUDI_EVENT_FIX_POWER_ENV_S = 658,
|
||||
GAUDI_EVENT_FIX_POWER_ENV_E = 659,
|
||||
GAUDI_EVENT_FIX_THERMAL_ENV_S = 660,
|
||||
|
|
|
|||
|
|
@ -188,6 +188,7 @@ enum goya_async_event_id {
|
|||
GOYA_ASYNC_EVENT_ID_HALT_MACHINE = 485,
|
||||
GOYA_ASYNC_EVENT_ID_INTS_REGISTER = 486,
|
||||
GOYA_ASYNC_EVENT_ID_SOFT_RESET = 487,
|
||||
GOYA_ASYNC_EVENT_PKT_QUEUE_OUT_SYNC = 506,
|
||||
GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_S = 507,
|
||||
GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_E = 508,
|
||||
GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_S = 509,
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user