accel/habanalabs/gaudi2: add eq health check using irq

This is the second patch for applying the eq health check mechanism
which will add support for the interrupt flow for gaudi2 asic.

More info about the interrupt mechanism:
set a dedicated msix for the eq error interrupt, and add
interrupt handler for it.
when FW detects some issue with EQ like EQ_FULL, it'll
raise that interrupt and driver should reset the device.
Driver will inform the FW which msix index to use through
the already existing handshake mechanism which will
send msix info message to fw.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
farah kassabri 2023-08-23 12:36:25 +03:00 committed by Oded Gabbay
parent 7c4130e6dd
commit 764bfd138f
5 changed files with 31 additions and 0 deletions

View file

@ -3689,6 +3689,7 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg);
irqreturn_t hl_irq_handler_dec_abnrm(int irq, void *arg);
irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg);
irqreturn_t hl_irq_user_interrupt_thread_handler(int irq, void *arg);
irqreturn_t hl_irq_eq_error_interrupt_thread_handler(int irq, void *arg);
u32 hl_cq_inc_ptr(u32 ptr);
int hl_asid_init(struct hl_device *hdev);

View file

@ -401,6 +401,18 @@ irqreturn_t hl_irq_user_interrupt_thread_handler(int irq, void *arg)
return IRQ_HANDLED;
}
irqreturn_t hl_irq_eq_error_interrupt_thread_handler(int irq, void *arg)
{
u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
struct hl_device *hdev = arg;
dev_err(hdev->dev, "EQ error interrupt received\n");
hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, event_mask);
return IRQ_HANDLED;
}
/**
* hl_irq_handler_eq - irq handler for event queue
*

View file

@ -4175,6 +4175,8 @@ static const char *gaudi2_irq_name(u16 irq_number)
return "gaudi2 unexpected error";
case GAUDI2_IRQ_NUM_USER_FIRST ... GAUDI2_IRQ_NUM_USER_LAST:
return "gaudi2 user completion";
case GAUDI2_IRQ_NUM_EQ_ERROR:
return "gaudi2 eq error";
default:
return "invalid";
}
@ -4317,6 +4319,15 @@ static int gaudi2_enable_msix(struct hl_device *hdev)
}
}
irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EQ_ERROR);
rc = request_threaded_irq(irq, NULL, hl_irq_eq_error_interrupt_thread_handler,
IRQF_ONESHOT, gaudi2_irq_name(GAUDI2_IRQ_NUM_EQ_ERROR),
hdev);
if (rc) {
dev_err(hdev->dev, "Failed to request IRQ %d", irq);
goto free_user_irq;
}
gaudi2->hw_cap_initialized |= HW_CAP_MSIX;
return 0;
@ -4376,6 +4387,7 @@ static void gaudi2_sync_irqs(struct hl_device *hdev)
}
synchronize_irq(pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EVENT_QUEUE));
synchronize_irq(pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EQ_ERROR));
}
static void gaudi2_disable_msix(struct hl_device *hdev)
@ -4412,6 +4424,9 @@ static void gaudi2_disable_msix(struct hl_device *hdev)
cq = &hdev->completion_queue[GAUDI2_RESERVED_CQ_CS_COMPLETION];
free_irq(irq, cq);
irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EQ_ERROR);
free_irq(irq, hdev);
pci_free_irq_vectors(hdev->pdev);
gaudi2->hw_cap_initialized &= ~HW_CAP_MSIX;
@ -11345,6 +11360,7 @@ static int gaudi2_ack_mmu_page_fault_or_access_error(struct hl_device *hdev, u64
static void gaudi2_get_msi_info(__le32 *table)
{
table[CPUCP_EVENT_QUEUE_MSI_TYPE] = cpu_to_le32(GAUDI2_EVENT_QUEUE_MSIX_IDX);
table[CPUCP_EVENT_QUEUE_ERR_MSI_TYPE] = cpu_to_le32(GAUDI2_IRQ_NUM_EQ_ERROR);
}
static int gaudi2_map_pll_idx_to_fw_idx(u32 pll_idx)

View file

@ -419,6 +419,7 @@ enum gaudi2_irq_num {
GAUDI2_IRQ_NUM_NIC_PORT_FIRST,
GAUDI2_IRQ_NUM_NIC_PORT_LAST = (GAUDI2_IRQ_NUM_NIC_PORT_FIRST + NIC_NUMBER_OF_PORTS - 1),
GAUDI2_IRQ_NUM_TPC_ASSERT,
GAUDI2_IRQ_NUM_EQ_ERROR,
GAUDI2_IRQ_NUM_RESERVED_FIRST,
GAUDI2_IRQ_NUM_RESERVED_LAST = (GAUDI2_MSIX_ENTRIES - GAUDI2_TOTAL_USER_INTERRUPTS - 1),
GAUDI2_IRQ_NUM_UNEXPECTED_ERROR = RESERVED_MSIX_UNEXPECTED_USER_ERROR_INTERRUPT,

View file

@ -1004,6 +1004,7 @@ enum cpucp_msi_type {
CPUCP_NIC_PORT5_MSI_TYPE,
CPUCP_NIC_PORT7_MSI_TYPE,
CPUCP_NIC_PORT9_MSI_TYPE,
CPUCP_EVENT_QUEUE_ERR_MSI_TYPE,
CPUCP_NUM_OF_MSI_TYPES
};