mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2024-09-19 09:04:57 +00:00
habanalabs: add information about PCIe controller
Update firmware header with new API for getting PCIe info such as tx/rx throughput and replay counter. These counters are needed by customers for monitoring and maintenance of multiple devices. Add new opcodes to the INFO ioctl to retrieve these counters. Signed-off-by: Ofir Bitton <obitton@habana.ai> Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com> Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
This commit is contained in:
parent
a98d73c7fa
commit
0a068adde5
7 changed files with 138 additions and 0 deletions
|
@ -363,6 +363,54 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int hl_fw_armcp_pci_counters_get(struct hl_device *hdev,
|
||||||
|
struct hl_info_pci_counters *counters)
|
||||||
|
{
|
||||||
|
struct armcp_packet pkt = {};
|
||||||
|
long result;
|
||||||
|
int rc;
|
||||||
|
|
||||||
|
pkt.ctl = cpu_to_le32(ARMCP_PACKET_PCIE_THROUGHPUT_GET <<
|
||||||
|
ARMCP_PKT_CTL_OPCODE_SHIFT);
|
||||||
|
|
||||||
|
/* Fetch PCI rx counter */
|
||||||
|
pkt.index = cpu_to_le32(armcp_pcie_throughput_rx);
|
||||||
|
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||||
|
HL_ARMCP_INFO_TIMEOUT_USEC, &result);
|
||||||
|
if (rc) {
|
||||||
|
dev_err(hdev->dev,
|
||||||
|
"Failed to handle ArmCP PCI info pkt, error %d\n", rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
counters->rx_throughput = result;
|
||||||
|
|
||||||
|
/* Fetch PCI tx counter */
|
||||||
|
pkt.index = cpu_to_le32(armcp_pcie_throughput_tx);
|
||||||
|
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||||
|
HL_ARMCP_INFO_TIMEOUT_USEC, &result);
|
||||||
|
if (rc) {
|
||||||
|
dev_err(hdev->dev,
|
||||||
|
"Failed to handle ArmCP PCI info pkt, error %d\n", rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
counters->tx_throughput = result;
|
||||||
|
|
||||||
|
/* Fetch PCI replay counter */
|
||||||
|
pkt.ctl = cpu_to_le32(ARMCP_PACKET_PCIE_REPLAY_CNT_GET <<
|
||||||
|
ARMCP_PKT_CTL_OPCODE_SHIFT);
|
||||||
|
|
||||||
|
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
|
||||||
|
HL_ARMCP_INFO_TIMEOUT_USEC, &result);
|
||||||
|
if (rc) {
|
||||||
|
dev_err(hdev->dev,
|
||||||
|
"Failed to handle ArmCP PCI info pkt, error %d\n", rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
counters->replay_cnt = (u32) result;
|
||||||
|
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg)
|
static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg)
|
||||||
{
|
{
|
||||||
u32 err_val;
|
u32 err_val;
|
||||||
|
|
|
@ -1483,6 +1483,7 @@ struct hl_device_idle_busy_ts {
|
||||||
* @soft_reset_cnt: number of soft reset since the driver was loaded.
|
* @soft_reset_cnt: number of soft reset since the driver was loaded.
|
||||||
* @hard_reset_cnt: number of hard reset since the driver was loaded.
|
* @hard_reset_cnt: number of hard reset since the driver was loaded.
|
||||||
* @idle_busy_ts_idx: index of current entry in idle_busy_ts_arr
|
* @idle_busy_ts_idx: index of current entry in idle_busy_ts_arr
|
||||||
|
* @clk_throttling_reason: bitmask represents the current clk throttling reasons
|
||||||
* @id: device minor.
|
* @id: device minor.
|
||||||
* @id_control: minor of the control device
|
* @id_control: minor of the control device
|
||||||
* @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
|
* @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
|
||||||
|
@ -1587,6 +1588,7 @@ struct hl_device {
|
||||||
u32 soft_reset_cnt;
|
u32 soft_reset_cnt;
|
||||||
u32 hard_reset_cnt;
|
u32 hard_reset_cnt;
|
||||||
u32 idle_busy_ts_idx;
|
u32 idle_busy_ts_idx;
|
||||||
|
u32 clk_throttling_reason;
|
||||||
u16 id;
|
u16 id;
|
||||||
u16 id_control;
|
u16 id_control;
|
||||||
u16 cpu_pci_msb_addr;
|
u16 cpu_pci_msb_addr;
|
||||||
|
@ -1841,6 +1843,8 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
|
||||||
int hl_fw_send_heartbeat(struct hl_device *hdev);
|
int hl_fw_send_heartbeat(struct hl_device *hdev);
|
||||||
int hl_fw_armcp_info_get(struct hl_device *hdev);
|
int hl_fw_armcp_info_get(struct hl_device *hdev);
|
||||||
int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size);
|
int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size);
|
||||||
|
int hl_fw_armcp_pci_counters_get(struct hl_device *hdev,
|
||||||
|
struct hl_info_pci_counters *counters);
|
||||||
int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
|
int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
|
||||||
u32 msg_to_cpu_reg, u32 cpu_msg_status_reg,
|
u32 msg_to_cpu_reg, u32 cpu_msg_status_reg,
|
||||||
u32 boot_err0_reg, bool skip_bmc,
|
u32 boot_err0_reg, bool skip_bmc,
|
||||||
|
|
|
@ -276,6 +276,41 @@ static int time_sync_info(struct hl_device *hdev, struct hl_info_args *args)
|
||||||
min((size_t) max_size, sizeof(time_sync))) ? -EFAULT : 0;
|
min((size_t) max_size, sizeof(time_sync))) ? -EFAULT : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int pci_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
|
||||||
|
{
|
||||||
|
struct hl_device *hdev = hpriv->hdev;
|
||||||
|
struct hl_info_pci_counters pci_counters = {0};
|
||||||
|
u32 max_size = args->return_size;
|
||||||
|
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
|
||||||
|
int rc;
|
||||||
|
|
||||||
|
if ((!max_size) || (!out))
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
|
rc = hl_fw_armcp_pci_counters_get(hdev, &pci_counters);
|
||||||
|
if (rc)
|
||||||
|
return rc;
|
||||||
|
|
||||||
|
return copy_to_user(out, &pci_counters,
|
||||||
|
min((size_t) max_size, sizeof(pci_counters))) ? -EFAULT : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int clk_throttle_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
|
||||||
|
{
|
||||||
|
struct hl_device *hdev = hpriv->hdev;
|
||||||
|
struct hl_info_clk_throttle clk_throttle = {0};
|
||||||
|
u32 max_size = args->return_size;
|
||||||
|
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
|
||||||
|
|
||||||
|
if ((!max_size) || (!out))
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
|
clk_throttle.clk_throttling_reason = hdev->clk_throttling_reason;
|
||||||
|
|
||||||
|
return copy_to_user(out, &clk_throttle,
|
||||||
|
min((size_t) max_size, sizeof(clk_throttle))) ? -EFAULT : 0;
|
||||||
|
}
|
||||||
|
|
||||||
static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
|
static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
|
||||||
{
|
{
|
||||||
struct hl_device *hdev = hpriv->hdev;
|
struct hl_device *hdev = hpriv->hdev;
|
||||||
|
@ -360,6 +395,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
|
||||||
case HL_INFO_CS_COUNTERS:
|
case HL_INFO_CS_COUNTERS:
|
||||||
return cs_counters_info(hpriv, args);
|
return cs_counters_info(hpriv, args);
|
||||||
|
|
||||||
|
case HL_INFO_PCI_COUNTERS:
|
||||||
|
return pci_counters_info(hpriv, args);
|
||||||
|
|
||||||
|
case HL_INFO_CLK_THROTTLE_REASON:
|
||||||
|
return clk_throttle_info(hpriv, args);
|
||||||
|
|
||||||
default:
|
default:
|
||||||
dev_err(dev, "Invalid request %d\n", args->op);
|
dev_err(dev, "Invalid request %d\n", args->op);
|
||||||
rc = -ENOTTY;
|
rc = -ENOTTY;
|
||||||
|
|
|
@ -5653,21 +5653,25 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev,
|
||||||
{
|
{
|
||||||
switch (event_type) {
|
switch (event_type) {
|
||||||
case GAUDI_EVENT_FIX_POWER_ENV_S:
|
case GAUDI_EVENT_FIX_POWER_ENV_S:
|
||||||
|
hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
|
||||||
dev_info_ratelimited(hdev->dev,
|
dev_info_ratelimited(hdev->dev,
|
||||||
"Clock throttling due to power consumption\n");
|
"Clock throttling due to power consumption\n");
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case GAUDI_EVENT_FIX_POWER_ENV_E:
|
case GAUDI_EVENT_FIX_POWER_ENV_E:
|
||||||
|
hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
|
||||||
dev_info_ratelimited(hdev->dev,
|
dev_info_ratelimited(hdev->dev,
|
||||||
"Power envelop is safe, back to optimal clock\n");
|
"Power envelop is safe, back to optimal clock\n");
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case GAUDI_EVENT_FIX_THERMAL_ENV_S:
|
case GAUDI_EVENT_FIX_THERMAL_ENV_S:
|
||||||
|
hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
|
||||||
dev_info_ratelimited(hdev->dev,
|
dev_info_ratelimited(hdev->dev,
|
||||||
"Clock throttling due to overheating\n");
|
"Clock throttling due to overheating\n");
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case GAUDI_EVENT_FIX_THERMAL_ENV_E:
|
case GAUDI_EVENT_FIX_THERMAL_ENV_E:
|
||||||
|
hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
|
||||||
dev_info_ratelimited(hdev->dev,
|
dev_info_ratelimited(hdev->dev,
|
||||||
"Thermal envelop is safe, back to optimal clock\n");
|
"Thermal envelop is safe, back to optimal clock\n");
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -4580,18 +4580,22 @@ static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type)
|
||||||
{
|
{
|
||||||
switch (event_type) {
|
switch (event_type) {
|
||||||
case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_S:
|
case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_S:
|
||||||
|
hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
|
||||||
dev_info_ratelimited(hdev->dev,
|
dev_info_ratelimited(hdev->dev,
|
||||||
"Clock throttling due to power consumption\n");
|
"Clock throttling due to power consumption\n");
|
||||||
break;
|
break;
|
||||||
case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_E:
|
case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_E:
|
||||||
|
hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
|
||||||
dev_info_ratelimited(hdev->dev,
|
dev_info_ratelimited(hdev->dev,
|
||||||
"Power envelop is safe, back to optimal clock\n");
|
"Power envelop is safe, back to optimal clock\n");
|
||||||
break;
|
break;
|
||||||
case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_S:
|
case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_S:
|
||||||
|
hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
|
||||||
dev_info_ratelimited(hdev->dev,
|
dev_info_ratelimited(hdev->dev,
|
||||||
"Clock throttling due to overheating\n");
|
"Clock throttling due to overheating\n");
|
||||||
break;
|
break;
|
||||||
case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E:
|
case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E:
|
||||||
|
hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
|
||||||
dev_info_ratelimited(hdev->dev,
|
dev_info_ratelimited(hdev->dev,
|
||||||
"Thermal envelop is safe, back to optimal clock\n");
|
"Thermal envelop is safe, back to optimal clock\n");
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -243,6 +243,8 @@ enum armcp_packet_id {
|
||||||
ARMCP_PACKET_TEMPERATURE_SET, /* sysfs */
|
ARMCP_PACKET_TEMPERATURE_SET, /* sysfs */
|
||||||
ARMCP_PACKET_VOLTAGE_SET, /* sysfs */
|
ARMCP_PACKET_VOLTAGE_SET, /* sysfs */
|
||||||
ARMCP_PACKET_CURRENT_SET, /* sysfs */
|
ARMCP_PACKET_CURRENT_SET, /* sysfs */
|
||||||
|
ARMCP_PACKET_PCIE_THROUGHPUT_GET, /* internal */
|
||||||
|
ARMCP_PACKET_PCIE_REPLAY_CNT_GET, /* internal */
|
||||||
};
|
};
|
||||||
|
|
||||||
#define ARMCP_PACKET_FENCE_VAL 0xFE8CE7A5
|
#define ARMCP_PACKET_FENCE_VAL 0xFE8CE7A5
|
||||||
|
@ -277,6 +279,9 @@ struct armcp_packet {
|
||||||
__u8 pad; /* unused */
|
__u8 pad; /* unused */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* For any general request */
|
||||||
|
__le32 index;
|
||||||
|
|
||||||
/* For frequency get/set */
|
/* For frequency get/set */
|
||||||
__le32 pll_index;
|
__le32 pll_index;
|
||||||
|
|
||||||
|
@ -344,6 +349,11 @@ enum armcp_pwm_attributes {
|
||||||
armcp_pwm_enable
|
armcp_pwm_enable
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
 * Direction selector for PCIe throughput requests.
 * hl_fw_armcp_pci_counters_get() stores one of these values in the packet's
 * index field when sending ARMCP_PACKET_PCIE_THROUGHPUT_GET.
 */
enum armcp_pcie_throughput_attributes {
|
||||||
|
armcp_pcie_throughput_tx,
|
||||||
|
armcp_pcie_throughput_rx
|
||||||
|
};
|
||||||
|
|
||||||
/* Event Queue Packets */
|
/* Event Queue Packets */
|
||||||
|
|
||||||
struct eq_generic_event {
|
struct eq_generic_event {
|
||||||
|
|
|
@ -264,6 +264,8 @@ enum hl_device_status {
|
||||||
* HL_INFO_TIME_SYNC - Retrieve the device's time alongside the host's time
|
* HL_INFO_TIME_SYNC - Retrieve the device's time alongside the host's time
|
||||||
* for synchronization.
|
* for synchronization.
|
||||||
* HL_INFO_CS_COUNTERS - Retrieve command submission counters
|
* HL_INFO_CS_COUNTERS - Retrieve command submission counters
|
||||||
|
* HL_INFO_PCI_COUNTERS - Retrieve PCI counters
|
||||||
|
* HL_INFO_CLK_THROTTLE_REASON - Retrieve clock throttling reason
|
||||||
*/
|
*/
|
||||||
#define HL_INFO_HW_IP_INFO 0
|
#define HL_INFO_HW_IP_INFO 0
|
||||||
#define HL_INFO_HW_EVENTS 1
|
#define HL_INFO_HW_EVENTS 1
|
||||||
|
@ -276,6 +278,8 @@ enum hl_device_status {
|
||||||
#define HL_INFO_RESET_COUNT 9
|
#define HL_INFO_RESET_COUNT 9
|
||||||
#define HL_INFO_TIME_SYNC 10
|
#define HL_INFO_TIME_SYNC 10
|
||||||
#define HL_INFO_CS_COUNTERS 11
|
#define HL_INFO_CS_COUNTERS 11
|
||||||
|
#define HL_INFO_PCI_COUNTERS 12
|
||||||
|
#define HL_INFO_CLK_THROTTLE_REASON 13
|
||||||
|
|
||||||
#define HL_INFO_VERSION_MAX_LEN 128
|
#define HL_INFO_VERSION_MAX_LEN 128
|
||||||
#define HL_INFO_CARD_NAME_MAX_LEN 16
|
#define HL_INFO_CARD_NAME_MAX_LEN 16
|
||||||
|
@ -340,6 +344,29 @@ struct hl_info_time_sync {
|
||||||
__u64 host_time;
|
__u64 host_time;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * struct hl_info_pci_counters - pci counters
 * @rx_throughput: PCI rx throughput KBps
 * @tx_throughput: PCI tx throughput KBps
 * @replay_cnt: PCI replay counter. The driver stores the value it receives
 *              truncated to 32 bits, so the upper half is always zero.
 *
 * Copied to user space by the HL_INFO_PCI_COUNTERS opcode of the INFO ioctl.
 * The driver copies at most sizeof(struct hl_info_pci_counters) bytes, or
 * fewer when the caller's return_size is smaller.
 */
|
||||||
|
struct hl_info_pci_counters {
|
||||||
|
__u64 rx_throughput;
|
||||||
|
__u64 tx_throughput;
|
||||||
|
__u64 replay_cnt;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Bit flags stored in the clk_throttling_reason field. The Gaudi and Goya
 * event handlers set a flag on the matching *_FIX_*_ENV_S event and clear it
 * on the matching *_FIX_*_ENV_E event.
 */
#define HL_CLK_THROTTLE_POWER 0x1
|
||||||
|
#define HL_CLK_THROTTLE_THERMAL 0x2
|
||||||
|
|
||||||
|
/**
 * struct hl_info_clk_throttle - clock throttling reason
 * @clk_throttling_reason: each bit represents a clk throttling reason
 *                         (HL_CLK_THROTTLE_POWER, HL_CLK_THROTTLE_THERMAL)
 *
 * Copied to user space by the HL_INFO_CLK_THROTTLE_REASON opcode of the INFO
 * ioctl.
 */
|
||||||
|
struct hl_info_clk_throttle {
|
||||||
|
__u32 clk_throttling_reason;
|
||||||
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* struct hl_info_cs_counters - command submission counters
|
* struct hl_info_cs_counters - command submission counters
|
||||||
* @out_of_mem_drop_cnt: dropped due to memory allocation issue
|
* @out_of_mem_drop_cnt: dropped due to memory allocation issue
|
||||||
|
|
Loading…
Reference in a new issue