Merge tag 'drm-habanalabs-next-2023-03-20' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into drm-next

This tag contains habanalabs driver and accel changes for v6.4:

- uAPI changes:

  - Add opcodes to the CS ioctl to allow the user to stall/resume specific
    engines inside Gaudi2, so the user can perform power testing/measurements
    when training different topologies (a minimal userspace sketch follows
    this list).

  - Expose in the INFO ioctl the amount of device memory that the driver
    and f/w reserve for themselves.

  - Expose in the INFO ioctl a bit-mask of the available rotator engines
    in Gaudi2. This is to align with other engines that are already exposed.

  - Expose in the INFO ioctl the address of the f/w register that should
    be used to trigger interrupts from within the user's code running on the
    compute engines.

  - Add a critical-event bit to the eventfd bitmask so the user will know
    that the received event is critical and that a reset will follow.

  - Expose in the INFO ioctl two new opcodes to fetch information on h/w and
    f/w events. The recorded events are those that were reported through the
    eventfd.
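
    As an illustration of the new engines command in the CS ioctl (referenced
    in the first item of this list), here is a minimal userspace sketch. The
    HL_CS_FLAGS_ENGINES_COMMAND flag and the in.engines / in.num_engines /
    in.engine_command fields appear in the diff below; the uAPI header path
    and the HL_ENGINE_STALL opcode name are assumptions made for the example.

      #include <stdint.h>
      #include <string.h>
      #include <sys/ioctl.h>
      #include <drm/habanalabs_accel.h>      /* assumed header location */

      /* Stall a user-supplied set of Gaudi2 engines through the CS ioctl. */
      static int stall_engines(int fd, const uint32_t *engine_ids, uint32_t count)
      {
              union hl_cs_args args;

              memset(&args, 0, sizeof(args));
              args.in.cs_flags = HL_CS_FLAGS_ENGINES_COMMAND;
              args.in.engines = (uint64_t) (uintptr_t) engine_ids; /* array of engine IDs */
              args.in.num_engines = count;
              args.in.engine_command = HL_ENGINE_STALL;    /* assumed opcode name */

              return ioctl(fd, HL_IOCTL_CS, &args);
      }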

- New features and improvements:

  - Add a dedicated MSI-X interrupt ID in the device for notification of an
    unexpected user-related event in Gaudi2. Handle it in the driver by
    reporting the event to the user.

  - Allow the user to fetch the current device memory usage even when the
    device is undergoing compute-reset (a reset type that only clears the
    compute engines).

  - Enable the graceful reset mechanism for compute-reset. This gives the
    user a few seconds before the device is reset, during which the user can,
    for example, perform certain device operations (dump data for debug) or
    close the device in an orderly fashion.

  - Align the decoder with the rest of the engines with regard to notifying
    the user about interrupts and to performing a graceful reset when needed
    (instead of an immediate reset).

  - Add support for the assert interrupt from the TPC engine.

  - Get the reset type that needs to be performed per event from the
    auto-generated irq_map array.

  - Print the specific reason why a device is still in use when notifying
    the user about it (after the user has closed the device's FD).

  - Move to a threaded IRQ when handling workload-completion interrupts
    (a pattern sketch follows this list).
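
    As a rough sketch of the threaded-IRQ pattern (not the driver's actual
    handlers), the hard handler only acknowledges the interrupt and wakes a
    kernel thread that does the completion processing in process context;
    this is consistent with the diff below replacing spin_lock_irqsave() with
    spin_lock() on the interrupt wait-list lock. The my_cq type and the
    process_completions() helper are placeholders.

      #include <linux/interrupt.h>

      struct my_cq;                                   /* placeholder type */
      void process_completions(struct my_cq *cq);     /* placeholder helper */

      static irqreturn_t my_cq_irq_handler(int irq, void *arg)
      {
              /* Hard-IRQ context: keep it minimal and defer to the thread. */
              return IRQ_WAKE_THREAD;
      }

      static irqreturn_t my_cq_irq_thread(int irq, void *arg)
      {
              struct my_cq *cq = arg;

              /* Thread context: longer completion handling is fine here. */
              process_completions(cq);
              return IRQ_HANDLED;
      }

      /* Registration, e.g. during device init: */
      static int my_register_cq_irq(struct my_cq *cq, int irq)
      {
              return request_threaded_irq(irq, my_cq_irq_handler,
                                          my_cq_irq_thread, 0, "my-cq", cq);
      }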

- Firmware related fixes:

  - Fix RAZWI event handler to match newest f/w version.

  - Read the error cause register on DMA core events because the f/w doesn't
    do that.

  - Increase the maximum time to wait for completion of a Gaudi2 reset, due
    to a f/w bug.

  - Align to the latest firmware specs.

- Enforce the release order of the compute device and dma-buf, i.e.,
  increment the device file refcount for any dma-buf that was exported
  for that device, as sketched below. This ensures the compute device release
  function won't be called until the user closes all the FDs of the relevant
  dma-bufs. Without this change, closing the device's FD before/without
  closing the dma-buf's FD would always lead to a hard-reset of the device.
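
  A minimal sketch of that refcounting pattern (helper and field names are
  illustrative, except hdev->dmabuf_export_cnt which appears in the diff
  below): the export path pins the device file with get_file(), and the
  dma-buf release callback drops it with fput(), so the device's release
  function can only run once every exported dma-buf has been closed.

    #include <linux/fs.h>
    #include <linux/file.h>
    #include <linux/atomic.h>

    /* struct hl_fpriv / struct hl_dmabuf_priv as in the driver; the filp and
     * hpriv fields used here are assumptions for illustration.
     */
    static void on_dmabuf_export(struct hl_fpriv *hpriv,
                                 struct hl_dmabuf_priv *dmabuf_priv)
    {
            get_file(hpriv->filp);                  /* pin the device file */
            dmabuf_priv->hpriv = hpriv;
            atomic_inc(&hpriv->hdev->dmabuf_export_cnt);
    }

    static void on_dmabuf_release(struct hl_dmabuf_priv *dmabuf_priv)
    {
            struct hl_fpriv *hpriv = dmabuf_priv->hpriv;

            atomic_dec(&hpriv->hdev->dmabuf_export_cnt);
            fput(hpriv->filp);   /* last fput lets the device release run */
    }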

- Fix a link in the drm documentation to correctly point to the accel section.

- Compilation warning cleanups

- Misc bug fixes and code cleanups

Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Oded Gabbay <ogabbay@kernel.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20230320154026.GA766126@ogabbay-vm-u20.habana-labs.com

@ -14,10 +14,10 @@
#define HL_CS_FLAGS_TYPE_MASK (HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT | \ #define HL_CS_FLAGS_TYPE_MASK (HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT | \
HL_CS_FLAGS_COLLECTIVE_WAIT | HL_CS_FLAGS_RESERVE_SIGNALS_ONLY | \ HL_CS_FLAGS_COLLECTIVE_WAIT | HL_CS_FLAGS_RESERVE_SIGNALS_ONLY | \
HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY | HL_CS_FLAGS_ENGINE_CORE_COMMAND | \ HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY | HL_CS_FLAGS_ENGINE_CORE_COMMAND | \
HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES) HL_CS_FLAGS_ENGINES_COMMAND | HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES)
#define MAX_TS_ITER_NUM 10 #define MAX_TS_ITER_NUM 100
/** /**
* enum hl_cs_wait_status - cs wait status * enum hl_cs_wait_status - cs wait status
@ -657,7 +657,7 @@ static inline void cs_release_sob_reset_handler(struct hl_device *hdev,
/* /*
* we get refcount upon reservation of signals or signal/wait cs for the * we get refcount upon reservation of signals or signal/wait cs for the
* hw_sob object, and need to put it when the first staged cs * hw_sob object, and need to put it when the first staged cs
* (which cotains the encaps signals) or cs signal/wait is completed. * (which contains the encaps signals) or cs signal/wait is completed.
*/ */
if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) || if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
(hl_cs_cmpl->type == CS_TYPE_WAIT) || (hl_cs_cmpl->type == CS_TYPE_WAIT) ||
@ -1082,9 +1082,8 @@ static void
wake_pending_user_interrupt_threads(struct hl_user_interrupt *interrupt) wake_pending_user_interrupt_threads(struct hl_user_interrupt *interrupt)
{ {
struct hl_user_pending_interrupt *pend, *temp; struct hl_user_pending_interrupt *pend, *temp;
unsigned long flags;
spin_lock_irqsave(&interrupt->wait_list_lock, flags); spin_lock(&interrupt->wait_list_lock);
list_for_each_entry_safe(pend, temp, &interrupt->wait_list_head, wait_list_node) { list_for_each_entry_safe(pend, temp, &interrupt->wait_list_head, wait_list_node) {
if (pend->ts_reg_info.buf) { if (pend->ts_reg_info.buf) {
list_del(&pend->wait_list_node); list_del(&pend->wait_list_node);
@ -1095,7 +1094,7 @@ wake_pending_user_interrupt_threads(struct hl_user_interrupt *interrupt)
complete_all(&pend->fence.completion); complete_all(&pend->fence.completion);
} }
} }
spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); spin_unlock(&interrupt->wait_list_lock);
} }
void hl_release_pending_user_interrupts(struct hl_device *hdev) void hl_release_pending_user_interrupts(struct hl_device *hdev)
@ -1168,6 +1167,22 @@ static void cs_completion(struct work_struct *work)
hl_complete_job(hdev, job); hl_complete_job(hdev, job);
} }
u32 hl_get_active_cs_num(struct hl_device *hdev)
{
u32 active_cs_num = 0;
struct hl_cs *cs;
spin_lock(&hdev->cs_mirror_lock);
list_for_each_entry(cs, &hdev->cs_mirror_list, mirror_node)
if (!cs->completed)
active_cs_num++;
spin_unlock(&hdev->cs_mirror_lock);
return active_cs_num;
}
static int validate_queue_index(struct hl_device *hdev, static int validate_queue_index(struct hl_device *hdev,
struct hl_cs_chunk *chunk, struct hl_cs_chunk *chunk,
enum hl_queue_type *queue_type, enum hl_queue_type *queue_type,
@ -1304,6 +1319,8 @@ static enum hl_cs_type hl_cs_get_cs_type(u32 cs_type_flags)
return CS_UNRESERVE_SIGNALS; return CS_UNRESERVE_SIGNALS;
else if (cs_type_flags & HL_CS_FLAGS_ENGINE_CORE_COMMAND) else if (cs_type_flags & HL_CS_FLAGS_ENGINE_CORE_COMMAND)
return CS_TYPE_ENGINE_CORE; return CS_TYPE_ENGINE_CORE;
else if (cs_type_flags & HL_CS_FLAGS_ENGINES_COMMAND)
return CS_TYPE_ENGINES;
else if (cs_type_flags & HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES) else if (cs_type_flags & HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES)
return CS_TYPE_FLUSH_PCI_HBW_WRITES; return CS_TYPE_FLUSH_PCI_HBW_WRITES;
else else
@ -2429,10 +2446,13 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
static int cs_ioctl_engine_cores(struct hl_fpriv *hpriv, u64 engine_cores, static int cs_ioctl_engine_cores(struct hl_fpriv *hpriv, u64 engine_cores,
u32 num_engine_cores, u32 core_command) u32 num_engine_cores, u32 core_command)
{ {
int rc;
struct hl_device *hdev = hpriv->hdev; struct hl_device *hdev = hpriv->hdev;
void __user *engine_cores_arr; void __user *engine_cores_arr;
u32 *cores; u32 *cores;
int rc;
if (!hdev->asic_prop.supports_engine_modes)
return -EPERM;
if (!num_engine_cores || num_engine_cores > hdev->asic_prop.num_engine_cores) { if (!num_engine_cores || num_engine_cores > hdev->asic_prop.num_engine_cores) {
dev_err(hdev->dev, "Number of engine cores %d is invalid\n", num_engine_cores); dev_err(hdev->dev, "Number of engine cores %d is invalid\n", num_engine_cores);
@ -2461,6 +2481,48 @@ static int cs_ioctl_engine_cores(struct hl_fpriv *hpriv, u64 engine_cores,
return rc; return rc;
} }
static int cs_ioctl_engines(struct hl_fpriv *hpriv, u64 engines_arr_user_addr,
u32 num_engines, enum hl_engine_command command)
{
struct hl_device *hdev = hpriv->hdev;
u32 *engines, max_num_of_engines;
void __user *engines_arr;
int rc;
if (!hdev->asic_prop.supports_engine_modes)
return -EPERM;
if (command >= HL_ENGINE_COMMAND_MAX) {
dev_err(hdev->dev, "Engine command is invalid\n");
return -EINVAL;
}
max_num_of_engines = hdev->asic_prop.max_num_of_engines;
if (command == HL_ENGINE_CORE_RUN || command == HL_ENGINE_CORE_HALT)
max_num_of_engines = hdev->asic_prop.num_engine_cores;
if (!num_engines || num_engines > max_num_of_engines) {
dev_err(hdev->dev, "Number of engines %d is invalid\n", num_engines);
return -EINVAL;
}
engines_arr = (void __user *) (uintptr_t) engines_arr_user_addr;
engines = kmalloc_array(num_engines, sizeof(u32), GFP_KERNEL);
if (!engines)
return -ENOMEM;
if (copy_from_user(engines, engines_arr, num_engines * sizeof(u32))) {
dev_err(hdev->dev, "Failed to copy engine-ids array from user\n");
kfree(engines);
return -EFAULT;
}
rc = hdev->asic_funcs->set_engines(hdev, engines, num_engines, command);
kfree(engines);
return rc;
}
static int cs_ioctl_flush_pci_hbw_writes(struct hl_fpriv *hpriv) static int cs_ioctl_flush_pci_hbw_writes(struct hl_fpriv *hpriv)
{ {
struct hl_device *hdev = hpriv->hdev; struct hl_device *hdev = hpriv->hdev;
@ -2532,6 +2594,10 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
rc = cs_ioctl_engine_cores(hpriv, args->in.engine_cores, rc = cs_ioctl_engine_cores(hpriv, args->in.engine_cores,
args->in.num_engine_cores, args->in.core_command); args->in.num_engine_cores, args->in.core_command);
break; break;
case CS_TYPE_ENGINES:
rc = cs_ioctl_engines(hpriv, args->in.engines,
args->in.num_engines, args->in.engine_command);
break;
case CS_TYPE_FLUSH_PCI_HBW_WRITES: case CS_TYPE_FLUSH_PCI_HBW_WRITES:
rc = cs_ioctl_flush_pci_hbw_writes(hpriv); rc = cs_ioctl_flush_pci_hbw_writes(hpriv);
break; break;
@ -3143,8 +3209,9 @@ static int ts_buff_get_kernel_ts_record(struct hl_mmap_mem_buf *buf,
struct hl_user_pending_interrupt *cb_last = struct hl_user_pending_interrupt *cb_last =
(struct hl_user_pending_interrupt *)ts_buff->kernel_buff_address + (struct hl_user_pending_interrupt *)ts_buff->kernel_buff_address +
(ts_buff->kernel_buff_size / sizeof(struct hl_user_pending_interrupt)); (ts_buff->kernel_buff_size / sizeof(struct hl_user_pending_interrupt));
unsigned long flags, iter_counter = 0; unsigned long iter_counter = 0;
u64 current_cq_counter; u64 current_cq_counter;
ktime_t timestamp;
/* Validate ts_offset not exceeding last max */ /* Validate ts_offset not exceeding last max */
if (requested_offset_record >= cb_last) { if (requested_offset_record >= cb_last) {
@ -3153,8 +3220,10 @@ static int ts_buff_get_kernel_ts_record(struct hl_mmap_mem_buf *buf,
return -EINVAL; return -EINVAL;
} }
timestamp = ktime_get();
start_over: start_over:
spin_lock_irqsave(wait_list_lock, flags); spin_lock(wait_list_lock);
/* Unregister only if we didn't reach the target value /* Unregister only if we didn't reach the target value
* since in this case there will be no handling in irq context * since in this case there will be no handling in irq context
@ -3165,7 +3234,7 @@ static int ts_buff_get_kernel_ts_record(struct hl_mmap_mem_buf *buf,
current_cq_counter = *requested_offset_record->cq_kernel_addr; current_cq_counter = *requested_offset_record->cq_kernel_addr;
if (current_cq_counter < requested_offset_record->cq_target_value) { if (current_cq_counter < requested_offset_record->cq_target_value) {
list_del(&requested_offset_record->wait_list_node); list_del(&requested_offset_record->wait_list_node);
spin_unlock_irqrestore(wait_list_lock, flags); spin_unlock(wait_list_lock);
hl_mmap_mem_buf_put(requested_offset_record->ts_reg_info.buf); hl_mmap_mem_buf_put(requested_offset_record->ts_reg_info.buf);
hl_cb_put(requested_offset_record->ts_reg_info.cq_cb); hl_cb_put(requested_offset_record->ts_reg_info.cq_cb);
@ -3176,13 +3245,14 @@ static int ts_buff_get_kernel_ts_record(struct hl_mmap_mem_buf *buf,
dev_dbg(buf->mmg->dev, dev_dbg(buf->mmg->dev,
"ts node in middle of irq handling\n"); "ts node in middle of irq handling\n");
/* irq handling in the middle give it time to finish */ /* irq thread handling in the middle give it time to finish */
spin_unlock_irqrestore(wait_list_lock, flags); spin_unlock(wait_list_lock);
usleep_range(1, 10); usleep_range(100, 1000);
if (++iter_counter == MAX_TS_ITER_NUM) { if (++iter_counter == MAX_TS_ITER_NUM) {
dev_err(buf->mmg->dev, dev_err(buf->mmg->dev,
"handling registration interrupt took too long!!\n"); "Timestamp offset processing reached timeout of %lld ms\n",
return -EINVAL; ktime_ms_delta(ktime_get(), timestamp));
return -EAGAIN;
} }
goto start_over; goto start_over;
@ -3197,7 +3267,7 @@ static int ts_buff_get_kernel_ts_record(struct hl_mmap_mem_buf *buf,
(u64 *) cq_cb->kernel_address + cq_offset; (u64 *) cq_cb->kernel_address + cq_offset;
requested_offset_record->cq_target_value = target_value; requested_offset_record->cq_target_value = target_value;
spin_unlock_irqrestore(wait_list_lock, flags); spin_unlock(wait_list_lock);
} }
*pend = requested_offset_record; *pend = requested_offset_record;
@ -3217,7 +3287,7 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
struct hl_user_pending_interrupt *pend; struct hl_user_pending_interrupt *pend;
struct hl_mmap_mem_buf *buf; struct hl_mmap_mem_buf *buf;
struct hl_cb *cq_cb; struct hl_cb *cq_cb;
unsigned long timeout, flags; unsigned long timeout;
long completion_rc; long completion_rc;
int rc = 0; int rc = 0;
@ -3264,7 +3334,7 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
pend->cq_target_value = target_value; pend->cq_target_value = target_value;
} }
spin_lock_irqsave(&interrupt->wait_list_lock, flags); spin_lock(&interrupt->wait_list_lock);
/* We check for completion value as interrupt could have been received /* We check for completion value as interrupt could have been received
* before we added the node to the wait list * before we added the node to the wait list
@ -3272,7 +3342,7 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
if (*pend->cq_kernel_addr >= target_value) { if (*pend->cq_kernel_addr >= target_value) {
if (register_ts_record) if (register_ts_record)
pend->ts_reg_info.in_use = 0; pend->ts_reg_info.in_use = 0;
spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); spin_unlock(&interrupt->wait_list_lock);
*status = HL_WAIT_CS_STATUS_COMPLETED; *status = HL_WAIT_CS_STATUS_COMPLETED;
@ -3284,7 +3354,7 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
goto set_timestamp; goto set_timestamp;
} }
} else if (!timeout_us) { } else if (!timeout_us) {
spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); spin_unlock(&interrupt->wait_list_lock);
*status = HL_WAIT_CS_STATUS_BUSY; *status = HL_WAIT_CS_STATUS_BUSY;
pend->fence.timestamp = ktime_get(); pend->fence.timestamp = ktime_get();
goto set_timestamp; goto set_timestamp;
@ -3309,7 +3379,7 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
pend->ts_reg_info.in_use = 1; pend->ts_reg_info.in_use = 1;
list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head); list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head);
spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); spin_unlock(&interrupt->wait_list_lock);
if (register_ts_record) { if (register_ts_record) {
rc = *status = HL_WAIT_CS_STATUS_COMPLETED; rc = *status = HL_WAIT_CS_STATUS_COMPLETED;
@ -3353,9 +3423,9 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
* for ts record, the node will be deleted in the irq handler after * for ts record, the node will be deleted in the irq handler after
* we reach the target value. * we reach the target value.
*/ */
spin_lock_irqsave(&interrupt->wait_list_lock, flags); spin_lock(&interrupt->wait_list_lock);
list_del(&pend->wait_list_node); list_del(&pend->wait_list_node);
spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); spin_unlock(&interrupt->wait_list_lock);
set_timestamp: set_timestamp:
*timestamp = ktime_to_ns(pend->fence.timestamp); *timestamp = ktime_to_ns(pend->fence.timestamp);
@ -3383,7 +3453,7 @@ static int _hl_interrupt_wait_ioctl_user_addr(struct hl_device *hdev, struct hl_
u64 *timestamp) u64 *timestamp)
{ {
struct hl_user_pending_interrupt *pend; struct hl_user_pending_interrupt *pend;
unsigned long timeout, flags; unsigned long timeout;
u64 completion_value; u64 completion_value;
long completion_rc; long completion_rc;
int rc = 0; int rc = 0;
@ -3403,9 +3473,9 @@ static int _hl_interrupt_wait_ioctl_user_addr(struct hl_device *hdev, struct hl_
/* Add pending user interrupt to relevant list for the interrupt /* Add pending user interrupt to relevant list for the interrupt
* handler to monitor * handler to monitor
*/ */
spin_lock_irqsave(&interrupt->wait_list_lock, flags); spin_lock(&interrupt->wait_list_lock);
list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head); list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head);
spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); spin_unlock(&interrupt->wait_list_lock);
/* We check for completion value as interrupt could have been received /* We check for completion value as interrupt could have been received
* before we added the node to the wait list * before we added the node to the wait list
@ -3436,14 +3506,14 @@ static int _hl_interrupt_wait_ioctl_user_addr(struct hl_device *hdev, struct hl_
* If comparison fails, keep waiting until timeout expires * If comparison fails, keep waiting until timeout expires
*/ */
if (completion_rc > 0) { if (completion_rc > 0) {
spin_lock_irqsave(&interrupt->wait_list_lock, flags); spin_lock(&interrupt->wait_list_lock);
/* reinit_completion must be called before we check for user /* reinit_completion must be called before we check for user
* completion value, otherwise, if interrupt is received after * completion value, otherwise, if interrupt is received after
* the comparison and before the next wait_for_completion, * the comparison and before the next wait_for_completion,
* we will reach timeout and fail * we will reach timeout and fail
*/ */
reinit_completion(&pend->fence.completion); reinit_completion(&pend->fence.completion);
spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); spin_unlock(&interrupt->wait_list_lock);
if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) { if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) {
dev_err(hdev->dev, "Failed to copy completion value from user\n"); dev_err(hdev->dev, "Failed to copy completion value from user\n");
@ -3480,9 +3550,9 @@ static int _hl_interrupt_wait_ioctl_user_addr(struct hl_device *hdev, struct hl_
} }
remove_pending_user_interrupt: remove_pending_user_interrupt:
spin_lock_irqsave(&interrupt->wait_list_lock, flags); spin_lock(&interrupt->wait_list_lock);
list_del(&pend->wait_list_node); list_del(&pend->wait_list_node);
spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); spin_unlock(&interrupt->wait_list_lock);
*timestamp = ktime_to_ns(pend->fence.timestamp); *timestamp = ktime_to_ns(pend->fence.timestamp);


@ -258,7 +258,7 @@ static int vm_show(struct seq_file *s, void *data)
if (!dev_entry->hdev->mmu_enable) if (!dev_entry->hdev->mmu_enable)
return 0; return 0;
spin_lock(&dev_entry->ctx_mem_hash_spinlock); mutex_lock(&dev_entry->ctx_mem_hash_mutex);
list_for_each_entry(ctx, &dev_entry->ctx_mem_hash_list, debugfs_list) { list_for_each_entry(ctx, &dev_entry->ctx_mem_hash_list, debugfs_list) {
once = false; once = false;
@ -329,7 +329,7 @@ static int vm_show(struct seq_file *s, void *data)
} }
spin_unlock(&dev_entry->ctx_mem_hash_spinlock); mutex_unlock(&dev_entry->ctx_mem_hash_mutex);
ctx = hl_get_compute_ctx(dev_entry->hdev); ctx = hl_get_compute_ctx(dev_entry->hdev);
if (ctx) { if (ctx) {
@ -1583,59 +1583,183 @@ static const struct file_operations hl_debugfs_fops = {
.release = single_release, .release = single_release,
}; };
static void add_secured_nodes(struct hl_dbg_device_entry *dev_entry) static void add_secured_nodes(struct hl_dbg_device_entry *dev_entry, struct dentry *root)
{ {
debugfs_create_u8("i2c_bus", debugfs_create_u8("i2c_bus",
0644, 0644,
dev_entry->root, root,
&dev_entry->i2c_bus); &dev_entry->i2c_bus);
debugfs_create_u8("i2c_addr", debugfs_create_u8("i2c_addr",
0644, 0644,
dev_entry->root, root,
&dev_entry->i2c_addr); &dev_entry->i2c_addr);
debugfs_create_u8("i2c_reg", debugfs_create_u8("i2c_reg",
0644, 0644,
dev_entry->root, root,
&dev_entry->i2c_reg); &dev_entry->i2c_reg);
debugfs_create_u8("i2c_len", debugfs_create_u8("i2c_len",
0644, 0644,
dev_entry->root, root,
&dev_entry->i2c_len); &dev_entry->i2c_len);
debugfs_create_file("i2c_data", debugfs_create_file("i2c_data",
0644, 0644,
dev_entry->root, root,
dev_entry, dev_entry,
&hl_i2c_data_fops); &hl_i2c_data_fops);
debugfs_create_file("led0", debugfs_create_file("led0",
0200, 0200,
dev_entry->root, root,
dev_entry, dev_entry,
&hl_led0_fops); &hl_led0_fops);
debugfs_create_file("led1", debugfs_create_file("led1",
0200, 0200,
dev_entry->root, root,
dev_entry, dev_entry,
&hl_led1_fops); &hl_led1_fops);
debugfs_create_file("led2", debugfs_create_file("led2",
0200, 0200,
dev_entry->root, root,
dev_entry, dev_entry,
&hl_led2_fops); &hl_led2_fops);
} }
static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_entry *dev_entry,
struct dentry *root)
{
int count = ARRAY_SIZE(hl_debugfs_list);
struct hl_debugfs_entry *entry;
int i;
debugfs_create_x64("memory_scrub_val",
0644,
root,
&hdev->memory_scrub_val);
debugfs_create_file("memory_scrub",
0200,
root,
dev_entry,
&hl_mem_scrub_fops);
debugfs_create_x64("addr",
0644,
root,
&dev_entry->addr);
debugfs_create_file("data32",
0644,
root,
dev_entry,
&hl_data32b_fops);
debugfs_create_file("data64",
0644,
root,
dev_entry,
&hl_data64b_fops);
debugfs_create_file("set_power_state",
0200,
root,
dev_entry,
&hl_power_fops);
debugfs_create_file("device",
0200,
root,
dev_entry,
&hl_device_fops);
debugfs_create_file("clk_gate",
0200,
root,
dev_entry,
&hl_clk_gate_fops);
debugfs_create_file("stop_on_err",
0644,
root,
dev_entry,
&hl_stop_on_err_fops);
debugfs_create_file("dump_security_violations",
0644,
root,
dev_entry,
&hl_security_violations_fops);
debugfs_create_file("dump_razwi_events",
0644,
root,
dev_entry,
&hl_razwi_check_fops);
debugfs_create_file("dma_size",
0200,
root,
dev_entry,
&hl_dma_size_fops);
debugfs_create_blob("data_dma",
0400,
root,
&dev_entry->data_dma_blob_desc);
debugfs_create_file("monitor_dump_trig",
0200,
root,
dev_entry,
&hl_monitor_dump_fops);
debugfs_create_blob("monitor_dump",
0400,
root,
&dev_entry->mon_dump_blob_desc);
debugfs_create_x8("skip_reset_on_timeout",
0644,
root,
&hdev->reset_info.skip_reset_on_timeout);
debugfs_create_file("state_dump",
0600,
root,
dev_entry,
&hl_state_dump_fops);
debugfs_create_file("timeout_locked",
0644,
root,
dev_entry,
&hl_timeout_locked_fops);
debugfs_create_u32("device_release_watchdog_timeout",
0644,
root,
&hdev->device_release_watchdog_timeout_sec);
for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
debugfs_create_file(hl_debugfs_list[i].name,
0444,
root,
entry,
&hl_debugfs_fops);
entry->info_ent = &hl_debugfs_list[i];
entry->dev_entry = dev_entry;
}
}
void hl_debugfs_add_device(struct hl_device *hdev) void hl_debugfs_add_device(struct hl_device *hdev)
{ {
struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs; struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs;
int count = ARRAY_SIZE(hl_debugfs_list); int count = ARRAY_SIZE(hl_debugfs_list);
struct hl_debugfs_entry *entry;
int i;
dev_entry->hdev = hdev; dev_entry->hdev = hdev;
dev_entry->entry_arr = kmalloc_array(count, dev_entry->entry_arr = kmalloc_array(count,
@ -1661,131 +1785,14 @@ void hl_debugfs_add_device(struct hl_device *hdev)
spin_lock_init(&dev_entry->cs_spinlock); spin_lock_init(&dev_entry->cs_spinlock);
spin_lock_init(&dev_entry->cs_job_spinlock); spin_lock_init(&dev_entry->cs_job_spinlock);
spin_lock_init(&dev_entry->userptr_spinlock); spin_lock_init(&dev_entry->userptr_spinlock);
spin_lock_init(&dev_entry->ctx_mem_hash_spinlock); mutex_init(&dev_entry->ctx_mem_hash_mutex);
dev_entry->root = debugfs_create_dir(dev_name(hdev->dev), dev_entry->root = debugfs_create_dir(dev_name(hdev->dev),
hl_debug_root); hl_debug_root);
debugfs_create_x64("memory_scrub_val", add_files_to_device(hdev, dev_entry, dev_entry->root);
0644,
dev_entry->root,
&hdev->memory_scrub_val);
debugfs_create_file("memory_scrub",
0200,
dev_entry->root,
dev_entry,
&hl_mem_scrub_fops);
debugfs_create_x64("addr",
0644,
dev_entry->root,
&dev_entry->addr);
debugfs_create_file("data32",
0644,
dev_entry->root,
dev_entry,
&hl_data32b_fops);
debugfs_create_file("data64",
0644,
dev_entry->root,
dev_entry,
&hl_data64b_fops);
debugfs_create_file("set_power_state",
0200,
dev_entry->root,
dev_entry,
&hl_power_fops);
debugfs_create_file("device",
0200,
dev_entry->root,
dev_entry,
&hl_device_fops);
debugfs_create_file("clk_gate",
0200,
dev_entry->root,
dev_entry,
&hl_clk_gate_fops);
debugfs_create_file("stop_on_err",
0644,
dev_entry->root,
dev_entry,
&hl_stop_on_err_fops);
debugfs_create_file("dump_security_violations",
0644,
dev_entry->root,
dev_entry,
&hl_security_violations_fops);
debugfs_create_file("dump_razwi_events",
0644,
dev_entry->root,
dev_entry,
&hl_razwi_check_fops);
debugfs_create_file("dma_size",
0200,
dev_entry->root,
dev_entry,
&hl_dma_size_fops);
debugfs_create_blob("data_dma",
0400,
dev_entry->root,
&dev_entry->data_dma_blob_desc);
debugfs_create_file("monitor_dump_trig",
0200,
dev_entry->root,
dev_entry,
&hl_monitor_dump_fops);
debugfs_create_blob("monitor_dump",
0400,
dev_entry->root,
&dev_entry->mon_dump_blob_desc);
debugfs_create_x8("skip_reset_on_timeout",
0644,
dev_entry->root,
&hdev->reset_info.skip_reset_on_timeout);
debugfs_create_file("state_dump",
0600,
dev_entry->root,
dev_entry,
&hl_state_dump_fops);
debugfs_create_file("timeout_locked",
0644,
dev_entry->root,
dev_entry,
&hl_timeout_locked_fops);
debugfs_create_u32("device_release_watchdog_timeout",
0644,
dev_entry->root,
&hdev->device_release_watchdog_timeout_sec);
for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
debugfs_create_file(hl_debugfs_list[i].name,
0444,
dev_entry->root,
entry,
&hl_debugfs_fops);
entry->info_ent = &hl_debugfs_list[i];
entry->dev_entry = dev_entry;
}
if (!hdev->asic_prop.fw_security_enabled) if (!hdev->asic_prop.fw_security_enabled)
add_secured_nodes(dev_entry); add_secured_nodes(dev_entry, dev_entry->root);
} }
void hl_debugfs_remove_device(struct hl_device *hdev) void hl_debugfs_remove_device(struct hl_device *hdev)
@ -1795,6 +1802,7 @@ void hl_debugfs_remove_device(struct hl_device *hdev)
debugfs_remove_recursive(entry->root); debugfs_remove_recursive(entry->root);
mutex_destroy(&entry->ctx_mem_hash_mutex);
mutex_destroy(&entry->file_mutex); mutex_destroy(&entry->file_mutex);
vfree(entry->data_dma_blob_desc.data); vfree(entry->data_dma_blob_desc.data);
@ -1901,18 +1909,18 @@ void hl_debugfs_add_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx)
{ {
struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs; struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs;
spin_lock(&dev_entry->ctx_mem_hash_spinlock); mutex_lock(&dev_entry->ctx_mem_hash_mutex);
list_add(&ctx->debugfs_list, &dev_entry->ctx_mem_hash_list); list_add(&ctx->debugfs_list, &dev_entry->ctx_mem_hash_list);
spin_unlock(&dev_entry->ctx_mem_hash_spinlock); mutex_unlock(&dev_entry->ctx_mem_hash_mutex);
} }
void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx) void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx)
{ {
struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs; struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs;
spin_lock(&dev_entry->ctx_mem_hash_spinlock); mutex_lock(&dev_entry->ctx_mem_hash_mutex);
list_del(&ctx->debugfs_list); list_del(&ctx->debugfs_list);
spin_unlock(&dev_entry->ctx_mem_hash_spinlock); mutex_unlock(&dev_entry->ctx_mem_hash_mutex);
} }
/** /**


@ -46,7 +46,7 @@ static void dec_print_abnrm_intr_source(struct hl_device *hdev, u32 irq_status)
static void dec_error_intr_work(struct hl_device *hdev, u32 base_addr, u32 core_id) static void dec_error_intr_work(struct hl_device *hdev, u32 base_addr, u32 core_id)
{ {
bool reset_required = false; bool reset_required = false;
u32 irq_status; u32 irq_status, event_mask;
irq_status = RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET); irq_status = RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET);
@ -54,17 +54,27 @@ static void dec_error_intr_work(struct hl_device *hdev, u32 base_addr, u32 core_
dec_print_abnrm_intr_source(hdev, irq_status); dec_print_abnrm_intr_source(hdev, irq_status);
if (irq_status & VCMD_IRQ_STATUS_TIMEOUT_MASK)
reset_required = true;
/* Clear the interrupt */ /* Clear the interrupt */
WREG32(base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status); WREG32(base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status);
/* Flush the interrupt clear */ /* Flush the interrupt clear */
RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET); RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET);
if (reset_required) if (irq_status & VCMD_IRQ_STATUS_TIMEOUT_MASK) {
hl_device_reset(hdev, HL_DRV_RESET_HARD); reset_required = true;
event_mask = HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
} else if (irq_status & VCMD_IRQ_STATUS_CMDERR_MASK) {
event_mask = HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
} else {
event_mask = HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
}
if (reset_required) {
event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
hl_device_cond_reset(hdev, 0, event_mask);
} else {
hl_notifier_event_send_all(hdev, event_mask);
}
} }
static void dec_completion_abnrm(struct work_struct *work) static void dec_completion_abnrm(struct work_struct *work)


@ -22,7 +22,6 @@
enum dma_alloc_type { enum dma_alloc_type {
DMA_ALLOC_COHERENT, DMA_ALLOC_COHERENT,
DMA_ALLOC_CPU_ACCESSIBLE,
DMA_ALLOC_POOL, DMA_ALLOC_POOL,
}; };
@ -121,9 +120,6 @@ static void *hl_dma_alloc_common(struct hl_device *hdev, size_t size, dma_addr_t
case DMA_ALLOC_COHERENT: case DMA_ALLOC_COHERENT:
ptr = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, size, dma_handle, flag); ptr = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, size, dma_handle, flag);
break; break;
case DMA_ALLOC_CPU_ACCESSIBLE:
ptr = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, size, dma_handle);
break;
case DMA_ALLOC_POOL: case DMA_ALLOC_POOL:
ptr = hdev->asic_funcs->asic_dma_pool_zalloc(hdev, size, flag, dma_handle); ptr = hdev->asic_funcs->asic_dma_pool_zalloc(hdev, size, flag, dma_handle);
break; break;
@ -147,9 +143,6 @@ static void hl_asic_dma_free_common(struct hl_device *hdev, size_t size, void *c
case DMA_ALLOC_COHERENT: case DMA_ALLOC_COHERENT:
hdev->asic_funcs->asic_dma_free_coherent(hdev, size, cpu_addr, dma_handle); hdev->asic_funcs->asic_dma_free_coherent(hdev, size, cpu_addr, dma_handle);
break; break;
case DMA_ALLOC_CPU_ACCESSIBLE:
hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, size, cpu_addr);
break;
case DMA_ALLOC_POOL: case DMA_ALLOC_POOL:
hdev->asic_funcs->asic_dma_pool_free(hdev, cpu_addr, dma_handle); hdev->asic_funcs->asic_dma_pool_free(hdev, cpu_addr, dma_handle);
break; break;
@ -170,18 +163,6 @@ void hl_asic_dma_free_coherent_caller(struct hl_device *hdev, size_t size, void
hl_asic_dma_free_common(hdev, size, cpu_addr, dma_handle, DMA_ALLOC_COHERENT, caller); hl_asic_dma_free_common(hdev, size, cpu_addr, dma_handle, DMA_ALLOC_COHERENT, caller);
} }
void *hl_cpu_accessible_dma_pool_alloc_caller(struct hl_device *hdev, size_t size,
dma_addr_t *dma_handle, const char *caller)
{
return hl_dma_alloc_common(hdev, size, dma_handle, 0, DMA_ALLOC_CPU_ACCESSIBLE, caller);
}
void hl_cpu_accessible_dma_pool_free_caller(struct hl_device *hdev, size_t size, void *vaddr,
const char *caller)
{
hl_asic_dma_free_common(hdev, size, vaddr, 0, DMA_ALLOC_CPU_ACCESSIBLE, caller);
}
void *hl_asic_dma_pool_zalloc_caller(struct hl_device *hdev, size_t size, gfp_t mem_flags, void *hl_asic_dma_pool_zalloc_caller(struct hl_device *hdev, size_t size, gfp_t mem_flags,
dma_addr_t *dma_handle, const char *caller) dma_addr_t *dma_handle, const char *caller)
{ {
@ -194,6 +175,16 @@ void hl_asic_dma_pool_free_caller(struct hl_device *hdev, void *vaddr, dma_addr_
hl_asic_dma_free_common(hdev, 0, vaddr, dma_addr, DMA_ALLOC_POOL, caller); hl_asic_dma_free_common(hdev, 0, vaddr, dma_addr, DMA_ALLOC_POOL, caller);
} }
void *hl_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle)
{
return hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, size, dma_handle);
}
void hl_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, void *vaddr)
{
hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, size, vaddr);
}
int hl_dma_map_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir) int hl_dma_map_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir)
{ {
struct asic_fixed_properties *prop = &hdev->asic_prop; struct asic_fixed_properties *prop = &hdev->asic_prop;
@ -389,18 +380,17 @@ bool hl_ctrl_device_operational(struct hl_device *hdev,
static void print_idle_status_mask(struct hl_device *hdev, const char *message, static void print_idle_status_mask(struct hl_device *hdev, const char *message,
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE]) u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE])
{ {
u32 pad_width[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {}; if (idle_mask[3])
dev_err(hdev->dev, "%s (mask %#llx_%016llx_%016llx_%016llx)\n",
BUILD_BUG_ON(HL_BUSY_ENGINES_MASK_EXT_SIZE != 4); message, idle_mask[3], idle_mask[2], idle_mask[1], idle_mask[0]);
else if (idle_mask[2])
pad_width[3] = idle_mask[3] ? 16 : 0; dev_err(hdev->dev, "%s (mask %#llx_%016llx_%016llx)\n",
pad_width[2] = idle_mask[2] || pad_width[3] ? 16 : 0; message, idle_mask[2], idle_mask[1], idle_mask[0]);
pad_width[1] = idle_mask[1] || pad_width[2] ? 16 : 0; else if (idle_mask[1])
pad_width[0] = idle_mask[0] || pad_width[1] ? 16 : 0; dev_err(hdev->dev, "%s (mask %#llx_%016llx)\n",
message, idle_mask[1], idle_mask[0]);
dev_err(hdev->dev, "%s (mask %0*llx_%0*llx_%0*llx_%0*llx)\n", else
message, pad_width[3], idle_mask[3], pad_width[2], idle_mask[2], dev_err(hdev->dev, "%s (mask %#llx)\n", message, idle_mask[0]);
pad_width[1], idle_mask[1], pad_width[0], idle_mask[0]);
} }
static void hpriv_release(struct kref *ref) static void hpriv_release(struct kref *ref)
@ -423,6 +413,9 @@ static void hpriv_release(struct kref *ref)
mutex_destroy(&hpriv->ctx_lock); mutex_destroy(&hpriv->ctx_lock);
mutex_destroy(&hpriv->restore_phase_mutex); mutex_destroy(&hpriv->restore_phase_mutex);
/* There should be no memory buffers at this point and handles IDR can be destroyed */
hl_mem_mgr_idr_destroy(&hpriv->mem_mgr);
/* Device should be reset if reset-upon-device-release is enabled, or if there is a pending /* Device should be reset if reset-upon-device-release is enabled, or if there is a pending
* reset that waits for device release. * reset that waits for device release.
*/ */
@ -492,6 +485,36 @@ int hl_hpriv_put(struct hl_fpriv *hpriv)
return kref_put(&hpriv->refcount, hpriv_release); return kref_put(&hpriv->refcount, hpriv_release);
} }
static void print_device_in_use_info(struct hl_device *hdev, const char *message)
{
u32 active_cs_num, dmabuf_export_cnt;
bool unknown_reason = true;
char buf[128];
size_t size;
int offset;
size = sizeof(buf);
offset = 0;
active_cs_num = hl_get_active_cs_num(hdev);
if (active_cs_num) {
unknown_reason = false;
offset += scnprintf(buf + offset, size - offset, " [%u active CS]", active_cs_num);
}
dmabuf_export_cnt = atomic_read(&hdev->dmabuf_export_cnt);
if (dmabuf_export_cnt) {
unknown_reason = false;
offset += scnprintf(buf + offset, size - offset, " [%u exported dma-buf]",
dmabuf_export_cnt);
}
if (unknown_reason)
scnprintf(buf + offset, size - offset, " [unknown reason]");
dev_notice(hdev->dev, "%s%s\n", message, buf);
}
/* /*
* hl_device_release - release function for habanalabs device * hl_device_release - release function for habanalabs device
* *
@ -514,17 +537,20 @@ static int hl_device_release(struct inode *inode, struct file *filp)
} }
hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr); hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr);
/* Memory buffers might be still in use at this point and thus the handles IDR destruction
* is postponed to hpriv_release().
*/
hl_mem_mgr_fini(&hpriv->mem_mgr); hl_mem_mgr_fini(&hpriv->mem_mgr);
hdev->compute_ctx_in_release = 1; hdev->compute_ctx_in_release = 1;
if (!hl_hpriv_put(hpriv)) { if (!hl_hpriv_put(hpriv)) {
dev_notice(hdev->dev, "User process closed FD but device still in use\n"); print_device_in_use_info(hdev, "User process closed FD but device still in use");
hl_device_reset(hdev, HL_DRV_RESET_HARD); hl_device_reset(hdev, HL_DRV_RESET_HARD);
} }
hdev->last_open_session_duration_jif = hdev->last_open_session_duration_jif = jiffies - hdev->last_successful_open_jif;
jiffies - hdev->last_successful_open_jif;
return 0; return 0;
} }
@ -617,7 +643,7 @@ static void device_release_func(struct device *dev)
* device_init_cdev - Initialize cdev and device for habanalabs device * device_init_cdev - Initialize cdev and device for habanalabs device
* *
* @hdev: pointer to habanalabs device structure * @hdev: pointer to habanalabs device structure
* @hclass: pointer to the class object of the device * @class: pointer to the class object of the device
* @minor: minor number of the specific device * @minor: minor number of the specific device
* @fpos: file operations to install for this device * @fpos: file operations to install for this device
* @name: name of the device as it will appear in the filesystem * @name: name of the device as it will appear in the filesystem
@ -626,7 +652,7 @@ static void device_release_func(struct device *dev)
* *
* Initialize a cdev and a Linux device for habanalabs's device. * Initialize a cdev and a Linux device for habanalabs's device.
*/ */
static int device_init_cdev(struct hl_device *hdev, struct class *hclass, static int device_init_cdev(struct hl_device *hdev, struct class *class,
int minor, const struct file_operations *fops, int minor, const struct file_operations *fops,
char *name, struct cdev *cdev, char *name, struct cdev *cdev,
struct device **dev) struct device **dev)
@ -640,7 +666,7 @@ static int device_init_cdev(struct hl_device *hdev, struct class *hclass,
device_initialize(*dev); device_initialize(*dev);
(*dev)->devt = MKDEV(hdev->major, minor); (*dev)->devt = MKDEV(hdev->major, minor);
(*dev)->class = hclass; (*dev)->class = class;
(*dev)->release = device_release_func; (*dev)->release = device_release_func;
dev_set_drvdata(*dev, hdev); dev_set_drvdata(*dev, hdev);
dev_set_name(*dev, "%s", name); dev_set_name(*dev, "%s", name);
@ -733,14 +759,14 @@ static void device_hard_reset_pending(struct work_struct *work)
static void device_release_watchdog_func(struct work_struct *work) static void device_release_watchdog_func(struct work_struct *work)
{ {
struct hl_device_reset_work *device_release_watchdog_work = struct hl_device_reset_work *watchdog_work =
container_of(work, struct hl_device_reset_work, reset_work.work); container_of(work, struct hl_device_reset_work, reset_work.work);
struct hl_device *hdev = device_release_watchdog_work->hdev; struct hl_device *hdev = watchdog_work->hdev;
u32 flags; u32 flags;
dev_dbg(hdev->dev, "Device wasn't released in time. Initiate device reset.\n"); dev_dbg(hdev->dev, "Device wasn't released in time. Initiate hard-reset.\n");
flags = device_release_watchdog_work->flags | HL_DRV_RESET_FROM_WD_THR; flags = watchdog_work->flags | HL_DRV_RESET_HARD | HL_DRV_RESET_FROM_WD_THR;
hl_device_reset(hdev, flags); hl_device_reset(hdev, flags);
} }
@ -805,7 +831,7 @@ static int device_early_init(struct hl_device *hdev)
} }
for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) { for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) {
snprintf(workq_name, 32, "hl-free-jobs-%u", (u32) i); snprintf(workq_name, 32, "hl%u-free-jobs-%u", hdev->cdev_idx, (u32) i);
hdev->cq_wq[i] = create_singlethread_workqueue(workq_name); hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
if (hdev->cq_wq[i] == NULL) { if (hdev->cq_wq[i] == NULL) {
dev_err(hdev->dev, "Failed to allocate CQ workqueue\n"); dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
@ -814,14 +840,16 @@ static int device_early_init(struct hl_device *hdev)
} }
} }
hdev->eq_wq = create_singlethread_workqueue("hl-events"); snprintf(workq_name, 32, "hl%u-events", hdev->cdev_idx);
hdev->eq_wq = create_singlethread_workqueue(workq_name);
if (hdev->eq_wq == NULL) { if (hdev->eq_wq == NULL) {
dev_err(hdev->dev, "Failed to allocate EQ workqueue\n"); dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
rc = -ENOMEM; rc = -ENOMEM;
goto free_cq_wq; goto free_cq_wq;
} }
hdev->cs_cmplt_wq = alloc_workqueue("hl-cs-completions", WQ_UNBOUND, 0); snprintf(workq_name, 32, "hl%u-cs-completions", hdev->cdev_idx);
hdev->cs_cmplt_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
if (!hdev->cs_cmplt_wq) { if (!hdev->cs_cmplt_wq) {
dev_err(hdev->dev, dev_err(hdev->dev,
"Failed to allocate CS completions workqueue\n"); "Failed to allocate CS completions workqueue\n");
@ -829,7 +857,8 @@ static int device_early_init(struct hl_device *hdev)
goto free_eq_wq; goto free_eq_wq;
} }
hdev->ts_free_obj_wq = alloc_workqueue("hl-ts-free-obj", WQ_UNBOUND, 0); snprintf(workq_name, 32, "hl%u-ts-free-obj", hdev->cdev_idx);
hdev->ts_free_obj_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
if (!hdev->ts_free_obj_wq) { if (!hdev->ts_free_obj_wq) {
dev_err(hdev->dev, dev_err(hdev->dev,
"Failed to allocate Timestamp registration free workqueue\n"); "Failed to allocate Timestamp registration free workqueue\n");
@ -837,15 +866,15 @@ static int device_early_init(struct hl_device *hdev)
goto free_cs_cmplt_wq; goto free_cs_cmplt_wq;
} }
hdev->prefetch_wq = alloc_workqueue("hl-prefetch", WQ_UNBOUND, 0); snprintf(workq_name, 32, "hl%u-prefetch", hdev->cdev_idx);
hdev->prefetch_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
if (!hdev->prefetch_wq) { if (!hdev->prefetch_wq) {
dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n"); dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n");
rc = -ENOMEM; rc = -ENOMEM;
goto free_ts_free_wq; goto free_ts_free_wq;
} }
hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info), hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info), GFP_KERNEL);
GFP_KERNEL);
if (!hdev->hl_chip_info) { if (!hdev->hl_chip_info) {
rc = -ENOMEM; rc = -ENOMEM;
goto free_prefetch_wq; goto free_prefetch_wq;
@ -857,7 +886,8 @@ static int device_early_init(struct hl_device *hdev)
hl_mem_mgr_init(hdev->dev, &hdev->kernel_mem_mgr); hl_mem_mgr_init(hdev->dev, &hdev->kernel_mem_mgr);
hdev->reset_wq = create_singlethread_workqueue("hl_device_reset"); snprintf(workq_name, 32, "hl%u_device_reset", hdev->cdev_idx);
hdev->reset_wq = create_singlethread_workqueue(workq_name);
if (!hdev->reset_wq) { if (!hdev->reset_wq) {
rc = -ENOMEM; rc = -ENOMEM;
dev_err(hdev->dev, "Failed to create device reset WQ\n"); dev_err(hdev->dev, "Failed to create device reset WQ\n");
@ -887,6 +917,7 @@ static int device_early_init(struct hl_device *hdev)
free_cb_mgr: free_cb_mgr:
hl_mem_mgr_fini(&hdev->kernel_mem_mgr); hl_mem_mgr_fini(&hdev->kernel_mem_mgr);
hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr);
free_chip_info: free_chip_info:
kfree(hdev->hl_chip_info); kfree(hdev->hl_chip_info);
free_prefetch_wq: free_prefetch_wq:
@ -930,6 +961,7 @@ static void device_early_fini(struct hl_device *hdev)
mutex_destroy(&hdev->clk_throttling.lock); mutex_destroy(&hdev->clk_throttling.lock);
hl_mem_mgr_fini(&hdev->kernel_mem_mgr); hl_mem_mgr_fini(&hdev->kernel_mem_mgr);
hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr);
kfree(hdev->hl_chip_info); kfree(hdev->hl_chip_info);
@ -953,6 +985,8 @@ static void hl_device_heartbeat(struct work_struct *work)
{ {
struct hl_device *hdev = container_of(work, struct hl_device, struct hl_device *hdev = container_of(work, struct hl_device,
work_heartbeat.work); work_heartbeat.work);
struct hl_info_fw_err_info info = {0};
u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
if (!hl_device_operational(hdev, NULL)) if (!hl_device_operational(hdev, NULL))
goto reschedule; goto reschedule;
@ -963,7 +997,10 @@ static void hl_device_heartbeat(struct work_struct *work)
if (hl_device_operational(hdev, NULL)) if (hl_device_operational(hdev, NULL))
dev_err(hdev->dev, "Device heartbeat failed!\n"); dev_err(hdev->dev, "Device heartbeat failed!\n");
hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT); info.err_type = HL_INFO_FW_HEARTBEAT_ERR;
info.event_mask = &event_mask;
hl_handle_fw_err(hdev, &info);
hl_device_cond_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT, event_mask);
return; return;
@ -1402,7 +1439,7 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
*/ */
if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0))
dev_warn(hdev->dev, dev_warn(hdev->dev,
"Failed to disable PCI access by F/W\n"); "Failed to disable FW's PCI access\n");
} }
} }
@ -1424,12 +1461,11 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
*/ */
int hl_device_reset(struct hl_device *hdev, u32 flags) int hl_device_reset(struct hl_device *hdev, u32 flags)
{ {
bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false, bool hard_reset, from_hard_reset_thread, fw_reset, reset_upon_device_release,
reset_upon_device_release = false, schedule_hard_reset = false, schedule_hard_reset = false, delay_reset, from_dev_release, from_watchdog_thread;
delay_reset, from_dev_release, from_watchdog_thread;
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
struct hl_ctx *ctx; struct hl_ctx *ctx;
int i, rc; int i, rc, hw_fini_rc;
if (!hdev->init_done) { if (!hdev->init_done) {
dev_err(hdev->dev, "Can't reset before initialization is done\n"); dev_err(hdev->dev, "Can't reset before initialization is done\n");
@ -1442,6 +1478,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
from_dev_release = !!(flags & HL_DRV_RESET_DEV_RELEASE); from_dev_release = !!(flags & HL_DRV_RESET_DEV_RELEASE);
delay_reset = !!(flags & HL_DRV_RESET_DELAY); delay_reset = !!(flags & HL_DRV_RESET_DELAY);
from_watchdog_thread = !!(flags & HL_DRV_RESET_FROM_WD_THR); from_watchdog_thread = !!(flags & HL_DRV_RESET_FROM_WD_THR);
reset_upon_device_release = hdev->reset_upon_device_release && from_dev_release;
if (!hard_reset && (hl_device_status(hdev) == HL_DEVICE_STATUS_MALFUNCTION)) { if (!hard_reset && (hl_device_status(hdev) == HL_DEVICE_STATUS_MALFUNCTION)) {
dev_dbg(hdev->dev, "soft-reset isn't supported on a malfunctioning device\n"); dev_dbg(hdev->dev, "soft-reset isn't supported on a malfunctioning device\n");
@ -1449,30 +1486,26 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
} }
if (!hard_reset && !hdev->asic_prop.supports_compute_reset) { if (!hard_reset && !hdev->asic_prop.supports_compute_reset) {
hard_instead_soft = true; dev_dbg(hdev->dev, "asic doesn't support compute reset - do hard-reset instead\n");
hard_reset = true; hard_reset = true;
} }
if (hdev->reset_upon_device_release && from_dev_release) { if (reset_upon_device_release) {
if (hard_reset) { if (hard_reset) {
dev_crit(hdev->dev, dev_crit(hdev->dev,
"Aborting reset because hard-reset is mutually exclusive with reset-on-device-release\n"); "Aborting reset because hard-reset is mutually exclusive with reset-on-device-release\n");
return -EINVAL; return -EINVAL;
} }
reset_upon_device_release = true;
goto do_reset; goto do_reset;
} }
if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) { if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) {
hard_instead_soft = true; dev_dbg(hdev->dev,
"asic doesn't allow inference soft reset - do hard-reset instead\n");
hard_reset = true; hard_reset = true;
} }
if (hard_instead_soft)
dev_dbg(hdev->dev, "Doing hard-reset instead of compute reset\n");
do_reset: do_reset:
/* Re-entry of reset thread */ /* Re-entry of reset thread */
if (from_hard_reset_thread && hdev->process_kill_trial_cnt) if (from_hard_reset_thread && hdev->process_kill_trial_cnt)
@ -1480,14 +1513,14 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
/* /*
* Prevent concurrency in this function - only one reset should be * Prevent concurrency in this function - only one reset should be
* done at any given time. Only need to perform this if we didn't * done at any given time. We need to perform this only if we didn't
* get from the dedicated hard reset thread * get here from a dedicated hard reset thread.
*/ */
if (!from_hard_reset_thread) { if (!from_hard_reset_thread) {
/* Block future CS/VM/JOB completion operations */ /* Block future CS/VM/JOB completion operations */
spin_lock(&hdev->reset_info.lock); spin_lock(&hdev->reset_info.lock);
if (hdev->reset_info.in_reset) { if (hdev->reset_info.in_reset) {
/* We only allow scheduling of a hard reset during compute reset */ /* We allow scheduling of a hard reset only during a compute reset */
if (hard_reset && hdev->reset_info.in_compute_reset) if (hard_reset && hdev->reset_info.in_compute_reset)
hdev->reset_info.hard_reset_schedule_flags = flags; hdev->reset_info.hard_reset_schedule_flags = flags;
spin_unlock(&hdev->reset_info.lock); spin_unlock(&hdev->reset_info.lock);
@ -1505,15 +1538,17 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
/* Cancel the device release watchdog work if required. /* Cancel the device release watchdog work if required.
* In case of reset-upon-device-release while the release watchdog work is * In case of reset-upon-device-release while the release watchdog work is
* scheduled, do hard-reset instead of compute-reset. * scheduled due to a hard-reset, do hard-reset instead of compute-reset.
*/ */
if ((hard_reset || from_dev_release) && hdev->reset_info.watchdog_active) { if ((hard_reset || from_dev_release) && hdev->reset_info.watchdog_active) {
struct hl_device_reset_work *watchdog_work =
&hdev->device_release_watchdog_work;
hdev->reset_info.watchdog_active = 0; hdev->reset_info.watchdog_active = 0;
if (!from_watchdog_thread) if (!from_watchdog_thread)
cancel_delayed_work_sync( cancel_delayed_work_sync(&watchdog_work->reset_work);
&hdev->device_release_watchdog_work.reset_work);
if (from_dev_release) { if (from_dev_release && (watchdog_work->flags & HL_DRV_RESET_HARD)) {
hdev->reset_info.in_compute_reset = 0; hdev->reset_info.in_compute_reset = 0;
flags |= HL_DRV_RESET_HARD; flags |= HL_DRV_RESET_HARD;
flags &= ~HL_DRV_RESET_DEV_RELEASE; flags &= ~HL_DRV_RESET_DEV_RELEASE;
@ -1524,6 +1559,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
if (delay_reset) if (delay_reset)
usleep_range(HL_RESET_DELAY_USEC, HL_RESET_DELAY_USEC << 1); usleep_range(HL_RESET_DELAY_USEC, HL_RESET_DELAY_USEC << 1);
escalate_reset_flow:
handle_reset_trigger(hdev, flags); handle_reset_trigger(hdev, flags);
/* This also blocks future CS/VM/JOB completion operations */ /* This also blocks future CS/VM/JOB completion operations */
@ -1539,7 +1575,6 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
dev_dbg(hdev->dev, "Going to reset engines of inference device\n"); dev_dbg(hdev->dev, "Going to reset engines of inference device\n");
} }
again:
if ((hard_reset) && (!from_hard_reset_thread)) { if ((hard_reset) && (!from_hard_reset_thread)) {
hdev->reset_info.hard_reset_pending = true; hdev->reset_info.hard_reset_pending = true;
@ -1592,7 +1627,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
} }
/* Reset the H/W. It will be in idle state after this returns */ /* Reset the H/W. It will be in idle state after this returns */
hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset); hw_fini_rc = hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
if (hard_reset) { if (hard_reset) {
hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE; hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
@ -1619,6 +1654,10 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
hl_ctx_put(ctx); hl_ctx_put(ctx);
} }
if (hw_fini_rc) {
rc = hw_fini_rc;
goto out_err;
}
/* Finished tear-down, starting to re-initialize */ /* Finished tear-down, starting to re-initialize */
if (hard_reset) { if (hard_reset) {
@ -1787,7 +1826,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
hdev->disabled = true; hdev->disabled = true;
hard_reset = true; hard_reset = true;
handle_reset_trigger(hdev, flags); handle_reset_trigger(hdev, flags);
goto again; goto escalate_reset_flow;
} }
} }
@ -1804,20 +1843,19 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
"%s Failed to reset! Device is NOT usable\n", "%s Failed to reset! Device is NOT usable\n",
dev_name(&(hdev)->pdev->dev)); dev_name(&(hdev)->pdev->dev));
hdev->reset_info.hard_reset_cnt++; hdev->reset_info.hard_reset_cnt++;
} else if (reset_upon_device_release) {
spin_unlock(&hdev->reset_info.lock);
dev_err(hdev->dev, "Failed to reset device after user release\n");
flags |= HL_DRV_RESET_HARD;
flags &= ~HL_DRV_RESET_DEV_RELEASE;
hard_reset = true;
goto again;
} else { } else {
if (reset_upon_device_release) {
dev_err(hdev->dev, "Failed to reset device after user release\n");
flags &= ~HL_DRV_RESET_DEV_RELEASE;
} else {
dev_err(hdev->dev, "Failed to do compute reset\n");
hdev->reset_info.compute_reset_cnt++;
}
spin_unlock(&hdev->reset_info.lock); spin_unlock(&hdev->reset_info.lock);
dev_err(hdev->dev, "Failed to do compute reset\n");
hdev->reset_info.compute_reset_cnt++;
flags |= HL_DRV_RESET_HARD; flags |= HL_DRV_RESET_HARD;
hard_reset = true; hard_reset = true;
goto again; goto escalate_reset_flow;
} }
hdev->reset_info.in_reset = 0; hdev->reset_info.in_reset = 0;
@ -1840,10 +1878,6 @@ int hl_device_cond_reset(struct hl_device *hdev, u32 flags, u64 event_mask)
{ {
struct hl_ctx *ctx = NULL; struct hl_ctx *ctx = NULL;
/* Device release watchdog is only for hard reset */
if (!(flags & HL_DRV_RESET_HARD) && hdev->asic_prop.allow_inference_soft_reset)
goto device_reset;
/* F/W reset cannot be postponed */ /* F/W reset cannot be postponed */
if (flags & HL_DRV_RESET_BYPASS_REQ_TO_FW) if (flags & HL_DRV_RESET_BYPASS_REQ_TO_FW)
goto device_reset; goto device_reset;
@ -1871,7 +1905,7 @@ int hl_device_cond_reset(struct hl_device *hdev, u32 flags, u64 event_mask)
goto out; goto out;
hdev->device_release_watchdog_work.flags = flags; hdev->device_release_watchdog_work.flags = flags;
dev_dbg(hdev->dev, "Device is going to be reset in %u sec unless being released\n", dev_dbg(hdev->dev, "Device is going to be hard-reset in %u sec unless being released\n",
hdev->device_release_watchdog_timeout_sec); hdev->device_release_watchdog_timeout_sec);
schedule_delayed_work(&hdev->device_release_watchdog_work.reset_work, schedule_delayed_work(&hdev->device_release_watchdog_work.reset_work,
msecs_to_jiffies(hdev->device_release_watchdog_timeout_sec * 1000)); msecs_to_jiffies(hdev->device_release_watchdog_timeout_sec * 1000));
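The watchdog work scheduled above is what implements the graceful compute-reset flow: the reset is deferred for a grace period and cancelled if the user releases the device in time. As a minimal, self-contained sketch of that delayed-work pattern (structure names and the reset callback are illustrative, not the driver's actual implementation), it could look like this:

#include <linux/jiffies.h>
#include <linux/workqueue.h>

/* Hedged sketch of the graceful-reset idea: arm a delayed work item that
 * hard-resets the device after a grace period, and cancel it if the user
 * releases the device first. Names are illustrative.
 */
struct example_graceful_reset {
	struct delayed_work reset_work;
	unsigned int timeout_sec;
};

static void example_reset_fn(struct work_struct *work)
{
	/* Grace period expired without the device being released;
	 * perform the pending hard-reset here.
	 */
}

static void example_arm_graceful_reset(struct example_graceful_reset *gr)
{
	INIT_DELAYED_WORK(&gr->reset_work, example_reset_fn);
	schedule_delayed_work(&gr->reset_work,
			      msecs_to_jiffies(gr->timeout_sec * 1000));
}

static void example_cancel_graceful_reset(struct example_graceful_reset *gr)
{
	/* The user released the device in time; drop the pending reset. */
	cancel_delayed_work_sync(&gr->reset_work);
}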
@ -1939,6 +1973,51 @@ void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask)
mutex_unlock(&hdev->fpriv_ctrl_list_lock); mutex_unlock(&hdev->fpriv_ctrl_list_lock);
} }
static int create_cdev(struct hl_device *hdev)
{
char *name;
int rc;
hdev->cdev_idx = hdev->id / 2;
name = kasprintf(GFP_KERNEL, "hl%d", hdev->cdev_idx);
if (!name) {
rc = -ENOMEM;
goto out_err;
}
/* Initialize cdev and device structures */
rc = device_init_cdev(hdev, hdev->hclass, hdev->id, &hl_ops, name,
&hdev->cdev, &hdev->dev);
kfree(name);
if (rc)
goto out_err;
name = kasprintf(GFP_KERNEL, "hl_controlD%d", hdev->cdev_idx);
if (!name) {
rc = -ENOMEM;
goto free_dev;
}
/* Initialize cdev and device structures for control device */
rc = device_init_cdev(hdev, hdev->hclass, hdev->id_control, &hl_ctrl_ops,
name, &hdev->cdev_ctrl, &hdev->dev_ctrl);
kfree(name);
if (rc)
goto free_dev;
return 0;
free_dev:
put_device(hdev->dev);
out_err:
return rc;
}
/* /*
* hl_device_init - main initialization function for habanalabs device * hl_device_init - main initialization function for habanalabs device
* *
@ -1948,48 +2027,19 @@ void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask)
* ASIC specific initialization functions. Finally, create the cdev and the * ASIC specific initialization functions. Finally, create the cdev and the
* Linux device to expose it to the user * Linux device to expose it to the user
*/ */
int hl_device_init(struct hl_device *hdev, struct class *hclass) int hl_device_init(struct hl_device *hdev)
{ {
int i, rc, cq_cnt, user_interrupt_cnt, cq_ready_cnt; int i, rc, cq_cnt, user_interrupt_cnt, cq_ready_cnt;
char *name;
bool add_cdev_sysfs_on_err = false; bool add_cdev_sysfs_on_err = false;
hdev->cdev_idx = hdev->id / 2; rc = create_cdev(hdev);
name = kasprintf(GFP_KERNEL, "hl%d", hdev->cdev_idx);
if (!name) {
rc = -ENOMEM;
goto out_disabled;
}
/* Initialize cdev and device structures */
rc = device_init_cdev(hdev, hclass, hdev->id, &hl_ops, name,
&hdev->cdev, &hdev->dev);
kfree(name);
if (rc) if (rc)
goto out_disabled; goto out_disabled;
name = kasprintf(GFP_KERNEL, "hl_controlD%d", hdev->cdev_idx);
if (!name) {
rc = -ENOMEM;
goto free_dev;
}
/* Initialize cdev and device structures for control device */
rc = device_init_cdev(hdev, hclass, hdev->id_control, &hl_ctrl_ops,
name, &hdev->cdev_ctrl, &hdev->dev_ctrl);
kfree(name);
if (rc)
goto free_dev;
/* Initialize ASIC function pointers and perform early init */ /* Initialize ASIC function pointers and perform early init */
rc = device_early_init(hdev); rc = device_early_init(hdev);
if (rc) if (rc)
goto free_dev_ctrl; goto free_dev;
user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count + user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count +
hdev->asic_prop.user_interrupt_count; hdev->asic_prop.user_interrupt_count;
@ -2241,9 +2291,8 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
kfree(hdev->user_interrupt); kfree(hdev->user_interrupt);
early_fini: early_fini:
device_early_fini(hdev); device_early_fini(hdev);
free_dev_ctrl:
put_device(hdev->dev_ctrl);
free_dev: free_dev:
put_device(hdev->dev_ctrl);
put_device(hdev->dev); put_device(hdev->dev);
out_disabled: out_disabled:
hdev->disabled = true; hdev->disabled = true;
@ -2364,7 +2413,9 @@ void hl_device_fini(struct hl_device *hdev)
hl_cb_pool_fini(hdev); hl_cb_pool_fini(hdev);
/* Reset the H/W. It will be in idle state after this returns */ /* Reset the H/W. It will be in idle state after this returns */
hdev->asic_funcs->hw_fini(hdev, true, false); rc = hdev->asic_funcs->hw_fini(hdev, true, false);
if (rc)
dev_err(hdev->dev, "hw_fini failed in device fini while removing device %d\n", rc);
hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE; hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
@ -2566,3 +2617,49 @@ void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_
if (event_mask) if (event_mask)
*event_mask |= HL_NOTIFIER_EVENT_PAGE_FAULT; *event_mask |= HL_NOTIFIER_EVENT_PAGE_FAULT;
} }
static void hl_capture_hw_err(struct hl_device *hdev, u16 event_id)
{
struct hw_err_info *info = &hdev->captured_err_info.hw_err;
/* Capture only the first HW err */
if (atomic_cmpxchg(&info->event_detected, 0, 1))
return;
info->event.timestamp = ktime_to_ns(ktime_get());
info->event.event_id = event_id;
info->event_info_available = true;
}
void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_mask)
{
hl_capture_hw_err(hdev, event_id);
if (event_mask)
*event_mask |= HL_NOTIFIER_EVENT_CRITICL_HW_ERR;
}
static void hl_capture_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *fw_info)
{
struct fw_err_info *info = &hdev->captured_err_info.fw_err;
/* Capture only the first FW error */
if (atomic_cmpxchg(&info->event_detected, 0, 1))
return;
info->event.timestamp = ktime_to_ns(ktime_get());
info->event.err_type = fw_info->err_type;
if (fw_info->err_type == HL_INFO_FW_REPORTED_ERR)
info->event.event_id = fw_info->event_id;
info->event_info_available = true;
}
void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info)
{
hl_capture_fw_err(hdev, info);
if (info->event_mask)
*info->event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR;
}
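Both capture helpers above record only the first error: an atomic compare-and-exchange latches the event so concurrent or later reports cannot overwrite the snapshot that the INFO ioctl later exposes. A minimal sketch of that first-error latch (illustrative names, assuming only the kernel atomic and ktime APIs) follows:

#include <linux/atomic.h>
#include <linux/ktime.h>
#include <linux/types.h>

/* Hedged sketch of the first-error latch; not the driver's actual code. */
struct example_err_latch {
	atomic_t detected;	/* 0 until the first error is captured */
	u64 timestamp_ns;	/* time of the first captured error */
	u16 event_id;		/* id of the first captured error */
	bool info_available;	/* readers may now consume the snapshot */
};

static void example_capture_first_err(struct example_err_latch *latch, u16 event_id)
{
	/* atomic_cmpxchg() returns the previous value: non-zero means some
	 * other path already captured an event, so keep that first snapshot.
	 */
	if (atomic_cmpxchg(&latch->detected, 0, 1))
		return;

	latch->timestamp_ns = ktime_to_ns(ktime_get());
	latch->event_id = event_id;
	latch->info_available = true;
}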


@ -3152,7 +3152,7 @@ int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_in
int hl_fw_send_generic_request(struct hl_device *hdev, enum hl_passthrough_type sub_opcode, int hl_fw_send_generic_request(struct hl_device *hdev, enum hl_passthrough_type sub_opcode,
dma_addr_t buff, u32 *size) dma_addr_t buff, u32 *size)
{ {
struct cpucp_packet pkt = {0}; struct cpucp_packet pkt = {};
u64 result; u64 result;
int rc = 0; int rc = 0;


@ -155,18 +155,12 @@ enum hl_mmu_enablement {
#define hl_asic_dma_alloc_coherent(hdev, size, dma_handle, flags) \ #define hl_asic_dma_alloc_coherent(hdev, size, dma_handle, flags) \
hl_asic_dma_alloc_coherent_caller(hdev, size, dma_handle, flags, __func__) hl_asic_dma_alloc_coherent_caller(hdev, size, dma_handle, flags, __func__)
#define hl_cpu_accessible_dma_pool_alloc(hdev, size, dma_handle) \
hl_cpu_accessible_dma_pool_alloc_caller(hdev, size, dma_handle, __func__)
#define hl_asic_dma_pool_zalloc(hdev, size, mem_flags, dma_handle) \ #define hl_asic_dma_pool_zalloc(hdev, size, mem_flags, dma_handle) \
hl_asic_dma_pool_zalloc_caller(hdev, size, mem_flags, dma_handle, __func__) hl_asic_dma_pool_zalloc_caller(hdev, size, mem_flags, dma_handle, __func__)
#define hl_asic_dma_free_coherent(hdev, size, cpu_addr, dma_handle) \ #define hl_asic_dma_free_coherent(hdev, size, cpu_addr, dma_handle) \
hl_asic_dma_free_coherent_caller(hdev, size, cpu_addr, dma_handle, __func__) hl_asic_dma_free_coherent_caller(hdev, size, cpu_addr, dma_handle, __func__)
#define hl_cpu_accessible_dma_pool_free(hdev, size, vaddr) \
hl_cpu_accessible_dma_pool_free_caller(hdev, size, vaddr, __func__)
#define hl_asic_dma_pool_free(hdev, vaddr, dma_addr) \ #define hl_asic_dma_pool_free(hdev, vaddr, dma_addr) \
hl_asic_dma_pool_free_caller(hdev, vaddr, dma_addr, __func__) hl_asic_dma_pool_free_caller(hdev, vaddr, dma_addr, __func__)
@ -378,6 +372,7 @@ enum hl_cs_type {
CS_RESERVE_SIGNALS, CS_RESERVE_SIGNALS,
CS_UNRESERVE_SIGNALS, CS_UNRESERVE_SIGNALS,
CS_TYPE_ENGINE_CORE, CS_TYPE_ENGINE_CORE,
CS_TYPE_ENGINES,
CS_TYPE_FLUSH_PCI_HBW_WRITES, CS_TYPE_FLUSH_PCI_HBW_WRITES,
}; };
@ -592,6 +587,8 @@ struct hl_hints_range {
* @host_base_address: host physical start address for host DMA from device * @host_base_address: host physical start address for host DMA from device
* @host_end_address: host physical end address for host DMA from device * @host_end_address: host physical end address for host DMA from device
* @max_freq_value: current max clk frequency. * @max_freq_value: current max clk frequency.
* @engine_core_interrupt_reg_addr: interrupt register address for engine core to use
* in order to raise events toward FW.
* @clk_pll_index: clock PLL index that specify which PLL determines the clock * @clk_pll_index: clock PLL index that specify which PLL determines the clock
* we display to the user * we display to the user
* @mmu_pgt_size: MMU page tables total size. * @mmu_pgt_size: MMU page tables total size.
@ -612,8 +609,8 @@ struct hl_hints_range {
* @cb_pool_cb_cnt: number of CBs in the CB pool. * @cb_pool_cb_cnt: number of CBs in the CB pool.
* @cb_pool_cb_size: size of each CB in the CB pool. * @cb_pool_cb_size: size of each CB in the CB pool.
* @decoder_enabled_mask: which decoders are enabled. * @decoder_enabled_mask: which decoders are enabled.
* @decoder_binning_mask: which decoders are binned, 0 means usable and 1 * @decoder_binning_mask: which decoders are binned, 0 means usable and 1 means binned.
* means binned (at most one binned decoder per dcore). * @rotator_enabled_mask: which rotators are enabled.
* @edma_enabled_mask: which EDMAs are enabled. * @edma_enabled_mask: which EDMAs are enabled.
* @edma_binning_mask: which EDMAs are binned, 0 means usable and 1 means * @edma_binning_mask: which EDMAs are binned, 0 means usable and 1 means
* binned (at most one binned DMA). * binned (at most one binned DMA).
@ -648,7 +645,8 @@ struct hl_hints_range {
* which the property supports_user_set_page_size is true * which the property supports_user_set_page_size is true
* (i.e. the DRAM supports multiple page sizes), otherwise * (i.e. the DRAM supports multiple page sizes), otherwise
* it will shall be equal to dram_page_size. * it will shall be equal to dram_page_size.
* @num_engine_cores: number of engine cpu cores * @num_engine_cores: number of engine cpu cores.
* @max_num_of_engines: maximum number of all engines in the ASIC.
* @num_of_special_blocks: special_blocks array size. * @num_of_special_blocks: special_blocks array size.
* @glbl_err_cause_num: global err cause number. * @glbl_err_cause_num: global err cause number.
* @hbw_flush_reg: register to read to generate HBW flush. value of 0 means HBW flush is * @hbw_flush_reg: register to read to generate HBW flush. value of 0 means HBW flush is
@ -663,6 +661,8 @@ struct hl_hints_range {
* @first_available_cq: first available CQ for the user. * @first_available_cq: first available CQ for the user.
* @user_interrupt_count: number of user interrupts. * @user_interrupt_count: number of user interrupts.
* @user_dec_intr_count: number of decoder interrupts exposed to user. * @user_dec_intr_count: number of decoder interrupts exposed to user.
* @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host.
* @unexpected_user_error_interrupt_id: interrupt id used to indicate an unexpected user error.
* @cache_line_size: device cache line size. * @cache_line_size: device cache line size.
* @server_type: Server type that the ASIC is currently installed in. * @server_type: Server type that the ASIC is currently installed in.
* The value is according to enum hl_server_type in uapi file. * The value is according to enum hl_server_type in uapi file.
@ -698,6 +698,7 @@ struct hl_hints_range {
* @supports_user_set_page_size: true if user can set the allocation page size. * @supports_user_set_page_size: true if user can set the allocation page size.
* @dma_mask: the dma mask to be set for this device * @dma_mask: the dma mask to be set for this device
* @supports_advanced_cpucp_rc: true if new cpucp opcodes are supported. * @supports_advanced_cpucp_rc: true if new cpucp opcodes are supported.
* @supports_engine_modes: true if changing engines/engine_cores modes is supported.
*/ */
struct asic_fixed_properties { struct asic_fixed_properties {
struct hw_queue_properties *hw_queues_props; struct hw_queue_properties *hw_queues_props;
@ -739,6 +740,7 @@ struct asic_fixed_properties {
u64 host_base_address; u64 host_base_address;
u64 host_end_address; u64 host_end_address;
u64 max_freq_value; u64 max_freq_value;
u64 engine_core_interrupt_reg_addr;
u32 clk_pll_index; u32 clk_pll_index;
u32 mmu_pgt_size; u32 mmu_pgt_size;
u32 mmu_pte_size; u32 mmu_pte_size;
@ -759,6 +761,7 @@ struct asic_fixed_properties {
u32 cb_pool_cb_size; u32 cb_pool_cb_size;
u32 decoder_enabled_mask; u32 decoder_enabled_mask;
u32 decoder_binning_mask; u32 decoder_binning_mask;
u32 rotator_enabled_mask;
u32 edma_enabled_mask; u32 edma_enabled_mask;
u32 edma_binning_mask; u32 edma_binning_mask;
u32 max_pending_cs; u32 max_pending_cs;
@ -775,6 +778,7 @@ struct asic_fixed_properties {
u32 xbar_edge_enabled_mask; u32 xbar_edge_enabled_mask;
u32 device_mem_alloc_default_page_size; u32 device_mem_alloc_default_page_size;
u32 num_engine_cores; u32 num_engine_cores;
u32 max_num_of_engines;
u32 num_of_special_blocks; u32 num_of_special_blocks;
u32 glbl_err_cause_num; u32 glbl_err_cause_num;
u32 hbw_flush_reg; u32 hbw_flush_reg;
@ -788,6 +792,8 @@ struct asic_fixed_properties {
u16 first_available_cq[HL_MAX_DCORES]; u16 first_available_cq[HL_MAX_DCORES];
u16 user_interrupt_count; u16 user_interrupt_count;
u16 user_dec_intr_count; u16 user_dec_intr_count;
u16 tpc_interrupt_id;
u16 unexpected_user_error_interrupt_id;
u16 cache_line_size; u16 cache_line_size;
u16 server_type; u16 server_type;
u8 completion_queues_count; u8 completion_queues_count;
@ -811,6 +817,7 @@ struct asic_fixed_properties {
u8 supports_user_set_page_size; u8 supports_user_set_page_size;
u8 dma_mask; u8 dma_mask;
u8 supports_advanced_cpucp_rc; u8 supports_advanced_cpucp_rc;
u8 supports_engine_modes;
}; };
/** /**
@ -1096,6 +1103,8 @@ struct hl_cq {
enum hl_user_interrupt_type { enum hl_user_interrupt_type {
HL_USR_INTERRUPT_CQ = 0, HL_USR_INTERRUPT_CQ = 0,
HL_USR_INTERRUPT_DECODER, HL_USR_INTERRUPT_DECODER,
HL_USR_INTERRUPT_TPC,
HL_USR_INTERRUPT_UNEXPECTED
}; };
/** /**
@ -1104,6 +1113,7 @@ enum hl_user_interrupt_type {
* @type: user interrupt type * @type: user interrupt type
* @wait_list_head: head to the list of user threads pending on this interrupt * @wait_list_head: head to the list of user threads pending on this interrupt
* @wait_list_lock: protects wait_list_head * @wait_list_lock: protects wait_list_head
* @timestamp: last timestamp taken upon interrupt
* @interrupt_id: msix interrupt id * @interrupt_id: msix interrupt id
*/ */
struct hl_user_interrupt { struct hl_user_interrupt {
@ -1111,6 +1121,7 @@ struct hl_user_interrupt {
enum hl_user_interrupt_type type; enum hl_user_interrupt_type type;
struct list_head wait_list_head; struct list_head wait_list_head;
spinlock_t wait_list_lock; spinlock_t wait_list_lock;
ktime_t timestamp;
u32 interrupt_id; u32 interrupt_id;
}; };
@ -1562,6 +1573,7 @@ struct engines_data {
* @access_dev_mem: access device memory * @access_dev_mem: access device memory
* @set_dram_bar_base: set the base of the DRAM BAR * @set_dram_bar_base: set the base of the DRAM BAR
* @set_engine_cores: set a config command to engine cores * @set_engine_cores: set a config command to engine cores
* @set_engines: set a config command to user engines
* @send_device_activity: indication to FW about device availability * @send_device_activity: indication to FW about device availability
* @set_dram_properties: set DRAM related properties. * @set_dram_properties: set DRAM related properties.
* @set_binning_masks: set binning/enable masks for all relevant components. * @set_binning_masks: set binning/enable masks for all relevant components.
@ -1574,7 +1586,7 @@ struct hl_asic_funcs {
int (*sw_init)(struct hl_device *hdev); int (*sw_init)(struct hl_device *hdev);
int (*sw_fini)(struct hl_device *hdev); int (*sw_fini)(struct hl_device *hdev);
int (*hw_init)(struct hl_device *hdev); int (*hw_init)(struct hl_device *hdev);
void (*hw_fini)(struct hl_device *hdev, bool hard_reset, bool fw_reset); int (*hw_fini)(struct hl_device *hdev, bool hard_reset, bool fw_reset);
void (*halt_engines)(struct hl_device *hdev, bool hard_reset, bool fw_reset); void (*halt_engines)(struct hl_device *hdev, bool hard_reset, bool fw_reset);
int (*suspend)(struct hl_device *hdev); int (*suspend)(struct hl_device *hdev);
int (*resume)(struct hl_device *hdev); int (*resume)(struct hl_device *hdev);
@ -1701,6 +1713,8 @@ struct hl_asic_funcs {
u64 (*set_dram_bar_base)(struct hl_device *hdev, u64 addr); u64 (*set_dram_bar_base)(struct hl_device *hdev, u64 addr);
int (*set_engine_cores)(struct hl_device *hdev, u32 *core_ids, int (*set_engine_cores)(struct hl_device *hdev, u32 *core_ids,
u32 num_cores, u32 core_command); u32 num_cores, u32 core_command);
int (*set_engines)(struct hl_device *hdev, u32 *engine_ids,
u32 num_engines, u32 engine_command);
int (*send_device_activity)(struct hl_device *hdev, bool open); int (*send_device_activity)(struct hl_device *hdev, bool open);
int (*set_dram_properties)(struct hl_device *hdev); int (*set_dram_properties)(struct hl_device *hdev);
int (*set_binning_masks)(struct hl_device *hdev); int (*set_binning_masks)(struct hl_device *hdev);
@ -1824,7 +1838,7 @@ struct hl_cs_outcome_store {
* @hpriv: pointer to the private (Kernel Driver) data of the process (fd). * @hpriv: pointer to the private (Kernel Driver) data of the process (fd).
* @hdev: pointer to the device structure. * @hdev: pointer to the device structure.
* @refcount: reference counter for the context. Context is released only when * @refcount: reference counter for the context. Context is released only when
* this hits 0l. It is incremented on CS and CS_WAIT. * this hits 0. It is incremented on CS and CS_WAIT.
* @cs_pending: array of hl fence objects representing pending CS. * @cs_pending: array of hl fence objects representing pending CS.
* @outcome_store: storage data structure used to remember outcomes of completed * @outcome_store: storage data structure used to remember outcomes of completed
* command submissions for a long time after CS id wraparound. * command submissions for a long time after CS id wraparound.
@ -2318,7 +2332,7 @@ struct hl_debugfs_entry {
* @userptr_list: list of available userptrs (virtual memory chunk descriptor). * @userptr_list: list of available userptrs (virtual memory chunk descriptor).
* @userptr_spinlock: protects userptr_list. * @userptr_spinlock: protects userptr_list.
* @ctx_mem_hash_list: list of available contexts with MMU mappings. * @ctx_mem_hash_list: list of available contexts with MMU mappings.
* @ctx_mem_hash_spinlock: protects cb_list. * @ctx_mem_hash_mutex: protects list of available contexts with MMU mappings.
* @data_dma_blob_desc: data DMA descriptor of blob. * @data_dma_blob_desc: data DMA descriptor of blob.
* @mon_dump_blob_desc: monitor dump descriptor of blob. * @mon_dump_blob_desc: monitor dump descriptor of blob.
* @state_dump: data of the system states in case of a bad cs. * @state_dump: data of the system states in case of a bad cs.
@ -2349,7 +2363,7 @@ struct hl_dbg_device_entry {
struct list_head userptr_list; struct list_head userptr_list;
spinlock_t userptr_spinlock; spinlock_t userptr_spinlock;
struct list_head ctx_mem_hash_list; struct list_head ctx_mem_hash_list;
spinlock_t ctx_mem_hash_spinlock; struct mutex ctx_mem_hash_mutex;
struct debugfs_blob_wrapper data_dma_blob_desc; struct debugfs_blob_wrapper data_dma_blob_desc;
struct debugfs_blob_wrapper mon_dump_blob_desc; struct debugfs_blob_wrapper mon_dump_blob_desc;
char *state_dump[HL_STATE_DUMP_HIST_LEN]; char *state_dump[HL_STATE_DUMP_HIST_LEN];
@ -2974,8 +2988,8 @@ struct cs_timeout_info {
* @cq_addr: the address of the current handled command buffer * @cq_addr: the address of the current handled command buffer
* @cq_size: the size of the current handled command buffer * @cq_size: the size of the current handled command buffer
* @cb_addr_streams_len: num of streams - actual len of cb_addr_streams array. * @cb_addr_streams_len: num of streams - actual len of cb_addr_streams array.
* should be equal to 1 incase of undefined opcode * should be equal to 1 in case of undefined opcode
* in Upper-CP (specific stream) and equal to 4 incase * in Upper-CP (specific stream) and equal to 4 in case
* of undefined opcode in Lower-CP. * of undefined opcode in Lower-CP.
* @engine_id: engine-id that the error occurred on * @engine_id: engine-id that the error occurred on
* @stream_id: the stream id the error occurred on. In case the stream equals to * @stream_id: the stream id the error occurred on. In case the stream equals to
@ -3031,18 +3045,56 @@ struct razwi_info {
bool razwi_info_available; bool razwi_info_available;
}; };
/**
* struct hw_err_info - HW error information.
* @event: holds information on the event.
* @event_detected: if set to 1, a HW event was discovered for the
* first time after the driver finished booting up.
* Currently we assume that only fatal events (that require hard-reset) are
* reported, so we don't care about the others that might follow it;
* once changed to 1, it will remain that way.
* TODO: support multiple events.
* @event_info_available: indicates that a HW event info is now available.
*/
struct hw_err_info {
struct hl_info_hw_err_event event;
atomic_t event_detected;
bool event_info_available;
};
/**
* struct fw_err_info - FW error information.
* @event: holds information on the event.
* @event_detected: if set to 1, a FW event was discovered for the
* first time after the driver finished booting up.
* Currently we assume that only fatal events (that require hard-reset) are
* reported, so we don't care about the others that might follow it;
* once changed to 1, it will remain that way.
* TODO: support multiple events.
* @event_info_available: indicates that a FW event info is now available.
*/
struct fw_err_info {
struct hl_info_fw_err_event event;
atomic_t event_detected;
bool event_info_available;
};
/** /**
* struct hl_error_info - holds information collected during an error. * struct hl_error_info - holds information collected during an error.
* @cs_timeout: CS timeout error information. * @cs_timeout: CS timeout error information.
* @razwi_info: RAZWI information. * @razwi_info: RAZWI information.
* @undef_opcode: undefined opcode information. * @undef_opcode: undefined opcode information.
* @page_fault_info: page fault information. * @page_fault_info: page fault information.
* @hw_err: (fatal) hardware error information.
* @fw_err: firmware error information.
*/ */
struct hl_error_info { struct hl_error_info {
struct cs_timeout_info cs_timeout; struct cs_timeout_info cs_timeout;
struct razwi_info razwi_info; struct razwi_info razwi_info;
struct undefined_opcode_info undef_opcode; struct undefined_opcode_info undef_opcode;
struct page_fault_info page_fault_info; struct page_fault_info page_fault_info;
struct hw_err_info hw_err;
struct fw_err_info fw_err;
}; };
/** /**
@ -3090,6 +3142,7 @@ struct hl_reset_info {
* (required only for PCI address match mode) * (required only for PCI address match mode)
* @pcie_bar: array of available PCIe bars virtual addresses. * @pcie_bar: array of available PCIe bars virtual addresses.
* @rmmio: configuration area address on SRAM. * @rmmio: configuration area address on SRAM.
* @hclass: pointer to the habanalabs class.
* @cdev: related char device. * @cdev: related char device.
* @cdev_ctrl: char device for control operations only (INFO IOCTL) * @cdev_ctrl: char device for control operations only (INFO IOCTL)
* @dev: related kernel basic device structure. * @dev: related kernel basic device structure.
@ -3104,6 +3157,8 @@ struct hl_reset_info {
* @user_interrupt: array of hl_user_interrupt. upon the corresponding user * @user_interrupt: array of hl_user_interrupt. upon the corresponding user
* interrupt, driver will monitor the list of fences * interrupt, driver will monitor the list of fences
* registered to this interrupt. * registered to this interrupt.
* @tpc_interrupt: single TPC interrupt for all TPCs.
* @unexpected_error_interrupt: single interrupt for unexpected user error indication.
* @common_user_cq_interrupt: common user CQ interrupt for all user CQ interrupts. * @common_user_cq_interrupt: common user CQ interrupt for all user CQ interrupts.
* upon any user CQ interrupt, driver will monitor the * upon any user CQ interrupt, driver will monitor the
* list of fences registered to this common structure. * list of fences registered to this common structure.
@ -3199,6 +3254,7 @@ struct hl_reset_info {
* drams are binned-out * drams are binned-out
* @tpc_binning: contains mask of tpc engines that is received from the f/w which indicates which * @tpc_binning: contains mask of tpc engines that is received from the f/w which indicates which
* tpc engines are binned-out * tpc engines are binned-out
* @dmabuf_export_cnt: number of active dma-buf exports.
* @card_type: Various ASICs have several card types. This indicates the card * @card_type: Various ASICs have several card types. This indicates the card
* type of the current device. * type of the current device.
* @major: habanalabs kernel driver major. * @major: habanalabs kernel driver major.
@ -3253,6 +3309,8 @@ struct hl_reset_info {
* @supports_mmu_prefetch: true if prefetch is supported, otherwise false. * @supports_mmu_prefetch: true if prefetch is supported, otherwise false.
* @reset_upon_device_release: reset the device when the user closes the file descriptor of the * @reset_upon_device_release: reset the device when the user closes the file descriptor of the
* device. * device.
* @supports_ctx_switch: true if a ctx switch is required upon first submission.
* @support_preboot_binning: true if we support reading binning info from preboot.
* @nic_ports_mask: Controls which NIC ports are enabled. Used only for testing. * @nic_ports_mask: Controls which NIC ports are enabled. Used only for testing.
* @fw_components: Controls which f/w components to load to the device. There are multiple f/w * @fw_components: Controls which f/w components to load to the device. There are multiple f/w
* stages and sometimes we want to stop at a certain stage. Used only for testing. * stages and sometimes we want to stop at a certain stage. Used only for testing.
@ -3266,14 +3324,13 @@ struct hl_reset_info {
* Used only for testing. * Used only for testing.
* @heartbeat: Controls if we want to enable the heartbeat mechanism vs. the f/w, which verifies * @heartbeat: Controls if we want to enable the heartbeat mechanism vs. the f/w, which verifies
* that the f/w is always alive. Used only for testing. * that the f/w is always alive. Used only for testing.
* @supports_ctx_switch: true if a ctx switch is required upon first submission.
* @support_preboot_binning: true if we support read binning info from preboot.
*/ */
struct hl_device { struct hl_device {
struct pci_dev *pdev; struct pci_dev *pdev;
u64 pcie_bar_phys[HL_PCI_NUM_BARS]; u64 pcie_bar_phys[HL_PCI_NUM_BARS];
void __iomem *pcie_bar[HL_PCI_NUM_BARS]; void __iomem *pcie_bar[HL_PCI_NUM_BARS];
void __iomem *rmmio; void __iomem *rmmio;
struct class *hclass;
struct cdev cdev; struct cdev cdev;
struct cdev cdev_ctrl; struct cdev cdev_ctrl;
struct device *dev; struct device *dev;
@ -3286,6 +3343,8 @@ struct hl_device {
enum hl_asic_type asic_type; enum hl_asic_type asic_type;
struct hl_cq *completion_queue; struct hl_cq *completion_queue;
struct hl_user_interrupt *user_interrupt; struct hl_user_interrupt *user_interrupt;
struct hl_user_interrupt tpc_interrupt;
struct hl_user_interrupt unexpected_error_interrupt;
struct hl_user_interrupt common_user_cq_interrupt; struct hl_user_interrupt common_user_cq_interrupt;
struct hl_user_interrupt common_decoder_interrupt; struct hl_user_interrupt common_decoder_interrupt;
struct hl_cs **shadow_cs_queue; struct hl_cs **shadow_cs_queue;
@ -3369,7 +3428,7 @@ struct hl_device {
u64 fw_comms_poll_interval_usec; u64 fw_comms_poll_interval_usec;
u64 dram_binning; u64 dram_binning;
u64 tpc_binning; u64 tpc_binning;
atomic_t dmabuf_export_cnt;
enum cpucp_card_types card_type; enum cpucp_card_types card_type;
u32 major; u32 major;
u32 high_pll; u32 high_pll;
@ -3412,7 +3471,7 @@ struct hl_device {
u8 supports_ctx_switch; u8 supports_ctx_switch;
u8 support_preboot_binning; u8 support_preboot_binning;
/* Parameters for bring-up */ /* Parameters for bring-up to be upstreamed */
u64 nic_ports_mask; u64 nic_ports_mask;
u64 fw_components; u64 fw_components;
u8 mmu_enable; u8 mmu_enable;
@ -3450,6 +3509,20 @@ struct hl_cs_encaps_sig_handle {
u32 count; u32 count;
}; };
/**
* struct hl_info_fw_err_info - firmware error information structure
* @err_type: The type of error detected (or reported).
* @event_mask: Pointer to the event mask to be modified with the detected error flag
* (can be NULL)
* @event_id: The id of the event that reported the error
* (applicable when err_type is HL_INFO_FW_REPORTED_ERR).
*/
struct hl_info_fw_err_info {
enum hl_info_fw_err_type err_type;
u64 *event_mask;
u16 event_id;
};
/* /*
* IOCTLs * IOCTLs
*/ */
@ -3474,6 +3547,10 @@ struct hl_ioctl_desc {
hl_ioctl_t *func; hl_ioctl_t *func;
}; };
static inline bool hl_is_fw_ver_below_1_9(struct hl_device *hdev)
{
return (hdev->fw_major_version < 42);
}
/* /*
* Kernel module functions that can be accessed by entire module * Kernel module functions that can be accessed by entire module
@ -3537,14 +3614,12 @@ static inline bool hl_mem_area_crosses_range(u64 address, u32 size,
} }
uint64_t hl_set_dram_bar_default(struct hl_device *hdev, u64 addr); uint64_t hl_set_dram_bar_default(struct hl_device *hdev, u64 addr);
void *hl_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle);
void hl_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, void *vaddr);
void *hl_asic_dma_alloc_coherent_caller(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, void *hl_asic_dma_alloc_coherent_caller(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle,
gfp_t flag, const char *caller); gfp_t flag, const char *caller);
void hl_asic_dma_free_coherent_caller(struct hl_device *hdev, size_t size, void *cpu_addr, void hl_asic_dma_free_coherent_caller(struct hl_device *hdev, size_t size, void *cpu_addr,
dma_addr_t dma_handle, const char *caller); dma_addr_t dma_handle, const char *caller);
void *hl_cpu_accessible_dma_pool_alloc_caller(struct hl_device *hdev, size_t size,
dma_addr_t *dma_handle, const char *caller);
void hl_cpu_accessible_dma_pool_free_caller(struct hl_device *hdev, size_t size, void *vaddr,
const char *caller);
void *hl_asic_dma_pool_zalloc_caller(struct hl_device *hdev, size_t size, gfp_t mem_flags, void *hl_asic_dma_pool_zalloc_caller(struct hl_device *hdev, size_t size, gfp_t mem_flags,
dma_addr_t *dma_handle, const char *caller); dma_addr_t *dma_handle, const char *caller);
void hl_asic_dma_pool_free_caller(struct hl_device *hdev, void *vaddr, dma_addr_t dma_addr, void hl_asic_dma_pool_free_caller(struct hl_device *hdev, void *vaddr, dma_addr_t dma_addr,
@ -3591,7 +3666,7 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg);
irqreturn_t hl_irq_handler_eq(int irq, void *arg); irqreturn_t hl_irq_handler_eq(int irq, void *arg);
irqreturn_t hl_irq_handler_dec_abnrm(int irq, void *arg); irqreturn_t hl_irq_handler_dec_abnrm(int irq, void *arg);
irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg); irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg);
irqreturn_t hl_irq_handler_default(int irq, void *arg); irqreturn_t hl_irq_user_interrupt_thread_handler(int irq, void *arg);
u32 hl_cq_inc_ptr(u32 ptr); u32 hl_cq_inc_ptr(u32 ptr);
int hl_asid_init(struct hl_device *hdev); int hl_asid_init(struct hl_device *hdev);
@ -3612,7 +3687,7 @@ int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr,
void hl_ctx_mgr_init(struct hl_ctx_mgr *mgr); void hl_ctx_mgr_init(struct hl_ctx_mgr *mgr);
void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr); void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr);
int hl_device_init(struct hl_device *hdev, struct class *hclass); int hl_device_init(struct hl_device *hdev);
void hl_device_fini(struct hl_device *hdev); void hl_device_fini(struct hl_device *hdev);
int hl_device_suspend(struct hl_device *hdev); int hl_device_suspend(struct hl_device *hdev);
int hl_device_resume(struct hl_device *hdev); int hl_device_resume(struct hl_device *hdev);
@ -3662,6 +3737,7 @@ bool cs_needs_timeout(struct hl_cs *cs);
bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs); bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs);
struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq); struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq);
void hl_multi_cs_completion_init(struct hl_device *hdev); void hl_multi_cs_completion_init(struct hl_device *hdev);
u32 hl_get_active_cs_num(struct hl_device *hdev);
void goya_set_asic_funcs(struct hl_device *hdev); void goya_set_asic_funcs(struct hl_device *hdev);
void gaudi_set_asic_funcs(struct hl_device *hdev); void gaudi_set_asic_funcs(struct hl_device *hdev);
@ -3861,6 +3937,7 @@ const char *hl_sync_engine_to_string(enum hl_sync_engine_type engine_type);
void hl_mem_mgr_init(struct device *dev, struct hl_mem_mgr *mmg); void hl_mem_mgr_init(struct device *dev, struct hl_mem_mgr *mmg);
void hl_mem_mgr_fini(struct hl_mem_mgr *mmg); void hl_mem_mgr_fini(struct hl_mem_mgr *mmg);
void hl_mem_mgr_idr_destroy(struct hl_mem_mgr *mmg);
int hl_mem_mgr_mmap(struct hl_mem_mgr *mmg, struct vm_area_struct *vma, int hl_mem_mgr_mmap(struct hl_mem_mgr *mmg, struct vm_area_struct *vma,
void *args); void *args);
struct hl_mmap_mem_buf *hl_mmap_mem_buf_get(struct hl_mem_mgr *mmg, struct hl_mmap_mem_buf *hl_mmap_mem_buf_get(struct hl_mem_mgr *mmg,
@ -3879,6 +3956,8 @@ void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_o
void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu); void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu);
void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu, void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu,
u64 *event_mask); u64 *event_mask);
void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_mask);
void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info);
#ifdef CONFIG_DEBUG_FS #ifdef CONFIG_DEBUG_FS


@ -12,7 +12,6 @@
#include "../include/hw_ip/pci/pci_general.h" #include "../include/hw_ip/pci/pci_general.h"
#include <linux/pci.h> #include <linux/pci.h>
#include <linux/aer.h>
#include <linux/module.h> #include <linux/module.h>
#define CREATE_TRACE_POINTS #define CREATE_TRACE_POINTS
@ -221,12 +220,9 @@ int hl_device_open(struct inode *inode, struct file *filp)
hl_debugfs_add_file(hpriv); hl_debugfs_add_file(hpriv);
memset(&hdev->captured_err_info, 0, sizeof(hdev->captured_err_info));
atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1); atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1);
atomic_set(&hdev->captured_err_info.razwi_info.razwi_detected, 0);
atomic_set(&hdev->captured_err_info.page_fault_info.page_fault_detected, 0);
hdev->captured_err_info.undef_opcode.write_enable = true; hdev->captured_err_info.undef_opcode.write_enable = true;
hdev->captured_err_info.razwi_info.razwi_info_available = false;
hdev->captured_err_info.page_fault_info.page_fault_info_available = false;
hdev->open_counter++; hdev->open_counter++;
hdev->last_successful_open_jif = jiffies; hdev->last_successful_open_jif = jiffies;
@ -237,6 +233,7 @@ int hl_device_open(struct inode *inode, struct file *filp)
out_err: out_err:
mutex_unlock(&hdev->fpriv_list_lock); mutex_unlock(&hdev->fpriv_list_lock);
hl_mem_mgr_fini(&hpriv->mem_mgr); hl_mem_mgr_fini(&hpriv->mem_mgr);
hl_mem_mgr_idr_destroy(&hpriv->mem_mgr);
hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr); hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
filp->private_data = NULL; filp->private_data = NULL;
mutex_destroy(&hpriv->ctx_lock); mutex_destroy(&hpriv->ctx_lock);
@ -324,6 +321,7 @@ static void copy_kernel_module_params_to_device(struct hl_device *hdev)
hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type); hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);
hdev->major = hl_major; hdev->major = hl_major;
hdev->hclass = hl_class;
hdev->memory_scrub = memory_scrub; hdev->memory_scrub = memory_scrub;
hdev->reset_on_lockup = reset_on_lockup; hdev->reset_on_lockup = reset_on_lockup;
hdev->boot_error_status_mask = boot_error_status_mask; hdev->boot_error_status_mask = boot_error_status_mask;
@ -550,9 +548,7 @@ static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
pci_set_drvdata(pdev, hdev); pci_set_drvdata(pdev, hdev);
pci_enable_pcie_error_reporting(pdev); rc = hl_device_init(hdev);
rc = hl_device_init(hdev, hl_class);
if (rc) { if (rc) {
dev_err(&pdev->dev, "Fatal error during habanalabs device init\n"); dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
rc = -ENODEV; rc = -ENODEV;
@ -562,7 +558,6 @@ static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
return 0; return 0;
disable_device: disable_device:
pci_disable_pcie_error_reporting(pdev);
pci_set_drvdata(pdev, NULL); pci_set_drvdata(pdev, NULL);
destroy_hdev(hdev); destroy_hdev(hdev);
@ -585,7 +580,6 @@ static void hl_pci_remove(struct pci_dev *pdev)
return; return;
hl_device_fini(hdev); hl_device_fini(hdev);
pci_disable_pcie_error_reporting(pdev);
pci_set_drvdata(pdev, NULL); pci_set_drvdata(pdev, NULL);
destroy_hdev(hdev); destroy_hdev(hdev);
} }


@ -102,11 +102,15 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
hw_ip.mme_master_slave_mode = prop->mme_master_slave_mode; hw_ip.mme_master_slave_mode = prop->mme_master_slave_mode;
hw_ip.first_available_interrupt_id = prop->first_available_user_interrupt; hw_ip.first_available_interrupt_id = prop->first_available_user_interrupt;
hw_ip.number_of_user_interrupts = prop->user_interrupt_count; hw_ip.number_of_user_interrupts = prop->user_interrupt_count;
hw_ip.tpc_interrupt_id = prop->tpc_interrupt_id;
hw_ip.edma_enabled_mask = prop->edma_enabled_mask; hw_ip.edma_enabled_mask = prop->edma_enabled_mask;
hw_ip.server_type = prop->server_type; hw_ip.server_type = prop->server_type;
hw_ip.security_enabled = prop->fw_security_enabled; hw_ip.security_enabled = prop->fw_security_enabled;
hw_ip.revision_id = hdev->pdev->revision; hw_ip.revision_id = hdev->pdev->revision;
hw_ip.rotator_enabled_mask = prop->rotator_enabled_mask;
hw_ip.engine_core_interrupt_reg_addr = prop->engine_core_interrupt_reg_addr;
hw_ip.reserved_dram_size = dram_kmd_size;
return copy_to_user(out, &hw_ip, return copy_to_user(out, &hw_ip,
min((size_t) size, sizeof(hw_ip))) ? -EFAULT : 0; min((size_t) size, sizeof(hw_ip))) ? -EFAULT : 0;
@ -830,6 +834,50 @@ static int user_mappings_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
return copy_to_user(out, pgf_info->user_mappings, actual_size) ? -EFAULT : 0; return copy_to_user(out, pgf_info->user_mappings, actual_size) ? -EFAULT : 0;
} }
static int hw_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
void __user *user_buf = (void __user *) (uintptr_t) args->return_pointer;
struct hl_device *hdev = hpriv->hdev;
u32 user_buf_size = args->return_size;
struct hw_err_info *info;
int rc;
if ((!user_buf_size) || (!user_buf))
return -EINVAL;
if (user_buf_size < sizeof(struct hl_info_hw_err_event))
return -ENOMEM;
info = &hdev->captured_err_info.hw_err;
if (!info->event_info_available)
return -ENOENT;
rc = copy_to_user(user_buf, &info->event, sizeof(struct hl_info_hw_err_event));
return rc ? -EFAULT : 0;
}
static int fw_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
void __user *user_buf = (void __user *) (uintptr_t) args->return_pointer;
struct hl_device *hdev = hpriv->hdev;
u32 user_buf_size = args->return_size;
struct fw_err_info *info;
int rc;
if ((!user_buf_size) || (!user_buf))
return -EINVAL;
if (user_buf_size < sizeof(struct hl_info_fw_err_event))
return -ENOMEM;
info = &hdev->captured_err_info.fw_err;
if (!info->event_info_available)
return -ENOENT;
rc = copy_to_user(user_buf, &info->event, sizeof(struct hl_info_fw_err_event));
return rc ? -EFAULT : 0;
}
static int send_fw_generic_request(struct hl_device *hdev, struct hl_info_args *info_args) static int send_fw_generic_request(struct hl_device *hdev, struct hl_info_args *info_args)
{ {
void __user *buff = (void __user *) (uintptr_t) info_args->return_pointer; void __user *buff = (void __user *) (uintptr_t) info_args->return_pointer;
@ -950,6 +998,14 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_UNREGISTER_EVENTFD: case HL_INFO_UNREGISTER_EVENTFD:
return eventfd_unregister(hpriv, args); return eventfd_unregister(hpriv, args);
case HL_INFO_HW_ERR_EVENT:
return hw_err_info(hpriv, args);
case HL_INFO_FW_ERR_EVENT:
return fw_err_info(hpriv, args);
case HL_INFO_DRAM_USAGE:
return dram_usage_info(hpriv, args);
default: default:
break; break;
} }
@ -962,10 +1018,6 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
} }
switch (args->op) { switch (args->op) {
case HL_INFO_DRAM_USAGE:
rc = dram_usage_info(hpriv, args);
break;
case HL_INFO_HW_IDLE: case HL_INFO_HW_IDLE:
rc = hw_idle(hdev, args); rc = hw_idle(hdev, args);
break; break;
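The two new INFO opcodes added above (HL_INFO_HW_ERR_EVENT and HL_INFO_FW_ERR_EVENT) simply copy the captured event structure to the user buffer and return -ENOENT when nothing has been captured yet. A rough userspace sketch of fetching the HW error event, assuming the uAPI definitions (HL_IOCTL_INFO, struct hl_info_args, struct hl_info_hw_err_event) from the habanalabs uAPI header and an illustrative device path, might look like this:

/* Rough userspace sketch (not from the kernel tree). */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <drm/habanalabs_accel.h>	/* assumed location of the uAPI header */

static int read_hw_err_event(const char *dev_path)
{
	struct hl_info_hw_err_event event;
	struct hl_info_args args;
	int fd, rc;

	fd = open(dev_path, O_RDWR);
	if (fd < 0)
		return -1;

	memset(&args, 0, sizeof(args));
	memset(&event, 0, sizeof(event));
	args.op = HL_INFO_HW_ERR_EVENT;
	args.return_pointer = (uintptr_t)&event;
	args.return_size = sizeof(event);

	/* -ENOENT is expected when no HW error has been captured yet */
	rc = ioctl(fd, HL_IOCTL_INFO, &args);
	if (!rc)
		printf("HW error event %u at %lld ns\n",
		       event.event_id, (long long)event.timestamp);

	close(fd);
	return rc;
}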


@ -280,7 +280,6 @@ static void handle_user_interrupt(struct hl_device *hdev, struct hl_user_interru
struct list_head *ts_reg_free_list_head = NULL; struct list_head *ts_reg_free_list_head = NULL;
struct timestamp_reg_work_obj *job; struct timestamp_reg_work_obj *job;
bool reg_node_handle_fail = false; bool reg_node_handle_fail = false;
ktime_t now = ktime_get();
int rc; int rc;
/* For registration nodes: /* For registration nodes:
@ -303,13 +302,13 @@ static void handle_user_interrupt(struct hl_device *hdev, struct hl_user_interru
if (pend->ts_reg_info.buf) { if (pend->ts_reg_info.buf) {
if (!reg_node_handle_fail) { if (!reg_node_handle_fail) {
rc = handle_registration_node(hdev, pend, rc = handle_registration_node(hdev, pend,
&ts_reg_free_list_head, now); &ts_reg_free_list_head, intr->timestamp);
if (rc) if (rc)
reg_node_handle_fail = true; reg_node_handle_fail = true;
} }
} else { } else {
/* Handle wait target value node */ /* Handle wait target value node */
pend->fence.timestamp = now; pend->fence.timestamp = intr->timestamp;
complete_all(&pend->fence.completion); complete_all(&pend->fence.completion);
} }
} }
@ -326,6 +325,26 @@ static void handle_user_interrupt(struct hl_device *hdev, struct hl_user_interru
} }
} }
static void handle_tpc_interrupt(struct hl_device *hdev)
{
u64 event_mask;
u32 flags;
event_mask = HL_NOTIFIER_EVENT_TPC_ASSERT |
HL_NOTIFIER_EVENT_USER_ENGINE_ERR |
HL_NOTIFIER_EVENT_DEVICE_RESET;
flags = HL_DRV_RESET_DELAY;
dev_err_ratelimited(hdev->dev, "Received TPC assert\n");
hl_device_cond_reset(hdev, flags, event_mask);
}
static void handle_unexpected_user_interrupt(struct hl_device *hdev)
{
dev_err_ratelimited(hdev->dev, "Received unexpected user error interrupt\n");
}
/** /**
* hl_irq_handler_user_interrupt - irq handler for user interrupts * hl_irq_handler_user_interrupt - irq handler for user interrupts
* *
@ -334,6 +353,23 @@ static void handle_user_interrupt(struct hl_device *hdev, struct hl_user_interru
* *
*/ */
irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg) irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg)
{
struct hl_user_interrupt *user_int = arg;
user_int->timestamp = ktime_get();
return IRQ_WAKE_THREAD;
}
/**
* hl_irq_user_interrupt_thread_handler - irq thread handler for user interrupts.
* This function is invoked by threaded irq mechanism
*
* @irq: irq number
* @arg: pointer to user interrupt structure
*
*/
irqreturn_t hl_irq_user_interrupt_thread_handler(int irq, void *arg)
{ {
struct hl_user_interrupt *user_int = arg; struct hl_user_interrupt *user_int = arg;
struct hl_device *hdev = user_int->hdev; struct hl_device *hdev = user_int->hdev;
@ -351,6 +387,12 @@ irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg)
/* Handle decoder interrupt registered on this specific irq */ /* Handle decoder interrupt registered on this specific irq */
handle_user_interrupt(hdev, user_int); handle_user_interrupt(hdev, user_int);
break; break;
case HL_USR_INTERRUPT_TPC:
handle_tpc_interrupt(hdev);
break;
case HL_USR_INTERRUPT_UNEXPECTED:
handle_unexpected_user_interrupt(hdev);
break;
default: default:
break; break;
} }
@ -358,24 +400,6 @@ irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg)
return IRQ_HANDLED; return IRQ_HANDLED;
} }
/**
* hl_irq_handler_default - default irq handler
*
* @irq: irq number
* @arg: pointer to user interrupt structure
*
*/
irqreturn_t hl_irq_handler_default(int irq, void *arg)
{
struct hl_user_interrupt *user_interrupt = arg;
struct hl_device *hdev = user_interrupt->hdev;
u32 interrupt_id = user_interrupt->interrupt_id;
dev_err(hdev->dev, "got invalid user interrupt %u", interrupt_id);
return IRQ_HANDLED;
}
/** /**
* hl_irq_handler_eq - irq handler for event queue * hl_irq_handler_eq - irq handler for event queue
* *
@ -405,11 +429,10 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg)
cur_eqe_index = FIELD_GET(EQ_CTL_INDEX_MASK, cur_eqe); cur_eqe_index = FIELD_GET(EQ_CTL_INDEX_MASK, cur_eqe);
if ((hdev->event_queue.check_eqe_index) && if ((hdev->event_queue.check_eqe_index) &&
(((eq->prev_eqe_index + 1) & EQ_CTL_INDEX_MASK) (((eq->prev_eqe_index + 1) & EQ_CTL_INDEX_MASK) != cur_eqe_index)) {
!= cur_eqe_index)) {
dev_dbg(hdev->dev, dev_dbg(hdev->dev,
"EQE 0x%x in queue is ready but index does not match %d!=%d", "EQE %#x in queue is ready but index does not match %d!=%d",
eq_base[eq->ci].hdr.ctl, cur_eqe,
((eq->prev_eqe_index + 1) & EQ_CTL_INDEX_MASK), ((eq->prev_eqe_index + 1) & EQ_CTL_INDEX_MASK),
cur_eqe_index); cur_eqe_index);
break; break;
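With the change above, the hard handler only records the interrupt timestamp and returns IRQ_WAKE_THREAD, deferring the fence handling to hl_irq_user_interrupt_thread_handler. A minimal sketch of how such a hard/threaded handler pair is typically wired up with request_threaded_irq() (illustrative structure, not the driver's actual registration path) follows:

#include <linux/interrupt.h>
#include <linux/ktime.h>

/* Hedged sketch of the hard/threaded handler split shown above. */
struct example_user_interrupt {
	ktime_t timestamp;
	/* fences / wait lists handled by the IRQ thread */
};

static irqreturn_t example_hardirq(int irq, void *arg)
{
	struct example_user_interrupt *intr = arg;

	/* Do the minimum in hard-IRQ context: stamp the arrival time and
	 * defer the heavy lifting to the IRQ thread.
	 */
	intr->timestamp = ktime_get();

	return IRQ_WAKE_THREAD;
}

static irqreturn_t example_thread_fn(int irq, void *arg)
{
	struct example_user_interrupt *intr = arg;

	/* Walk wait lists and complete fences using intr->timestamp. */

	return IRQ_HANDLED;
}

static int example_register(unsigned int irq, struct example_user_interrupt *intr)
{
	return request_threaded_irq(irq, example_hardirq, example_thread_fn,
				    0, "example-user-interrupt", intr);
}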


@ -235,10 +235,8 @@ static int dma_map_host_va(struct hl_device *hdev, u64 addr, u64 size,
} }
rc = hl_pin_host_memory(hdev, addr, size, userptr); rc = hl_pin_host_memory(hdev, addr, size, userptr);
if (rc) { if (rc)
dev_err(hdev->dev, "Failed to pin host memory\n");
goto pin_err; goto pin_err;
}
userptr->dma_mapped = true; userptr->dma_mapped = true;
userptr->dir = DMA_BIDIRECTIONAL; userptr->dir = DMA_BIDIRECTIONAL;
@ -1097,10 +1095,8 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, u64 *device
huge_page_size = hdev->asic_prop.pmmu_huge.page_size; huge_page_size = hdev->asic_prop.pmmu_huge.page_size;
rc = dma_map_host_va(hdev, addr, size, &userptr); rc = dma_map_host_va(hdev, addr, size, &userptr);
if (rc) { if (rc)
dev_err(hdev->dev, "failed to get userptr from va\n");
return rc; return rc;
}
rc = init_phys_pg_pack_from_userptr(ctx, userptr, rc = init_phys_pg_pack_from_userptr(ctx, userptr,
&phys_pg_pack, false); &phys_pg_pack, false);
@ -1270,6 +1266,18 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, u64 *device
return rc; return rc;
} }
/* Should be called while the context's mem_hash_lock is taken */
static struct hl_vm_hash_node *get_vm_hash_node_locked(struct hl_ctx *ctx, u64 vaddr)
{
struct hl_vm_hash_node *hnode;
hash_for_each_possible(ctx->mem_hash, hnode, node, vaddr)
if (vaddr == hnode->vaddr)
return hnode;
return NULL;
}
/** /**
* unmap_device_va() - unmap the given device virtual address. * unmap_device_va() - unmap the given device virtual address.
* @ctx: pointer to the context structure. * @ctx: pointer to the context structure.
@ -1285,10 +1293,10 @@ static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
{ {
struct hl_vm_phys_pg_pack *phys_pg_pack = NULL; struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
u64 vaddr = args->unmap.device_virt_addr; u64 vaddr = args->unmap.device_virt_addr;
struct hl_vm_hash_node *hnode = NULL;
struct asic_fixed_properties *prop; struct asic_fixed_properties *prop;
struct hl_device *hdev = ctx->hdev; struct hl_device *hdev = ctx->hdev;
struct hl_userptr *userptr = NULL; struct hl_userptr *userptr = NULL;
struct hl_vm_hash_node *hnode;
struct hl_va_range *va_range; struct hl_va_range *va_range;
enum vm_type *vm_type; enum vm_type *vm_type;
bool is_userptr; bool is_userptr;
@ -1298,15 +1306,10 @@ static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
/* protect from double entrance */ /* protect from double entrance */
mutex_lock(&ctx->mem_hash_lock); mutex_lock(&ctx->mem_hash_lock);
hash_for_each_possible(ctx->mem_hash, hnode, node, (unsigned long)vaddr) hnode = get_vm_hash_node_locked(ctx, vaddr);
if (vaddr == hnode->vaddr)
break;
if (!hnode) { if (!hnode) {
mutex_unlock(&ctx->mem_hash_lock); mutex_unlock(&ctx->mem_hash_lock);
dev_err(hdev->dev, dev_err(hdev->dev, "unmap failed, no mem hnode for vaddr 0x%llx\n", vaddr);
"unmap failed, no mem hnode for vaddr 0x%llx\n",
vaddr);
return -EINVAL; return -EINVAL;
} }
@ -1779,6 +1782,44 @@ static void hl_unmap_dmabuf(struct dma_buf_attachment *attachment,
kfree(sgt); kfree(sgt);
} }
static struct hl_vm_hash_node *memhash_node_export_get(struct hl_ctx *ctx, u64 addr)
{
struct hl_device *hdev = ctx->hdev;
struct hl_vm_hash_node *hnode;
/* get the memory handle */
mutex_lock(&ctx->mem_hash_lock);
hnode = get_vm_hash_node_locked(ctx, addr);
if (!hnode) {
mutex_unlock(&ctx->mem_hash_lock);
dev_dbg(hdev->dev, "map address %#llx not found\n", addr);
return ERR_PTR(-EINVAL);
}
if (upper_32_bits(hnode->handle)) {
mutex_unlock(&ctx->mem_hash_lock);
dev_dbg(hdev->dev, "invalid handle %#llx for map address %#llx\n",
hnode->handle, addr);
return ERR_PTR(-EINVAL);
}
/*
* node found, increase export count so this memory cannot be unmapped
* and the hash node cannot be deleted.
*/
hnode->export_cnt++;
mutex_unlock(&ctx->mem_hash_lock);
return hnode;
}
static void memhash_node_export_put(struct hl_ctx *ctx, struct hl_vm_hash_node *hnode)
{
mutex_lock(&ctx->mem_hash_lock);
hnode->export_cnt--;
mutex_unlock(&ctx->mem_hash_lock);
}
static void hl_release_dmabuf(struct dma_buf *dmabuf) static void hl_release_dmabuf(struct dma_buf *dmabuf)
{ {
struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv; struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv;
@ -1789,13 +1830,15 @@ static void hl_release_dmabuf(struct dma_buf *dmabuf)
ctx = hl_dmabuf->ctx; ctx = hl_dmabuf->ctx;
if (hl_dmabuf->memhash_hnode) { if (hl_dmabuf->memhash_hnode)
mutex_lock(&ctx->mem_hash_lock); memhash_node_export_put(ctx, hl_dmabuf->memhash_hnode);
hl_dmabuf->memhash_hnode->export_cnt--;
mutex_unlock(&ctx->mem_hash_lock);
}
atomic_dec(&ctx->hdev->dmabuf_export_cnt);
hl_ctx_put(ctx); hl_ctx_put(ctx);
/* Paired with get_file() in export_dmabuf() */
fput(ctx->hpriv->filp);
kfree(hl_dmabuf); kfree(hl_dmabuf);
} }
@ -1834,6 +1877,13 @@ static int export_dmabuf(struct hl_ctx *ctx,
hl_dmabuf->ctx = ctx; hl_dmabuf->ctx = ctx;
hl_ctx_get(hl_dmabuf->ctx); hl_ctx_get(hl_dmabuf->ctx);
atomic_inc(&ctx->hdev->dmabuf_export_cnt);
/* Get the compute device file to enforce release order, such that all exported dma-bufs will be
* released first and only then the compute device.
* Paired with fput() in hl_release_dmabuf().
*/
get_file(ctx->hpriv->filp);
*dmabuf_fd = fd; *dmabuf_fd = fd;
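The export path above takes a reference on the compute device file so its release handler cannot run while any exported dma-buf is still alive; hl_release_dmabuf() drops that reference. A minimal sketch of this get_file()/fput() pairing (illustrative names, not the driver's exact structures) follows:

#include <linux/file.h>
#include <linux/fs.h>

/* Hedged sketch of the release-ordering pattern: the exporter takes a
 * reference on the owning device file and the dma-buf release callback
 * drops it, so the file's release handler cannot run before the last
 * exported buffer is closed. Names are illustrative.
 */
struct example_export {
	struct file *owner_filp;	/* device file of the exporting process */
};

static void example_export_pin_owner(struct example_export *exp, struct file *owner_filp)
{
	exp->owner_filp = owner_filp;
	get_file(owner_filp);		/* paired with fput() on buffer release */
}

static void example_export_release(struct example_export *exp)
{
	fput(exp->owner_filp);		/* now the device file can be released */
}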
@ -1933,47 +1983,6 @@ static int validate_export_params(struct hl_device *hdev, u64 device_addr, u64 s
return 0; return 0;
} }
static struct hl_vm_hash_node *memhash_node_export_get(struct hl_ctx *ctx, u64 addr)
{
struct hl_device *hdev = ctx->hdev;
struct hl_vm_hash_node *hnode;
/* get the memory handle */
mutex_lock(&ctx->mem_hash_lock);
hash_for_each_possible(ctx->mem_hash, hnode, node, (unsigned long)addr)
if (addr == hnode->vaddr)
break;
if (!hnode) {
mutex_unlock(&ctx->mem_hash_lock);
dev_dbg(hdev->dev, "map address %#llx not found\n", addr);
return ERR_PTR(-EINVAL);
}
if (upper_32_bits(hnode->handle)) {
mutex_unlock(&ctx->mem_hash_lock);
dev_dbg(hdev->dev, "invalid handle %#llx for map address %#llx\n",
hnode->handle, addr);
return ERR_PTR(-EINVAL);
}
/*
* node found, increase export count so this memory cannot be unmapped
* and the hash node cannot be deleted.
*/
hnode->export_cnt++;
mutex_unlock(&ctx->mem_hash_lock);
return hnode;
}
static void memhash_node_export_put(struct hl_ctx *ctx, struct hl_vm_hash_node *hnode)
{
mutex_lock(&ctx->mem_hash_lock);
hnode->export_cnt--;
mutex_unlock(&ctx->mem_hash_lock);
}
static struct hl_vm_phys_pg_pack *get_phys_pg_pack_from_hash_node(struct hl_device *hdev, static struct hl_vm_phys_pg_pack *get_phys_pg_pack_from_hash_node(struct hl_device *hdev,
struct hl_vm_hash_node *hnode) struct hl_vm_hash_node *hnode)
{ {
@@ -2221,11 +2230,11 @@ static struct hl_mmap_mem_buf_behavior hl_ts_behavior = {
  * allocate_timestamps_buffers() - allocate timestamps buffers
  * This function will allocate ts buffer that will later on be mapped to the user
  * in order to be able to read the timestamp.
- * in additon it'll allocate an extra buffer for registration management.
+ * in addition it'll allocate an extra buffer for registration management.
  * since we cannot fail during registration for out-of-memory situation, so
  * we'll prepare a pool which will be used as user interrupt nodes and instead
  * of dynamically allocating nodes while registration we'll pick the node from
- * this pool. in addtion it'll add node to the mapping hash which will be used
+ * this pool. in addition it'll add node to the mapping hash which will be used
 * to map user ts buffer to the internal kernel ts buffer.
 * @hpriv: pointer to the private data of the fd
 * @args: ioctl input


@@ -275,7 +275,7 @@ int hl_mem_mgr_mmap(struct hl_mem_mgr *mmg, struct vm_area_struct *vma,
 	if (atomic_cmpxchg(&buf->mmap, 0, 1)) {
 		dev_err(mmg->dev,
-			"%s, Memory mmap failed, already mmaped to user\n",
+			"%s, Memory mmap failed, already mapped to user\n",
 			buf->behavior->topic);
 		rc = -EINVAL;
 		goto put_mem;
@@ -341,8 +341,19 @@ void hl_mem_mgr_fini(struct hl_mem_mgr *mmg)
 				"%s: Buff handle %u for CTX is still alive\n",
 				topic, id);
 	}
+}
 
-	/* TODO: can it happen that some buffer is still in use at this point? */
+/**
+ * hl_mem_mgr_idr_destroy() - destroy memory manager IDR.
+ * @mmg: parent unified memory manager
+ *
+ * Destroy the memory manager IDR.
+ * Shall be called when IDR is empty and no memory buffers are in use.
+ */
+void hl_mem_mgr_idr_destroy(struct hl_mem_mgr *mmg)
+{
+	if (!idr_is_empty(&mmg->handles))
+		dev_crit(mmg->dev, "memory manager IDR is destroyed while it is not empty!\n");
 
 	idr_destroy(&mmg->handles);
 }


@@ -540,8 +540,8 @@ static void hl_mmu_pa_page_with_offset(struct hl_ctx *ctx, u64 virt_addr,
 	u32 page_off;
 
 	/*
-	 * Bit arithmetics cannot be used for non power of two page
-	 * sizes. In addition, since bit arithmetics is not used,
+	 * Bit arithmetic cannot be used for non power of two page
+	 * sizes. In addition, since bit arithmetic is not used,
 	 * we cannot ignore dram base. All that shall be considered.
 	 */
@@ -757,7 +757,7 @@ u64 hl_mmu_get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte)
 * @mmu_prop: MMU properties.
 * @hop_idx: HOP index.
 * @hop_addr: HOP address.
- * @virt_addr: virtual address fro the translation.
+ * @virt_addr: virtual address for the translation.
 *
 * @return the matching PTE value on success, otherwise U64_MAX.
 */


@@ -502,7 +502,7 @@ int hl_init_pb_single_dcore(struct hl_device *hdev, u32 dcore_offset,
 int hl_init_pb_ranges_single_dcore(struct hl_device *hdev, u32 dcore_offset,
 		u32 num_instances, u32 instance_offset,
 		const u32 pb_blocks[], u32 blocks_array_size,
-		const struct range *regs_range_array, u32 regs_range_array_size)
+		const struct range *user_regs_range_array, u32 user_regs_range_array_size)
 {
 	int i;
 	struct hl_block_glbl_sec *glbl_sec;
@@ -514,8 +514,8 @@ int hl_init_pb_ranges_single_dcore(struct hl_device *hdev, u32 dcore_offset,
 		return -ENOMEM;
 
 	hl_secure_block(hdev, glbl_sec, blocks_array_size);
-	hl_unsecure_registers_range(hdev, regs_range_array,
-			regs_range_array_size, 0, pb_blocks, glbl_sec,
+	hl_unsecure_registers_range(hdev, user_regs_range_array,
+			user_regs_range_array_size, 0, pb_blocks, glbl_sec,
 			blocks_array_size);
 
 	/* Fill all blocks with the same configuration */


@@ -10,7 +10,7 @@
 #include <linux/io-64-nonatomic-lo-hi.h>
 
-extern struct hl_device *hdev;
+struct hl_device;
 
 /* special blocks */
 #define HL_MAX_NUM_OF_GLBL_ERR_CAUSE 10


@@ -656,6 +656,7 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
 	prop->cfg_size = CFG_SIZE;
 	prop->max_asid = MAX_ASID;
 	prop->num_of_events = GAUDI_EVENT_SIZE;
+	prop->max_num_of_engines = GAUDI_ENGINE_ID_SIZE;
 	prop->tpc_enabled_mask = TPC_ENABLED_MASK;
 
 	set_default_power_values(hdev);
@@ -679,6 +680,7 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
 			(num_sync_stream_queues * HL_RSVD_MONS);
 
 	prop->first_available_user_interrupt = USHRT_MAX;
+	prop->tpc_interrupt_id = USHRT_MAX;
 
 	for (i = 0 ; i < HL_MAX_DCORES ; i++)
 		prop->first_available_cq[i] = USHRT_MAX;
@@ -867,13 +869,18 @@ static int gaudi_early_init(struct hl_device *hdev)
 	rc = hl_fw_read_preboot_status(hdev);
 	if (rc) {
 		if (hdev->reset_on_preboot_fail)
+			/* we are already on failure flow, so don't check if hw_fini fails. */
 			hdev->asic_funcs->hw_fini(hdev, true, false);
 		goto pci_fini;
 	}
 
 	if (gaudi_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
 		dev_dbg(hdev->dev, "H/W state is dirty, must reset before initializing\n");
-		hdev->asic_funcs->hw_fini(hdev, true, false);
+		rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+		if (rc) {
+			dev_err(hdev->dev, "failed to reset HW in dirty state (%d)\n", rc);
+			goto pci_fini;
+		}
 	}
 
 	return 0;
@@ -3718,7 +3725,7 @@ static int gaudi_mmu_init(struct hl_device *hdev)
 		if (rc) {
 			dev_err(hdev->dev,
 				"failed to set hop0 addr for asid %d\n", i);
-			goto err;
+			return rc;
 		}
 	}
@@ -3729,7 +3736,9 @@ static int gaudi_mmu_init(struct hl_device *hdev)
 	/* mem cache invalidation */
 	WREG32(mmSTLB_MEM_CACHE_INVALIDATION, 1);
 
-	hl_mmu_invalidate_cache(hdev, true, 0);
+	rc = hl_mmu_invalidate_cache(hdev, true, 0);
+	if (rc)
+		return rc;
 
 	WREG32(mmMMU_UP_MMU_ENABLE, 1);
 	WREG32(mmMMU_UP_SPI_MASK, 0xF);
@@ -3745,9 +3754,6 @@ static int gaudi_mmu_init(struct hl_device *hdev)
 	gaudi->hw_cap_initialized |= HW_CAP_MMU;
 
 	return 0;
-
-err:
-	return rc;
 }
 
 static int gaudi_load_firmware_to_device(struct hl_device *hdev)
@@ -4068,7 +4074,7 @@ static int gaudi_hw_init(struct hl_device *hdev)
 	return rc;
 }
 
-static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
+static int gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
 {
 	struct cpu_dyn_regs *dyn_regs =
 			&hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs;
@@ -4078,7 +4084,7 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset
 
 	if (!hard_reset) {
 		dev_err(hdev->dev, "GAUDI doesn't support soft-reset\n");
-		return;
+		return 0;
 	}
 
 	if (hdev->pldm) {
@@ -4199,10 +4205,10 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset
 	msleep(reset_timeout_ms);
 
 	status = RREG32(mmPSOC_GLOBAL_CONF_BTM_FSM);
-	if (status & PSOC_GLOBAL_CONF_BTM_FSM_STATE_MASK)
-		dev_err(hdev->dev,
-			"Timeout while waiting for device to reset 0x%x\n",
-			status);
+	if (status & PSOC_GLOBAL_CONF_BTM_FSM_STATE_MASK) {
+		dev_err(hdev->dev, "Timeout while waiting for device to reset 0x%x\n", status);
+		return -ETIMEDOUT;
+	}
 
 	if (gaudi) {
 		gaudi->hw_cap_initialized &= ~(HW_CAP_CPU | HW_CAP_CPU_Q | HW_CAP_HBM |
@@ -4215,6 +4221,7 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset
 		hdev->device_cpu_is_halted = false;
 	}
+	return 0;
 }
 
 static int gaudi_suspend(struct hl_device *hdev)
@@ -7297,7 +7304,7 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e
 }
 
 static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
-					bool razwi, u64 *event_mask)
+					bool check_razwi, u64 *event_mask)
 {
 	bool is_read = false, is_write = false;
 	u16 engine_id[2], num_of_razwi_eng = 0;
@@ -7316,7 +7323,7 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
 	dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n",
 		event_type, desc);
 
-	if (razwi) {
+	if (check_razwi) {
 		gaudi_print_and_get_razwi_info(hdev, &engine_id[0], &engine_id[1], &is_read,
 						&is_write);
 		gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, event_mask);
@@ -7333,8 +7340,9 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
 			num_of_razwi_eng = 1;
 		}
 
-		hl_handle_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags,
-				event_mask);
+		if (razwi_flags)
+			hl_handle_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng,
+					razwi_flags, event_mask);
 	}
 }
@@ -7633,6 +7641,7 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev, u16 event_type,
 static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
 {
 	struct gaudi_device *gaudi = hdev->asic_specific;
+	struct hl_info_fw_err_info fw_err_info;
 	u64 data = le64_to_cpu(eq_entry->data[0]), event_mask = 0;
 	u32 ctl = le32_to_cpu(eq_entry->hdr.ctl);
 	u32 fw_fatal_err_flag = 0, flags = 0;
@@ -7911,7 +7920,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
 	case GAUDI_EVENT_FW_ALIVE_S:
 		gaudi_print_irq_info(hdev, event_type, false, &event_mask);
 		gaudi_print_fw_alive_info(hdev, &eq_entry->fw_alive);
-		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
+		fw_err_info.err_type = HL_INFO_FW_REPORTED_ERR;
+		fw_err_info.event_id = event_type;
+		fw_err_info.event_mask = &event_mask;
+		hl_handle_fw_err(hdev, &fw_err_info);
 		goto reset_device;
 
 	default:
@@ -7942,6 +7954,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
 	}
 
 	if (reset_required) {
+		/* escalate general hw errors to critical/fatal error */
+		if (event_mask & HL_NOTIFIER_EVENT_GENERAL_HW_ERR)
+			hl_handle_critical_hw_err(hdev, event_type, &event_mask);
+
 		hl_device_cond_reset(hdev, flags, event_mask);
 	} else {
 		hl_fw_unmask_irq(hdev, event_type);
@@ -8403,19 +8419,26 @@ static int gaudi_internal_cb_pool_init(struct hl_device *hdev,
 	}
 
 	mutex_lock(&hdev->mmu_lock);
 	rc = hl_mmu_map_contiguous(ctx, hdev->internal_cb_va_base,
 			hdev->internal_cb_pool_dma_addr,
 			HOST_SPACE_INTERNAL_CB_SZ);
-
-	hl_mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR);
-	mutex_unlock(&hdev->mmu_lock);
-
 	if (rc)
 		goto unreserve_internal_cb_pool;
 
+	rc = hl_mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR);
+	if (rc)
+		goto unmap_internal_cb_pool;
+
+	mutex_unlock(&hdev->mmu_lock);
+
 	return 0;
 
+unmap_internal_cb_pool:
+	hl_mmu_unmap_contiguous(ctx, hdev->internal_cb_va_base,
+			HOST_SPACE_INTERNAL_CB_SZ);
 unreserve_internal_cb_pool:
+	mutex_unlock(&hdev->mmu_lock);
 	hl_unreserve_va_block(hdev, ctx, hdev->internal_cb_va_base,
 			HOST_SPACE_INTERNAL_CB_SZ);
 destroy_internal_cb_pool:

File diff suppressed because it is too large


@@ -387,6 +387,8 @@ enum gaudi2_edma_id {
 * We have 64 CQ's per dcore, CQ0 in dcore 0 is reserved for legacy mode
 */
 #define GAUDI2_NUM_USER_INTERRUPTS 255
+#define GAUDI2_NUM_RESERVED_INTERRUPTS 1
+#define GAUDI2_TOTAL_USER_INTERRUPTS (GAUDI2_NUM_USER_INTERRUPTS + GAUDI2_NUM_RESERVED_INTERRUPTS)
 
 enum gaudi2_irq_num {
 	GAUDI2_IRQ_NUM_EVENT_QUEUE = GAUDI2_EVENT_QUEUE_MSIX_IDX,
@@ -410,12 +412,15 @@ enum gaudi2_irq_num {
 	GAUDI2_IRQ_NUM_SHARED_DEC0_ABNRM,
 	GAUDI2_IRQ_NUM_SHARED_DEC1_NRM,
 	GAUDI2_IRQ_NUM_SHARED_DEC1_ABNRM,
+	GAUDI2_IRQ_NUM_DEC_LAST = GAUDI2_IRQ_NUM_SHARED_DEC1_ABNRM,
 	GAUDI2_IRQ_NUM_COMPLETION,
 	GAUDI2_IRQ_NUM_NIC_PORT_FIRST,
 	GAUDI2_IRQ_NUM_NIC_PORT_LAST = (GAUDI2_IRQ_NUM_NIC_PORT_FIRST + NIC_NUMBER_OF_PORTS - 1),
+	GAUDI2_IRQ_NUM_TPC_ASSERT,
 	GAUDI2_IRQ_NUM_RESERVED_FIRST,
-	GAUDI2_IRQ_NUM_RESERVED_LAST = (GAUDI2_MSIX_ENTRIES - GAUDI2_NUM_USER_INTERRUPTS - 1),
-	GAUDI2_IRQ_NUM_USER_FIRST,
+	GAUDI2_IRQ_NUM_RESERVED_LAST = (GAUDI2_MSIX_ENTRIES - GAUDI2_TOTAL_USER_INTERRUPTS - 1),
+	GAUDI2_IRQ_NUM_UNEXPECTED_ERROR = RESERVED_MSIX_UNEXPECTED_USER_ERROR_INTERRUPT,
+	GAUDI2_IRQ_NUM_USER_FIRST = GAUDI2_IRQ_NUM_UNEXPECTED_ERROR + 1,
 	GAUDI2_IRQ_NUM_USER_LAST = (GAUDI2_IRQ_NUM_USER_FIRST + GAUDI2_NUM_USER_INTERRUPTS - 1),
 	GAUDI2_IRQ_NUM_LAST = (GAUDI2_MSIX_ENTRIES - 1)
 };
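As a sanity check on the new vector layout, the constants above (together with GAUDI2_MSIX_ENTRIES and RESERVED_MSIX_UNEXPECTED_USER_ERROR_INTERRUPT from the headers further down in this diff) place the dedicated unexpected-error vector at index 256 and the 255 user vectors at the tail of the 512-entry MSI-X table. A stand-alone sketch that re-derives the numbers:

#include <assert.h>
#include <stdio.h>

/* Values as defined in the Gaudi2 headers touched by this series. */
#define GAUDI2_MSIX_ENTRIES				512
#define GAUDI2_NUM_USER_INTERRUPTS			255
#define GAUDI2_NUM_RESERVED_INTERRUPTS			1
#define GAUDI2_TOTAL_USER_INTERRUPTS \
	(GAUDI2_NUM_USER_INTERRUPTS + GAUDI2_NUM_RESERVED_INTERRUPTS)
#define RESERVED_MSIX_UNEXPECTED_USER_ERROR_INTERRUPT	256

int main(void)
{
	int reserved_last = GAUDI2_MSIX_ENTRIES - GAUDI2_TOTAL_USER_INTERRUPTS - 1;
	int unexpected_error = RESERVED_MSIX_UNEXPECTED_USER_ERROR_INTERRUPT;
	int user_first = unexpected_error + 1;
	int user_last = user_first + GAUDI2_NUM_USER_INTERRUPTS - 1;

	/* The dedicated error vector sits right after the reserved range ... */
	assert(unexpected_error == reserved_last + 1);
	/* ... and the last user vector is the last MSI-X entry. */
	assert(user_last == GAUDI2_MSIX_ENTRIES - 1);

	printf("reserved up to %d, error vector %d, user vectors %d..%d\n",
	       reserved_last, unexpected_error, user_first, user_last);
	return 0;
}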


@@ -2657,7 +2657,7 @@ int gaudi2_coresight_init(struct hl_device *hdev)
	/*
	 * Mask out all the disabled binned offsets.
	 * so when user request to configure a binned or masked out component,
-	 * driver will ignore programing it ( happens when offset value is set to 0x0 )
+	 * driver will ignore programming it ( happens when offset value is set to 0x0 )
	 * this is being set in gaudi2_coresight_set_disabled_components
	 */


@@ -79,7 +79,6 @@
			DCORE0_MME_CTRL_LO_ARCH_STATUS_QM_RDY_MASK)
 
 #define TPC_IDLE_MASK (DCORE0_TPC0_CFG_STATUS_SCALAR_PIPE_EMPTY_MASK | \
-			DCORE0_TPC0_CFG_STATUS_VECTOR_PIPE_EMPTY_MASK | \
			DCORE0_TPC0_CFG_STATUS_IQ_EMPTY_MASK | \
			DCORE0_TPC0_CFG_STATUS_SB_EMPTY_MASK | \
			DCORE0_TPC0_CFG_STATUS_QM_IDLE_MASK | \
@@ -87,6 +86,8 @@
 #define DCORE0_TPC0_QM_CGM_STS_AGENT_IDLE_MASK 0x100
 
+#define DCORE0_TPC0_EML_CFG_DBG_CNT_DBG_EXIT_MASK 0x40
+
 /* CGM_IDLE_MASK is valid for all engines CGM idle check */
 #define CGM_IDLE_MASK DCORE0_TPC0_QM_CGM_STS_AGENT_IDLE_MASK


@@ -1595,6 +1595,7 @@ static const u32 gaudi2_pb_dcr0_tpc0_unsecured_regs[] = {
	mmDCORE0_TPC0_CFG_KERNEL_SRF_30,
	mmDCORE0_TPC0_CFG_KERNEL_SRF_31,
	mmDCORE0_TPC0_CFG_TPC_SB_L0CD,
+	mmDCORE0_TPC0_CFG_TPC_ID,
	mmDCORE0_TPC0_CFG_QM_KERNEL_ID_INC,
	mmDCORE0_TPC0_CFG_QM_TID_BASE_SIZE_HIGH_DIM_0,
	mmDCORE0_TPC0_CFG_QM_TID_BASE_SIZE_HIGH_DIM_1,


@@ -472,6 +472,7 @@ int goya_set_fixed_properties(struct hl_device *hdev)
	prop->max_pending_cs = GOYA_MAX_PENDING_CS;
 
	prop->first_available_user_interrupt = USHRT_MAX;
+	prop->tpc_interrupt_id = USHRT_MAX;
 
	for (i = 0 ; i < HL_MAX_DCORES ; i++)
		prop->first_available_cq[i] = USHRT_MAX;
@@ -668,13 +669,18 @@ static int goya_early_init(struct hl_device *hdev)
	rc = hl_fw_read_preboot_status(hdev);
	if (rc) {
		if (hdev->reset_on_preboot_fail)
+			/* we are already on failure flow, so don't check if hw_fini fails. */
			hdev->asic_funcs->hw_fini(hdev, true, false);
		goto pci_fini;
	}
 
	if (goya_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
		dev_dbg(hdev->dev, "H/W state is dirty, must reset before initializing\n");
-		hdev->asic_funcs->hw_fini(hdev, true, false);
+		rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+		if (rc) {
+			dev_err(hdev->dev, "failed to reset HW in dirty state (%d)\n", rc);
+			goto pci_fini;
+		}
	}
 
	if (!hdev->pldm) {
@@ -2782,7 +2788,7 @@ static int goya_hw_init(struct hl_device *hdev)
	return rc;
 }
 
-static void goya_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
+static int goya_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
 {
	struct goya_device *goya = hdev->asic_specific;
	u32 reset_timeout_ms, cpu_timeout_ms, status;
@@ -2828,17 +2834,17 @@ static void goya_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
	msleep(reset_timeout_ms);
 
	status = RREG32(mmPSOC_GLOBAL_CONF_BTM_FSM);
-	if (status & PSOC_GLOBAL_CONF_BTM_FSM_STATE_MASK)
-		dev_err(hdev->dev,
-			"Timeout while waiting for device to reset 0x%x\n",
-			status);
+	if (status & PSOC_GLOBAL_CONF_BTM_FSM_STATE_MASK) {
+		dev_err(hdev->dev, "Timeout while waiting for device to reset 0x%x\n", status);
+		return -ETIMEDOUT;
+	}
 
	if (!hard_reset && goya) {
		goya->hw_cap_initialized &= ~(HW_CAP_DMA | HW_CAP_MME |
						HW_CAP_GOLDEN | HW_CAP_TPC);
		WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
			GOYA_ASYNC_EVENT_ID_SOFT_RESET);
-		return;
+		return 0;
	}
 
	/* Chicken bit to re-initiate boot sequencer flow */
@@ -2857,6 +2863,7 @@ static void goya_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
		memset(goya->events_stat, 0, sizeof(goya->events_stat));
	}
+	return 0;
 }
 
 int goya_suspend(struct hl_device *hdev)


@@ -357,6 +357,7 @@ struct hl_eq_addr_dec_intr_data {
 struct hl_eq_entry {
	struct hl_eq_header hdr;
	union {
+		__le64 data_placeholder;
		struct hl_eq_ecc_data ecc_data;
		struct hl_eq_hbm_ecc_data hbm_ecc_data; /* Gaudi1 HBM */
		struct hl_eq_sm_sei_data sm_sei_data;
@@ -661,6 +662,9 @@ enum pq_init_status {
 * CPUCP_PACKET_ACTIVE_STATUS_SET -
 *       LKD sends FW indication whether device is free or in use, this indication is reported
 *       also to the BMC.
+ *
+ * CPUCP_PACKET_REGISTER_INTERRUPTS -
+ *       Packet to register interrupts indicating LKD is ready to receive events from FW.
 */
 
 enum cpucp_packet_id {
@@ -725,6 +729,8 @@ enum cpucp_packet_id {
	CPUCP_PACKET_RESERVED9, /* not used */
	CPUCP_PACKET_RESERVED10, /* not used */
	CPUCP_PACKET_RESERVED11, /* not used */
+	CPUCP_PACKET_RESERVED12, /* internal */
+	CPUCP_PACKET_REGISTER_INTERRUPTS, /* internal */
	CPUCP_PACKET_ID_MAX /* must be last */
 };
@@ -1127,6 +1133,7 @@ struct cpucp_security_info {
 *                      (0 = functional 1 = binned)
 * @interposer_version: Interposer version programmed in eFuse
 * @substrate_version: Substrate version programmed in eFuse
+ * @fw_hbm_region_size: Size in bytes of FW reserved region in HBM.
 * @fw_os_version: Firmware OS Version
 */
 struct cpucp_info {
@@ -1154,7 +1161,7 @@ struct cpucp_info {
	__u8 substrate_version;
	__u8 reserved2;
	struct cpucp_security_info sec_info;
-	__le32 reserved3;
+	__le32 fw_hbm_region_size;
	__u8 pll_map[PLL_MAP_LEN];
	__le64 mme_binning_mask;
	__u8 fw_os_version[VERSION_MAX_LEN];


@@ -770,15 +770,23 @@ enum hl_components {
	HL_COMPONENTS_ARMCP,
	HL_COMPONENTS_CPLD,
	HL_COMPONENTS_UBOOT,
+	HL_COMPONENTS_FUSE,
	HL_COMPONENTS_MAX_NUM = 16
 };
 
+#define NAME_MAX_LEN 32 /* bytes */
+
+struct hl_module_data {
+	__u8 name[NAME_MAX_LEN];
+	__u8 version[VERSION_MAX_LEN];
+};
+
 /**
 * struct hl_component_versions - versions associated with hl component.
 * @struct_size: size of all the struct (including dynamic size of modules).
 * @modules_offset: offset of the modules field in this struct.
 * @component: version of the component itself.
 * @fw_os: Firmware OS Version.
+ * @comp_name: Name of the component.
 * @modules_mask: i'th bit (from LSB) is a flag - on if module i in enum
 *                hl_modules is used.
 * @modules_counter: number of set bits in modules_mask.
@@ -791,45 +799,14 @@ struct hl_component_versions {
	__le16 modules_offset;
	__u8 component[VERSION_MAX_LEN];
	__u8 fw_os[VERSION_MAX_LEN];
+	__u8 comp_name[NAME_MAX_LEN];
	__le16 modules_mask;
	__u8 modules_counter;
	__u8 reserved[1];
-	__u8 modules[][VERSION_MAX_LEN];
+	struct hl_module_data modules[];
 };
 
-/**
- * struct hl_fw_versions - all versions (fuse, cpucp's components with their
- *			   modules)
- * @struct_size: size of all the struct (including dynamic size of components).
- * @components_offset: offset of the components field in this struct.
- * @fuse: silicon production FUSE information.
- * @components_mask: i'th bit (from LSB) is a flag - on if component i in enum
- *                   hl_components is used.
- * @components_counter: number of set bits in components_mask.
- * @reserved: reserved for future use.
- * @components: versions of hl components. Index i corresponds to the i'th bit
- *              that is *on* in components_mask. For example, if
- *              components_mask=0b101, then *components represents arcpid and
- *              *(hl_component_versions*)((char*)components + 1') represents
- *              preboot, where 1' = components[0].struct_size.
- */
-struct hl_fw_versions {
-	__le16 struct_size;
-	__le16 components_offset;
-	__u8 fuse[VERSION_MAX_LEN];
-	__le16 components_mask;
-	__u8 components_counter;
-	__u8 reserved[1];
-	struct hl_component_versions components[];
-};
-
-/* Max size of struct hl_component_versions */
-#define HL_COMPONENT_VERSIONS_MAX_SIZE \
-	(sizeof(struct hl_component_versions) + HL_MODULES_MAX_NUM * \
-	VERSION_MAX_LEN)
-
-/* Max size of struct hl_fw_versions */
-#define HL_FW_VERSIONS_MAX_SIZE (sizeof(struct hl_fw_versions) + \
-	HL_COMPONENTS_MAX_NUM * HL_COMPONENT_VERSIONS_MAX_SIZE)
+/* Max size of fit size */
+#define HL_FW_VERSIONS_FIT_SIZE 4096
 
 #endif /* HL_BOOT_IF_H */
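For consumers of the versions blob, the new layout means stepping through fixed-size hl_module_data entries instead of raw VERSION_MAX_LEN strings. A hedged user-space sketch of such a parser, assuming the struct definitions above, le16toh() from <endian.h>, and a buffer already fetched from the firmware that is at least struct_size bytes long (print_component() is illustrative, not driver code):

#include <endian.h>
#include <stdio.h>

/* Illustrative parser for one hl_component_versions blob; not driver code. */
static void print_component(const void *buf)
{
	const struct hl_component_versions *comp = buf;
	const struct hl_module_data *mod;
	unsigned int i;

	printf("component %s (name: %s, fw_os: %s)\n",
	       (const char *)comp->component,
	       (const char *)comp->comp_name,
	       (const char *)comp->fw_os);

	/* modules_offset is the offset of the modules[] array within the blob */
	mod = (const void *)((const char *)buf + le16toh(comp->modules_offset));
	for (i = 0; i < comp->modules_counter; i++)
		printf("  module %s: %s\n",
		       (const char *)mod[i].name, (const char *)mod[i].version);
}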


@@ -164,6 +164,8 @@
 #define mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR 0x4800040
 
+#define mmDCORE0_TPC0_EML_CFG_DBG_CNT 0x40000
+
 #define SM_OBJS_PROT_BITS_OFFS 0x14000
 
 #define DCORE_OFFSET (mmDCORE1_TPC0_QM_BASE - mmDCORE0_TPC0_QM_BASE)
@@ -185,7 +187,10 @@
 #define TPC_CFG_STALL_ON_ERR_OFFSET (mmDCORE0_TPC0_CFG_STALL_ON_ERR - mmDCORE0_TPC0_CFG_BASE)
 #define TPC_CFG_TPC_INTR_MASK_OFFSET (mmDCORE0_TPC0_CFG_TPC_INTR_MASK - mmDCORE0_TPC0_CFG_BASE)
 #define TPC_CFG_MSS_CONFIG_OFFSET (mmDCORE0_TPC0_CFG_MSS_CONFIG - mmDCORE0_TPC0_CFG_BASE)
+#define TPC_EML_CFG_DBG_CNT_OFFSET (mmDCORE0_TPC0_EML_CFG_DBG_CNT - mmDCORE0_TPC0_EML_CFG_BASE)
+#define EDMA_CORE_CFG_STALL_OFFSET (mmDCORE0_EDMA0_CORE_CFG_1 - mmDCORE0_EDMA0_CORE_BASE)
+#define MME_CTRL_LO_QM_STALL_OFFSET (mmDCORE0_MME_CTRL_LO_QM_STALL - mmDCORE0_MME_CTRL_LO_BASE)
 #define MME_ACC_INTR_MASK_OFFSET (mmDCORE0_MME_ACC_INTR_MASK - mmDCORE0_MME_ACC_BASE)
 #define MME_ACC_WR_AXI_AGG_COUT0_OFFSET (mmDCORE0_MME_ACC_WR_AXI_AGG_COUT0 - mmDCORE0_MME_ACC_BASE)
 #define MME_ACC_WR_AXI_AGG_COUT1_OFFSET (mmDCORE0_MME_ACC_WR_AXI_AGG_COUT1 - mmDCORE0_MME_ACC_BASE)


@@ -63,6 +63,8 @@
 #define RESERVED_VA_RANGE_FOR_ARC_ON_HOST_HPAGE_START 0xFFF0F80000000000ull
 #define RESERVED_VA_RANGE_FOR_ARC_ON_HOST_HPAGE_END 0xFFF0FFFFFFFFFFFFull
 
+#define RESERVED_MSIX_UNEXPECTED_USER_ERROR_INTERRUPT 256
+
 #define GAUDI2_MSIX_ENTRIES 512
 
 #define QMAN_PQ_ENTRY_SIZE 16 /* Bytes */


@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0
 *
- * Copyright 2018-2021 HabanaLabs, Ltd.
+ * Copyright 2018-2022 HabanaLabs, Ltd.
 * All Rights Reserved.
 *
 */
@@ -958,7 +958,7 @@ enum gaudi2_async_event_id {
	GAUDI2_EVENT_CPU11_STATUS_NIC11_ENG1 = 1318,
	GAUDI2_EVENT_ARC_DCCM_FULL = 1319,
	GAUDI2_EVENT_CPU_FP32_NOT_SUPPORTED = 1320,
-	GAUDI2_EVENT_DEV_RESET_REQ = 1321,
+	GAUDI2_EVENT_CPU_DEV_RESET_REQ = 1321,
	GAUDI2_EVENT_SIZE,
 };


@@ -63,7 +63,10 @@ struct gaudi2_cold_rst_data {
		u32 fake_sig_validation_en : 1;
		u32 bist_skip_enable : 1;
		u32 bist_need_iatu_config : 1;
-		u32 reserved : 24;
+		u32 fake_bis_compliant : 1;
+		u32 wd_rst_cause_arm : 1;
+		u32 wd_rst_cause_arcpid : 1;
+		u32 reserved : 21;
	};
	__le32 data;
 };


@@ -408,7 +408,8 @@ static inline bool drm_is_render_client(const struct drm_file *file_priv)
 * Returns true if this is an open file of the compute acceleration node, i.e.
 * &drm_file.minor of @file_priv is a accel minor.
 *
- * See also the :ref:`section on accel nodes <drm_accel_node>`.
+ * See also :doc:`Introduction to compute accelerators subsystem
+ * </accel/introduction>`.
 */
 static inline bool drm_is_accel_client(const struct drm_file *file_priv)
 {


@@ -723,6 +723,10 @@ enum hl_server_type {
 * HL_NOTIFIER_EVENT_GENERAL_HW_ERR - Indicates device HW error
 * HL_NOTIFIER_EVENT_RAZWI - Indicates razwi happened
 * HL_NOTIFIER_EVENT_PAGE_FAULT - Indicates page fault happened
+ * HL_NOTIFIER_EVENT_CRITICAL_HW_ERR - Indicates a HW error that requires SW abort and
+ *                                     HW reset
+ * HL_NOTIFIER_EVENT_CRITICAL_FW_ERR - Indicates a FW error that requires SW abort and
+ *                                     HW reset
 */
 #define HL_NOTIFIER_EVENT_TPC_ASSERT (1ULL << 0)
 #define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE (1ULL << 1)
@@ -733,6 +737,8 @@ enum hl_server_type {
 #define HL_NOTIFIER_EVENT_GENERAL_HW_ERR (1ULL << 6)
 #define HL_NOTIFIER_EVENT_RAZWI (1ULL << 7)
 #define HL_NOTIFIER_EVENT_PAGE_FAULT (1ULL << 8)
+#define HL_NOTIFIER_EVENT_CRITICL_HW_ERR (1ULL << 9)
+#define HL_NOTIFIER_EVENT_CRITICL_FW_ERR (1ULL << 10)
 
 /* Opcode for management ioctl
 *
@@ -790,6 +796,8 @@ enum hl_server_type {
 * HL_INFO_PAGE_FAULT_EVENT - Retrieve parameters of captured page fault.
 * HL_INFO_USER_MAPPINGS - Retrieve user mappings, captured after page fault event.
 * HL_INFO_FW_GENERIC_REQ - Send generic request to FW.
+ * HL_INFO_HW_ERR_EVENT - Retrieve information on the reported HW error.
+ * HL_INFO_FW_ERR_EVENT - Retrieve information on the reported FW error.
 */
 #define HL_INFO_HW_IP_INFO 0
 #define HL_INFO_HW_EVENTS 1
@@ -824,6 +832,8 @@ enum hl_server_type {
 #define HL_INFO_PAGE_FAULT_EVENT 33
 #define HL_INFO_USER_MAPPINGS 34
 #define HL_INFO_FW_GENERIC_REQ 35
+#define HL_INFO_HW_ERR_EVENT 36
+#define HL_INFO_FW_ERR_EVENT 37
 
 #define HL_INFO_VERSION_MAX_LEN 128
 #define HL_INFO_CARD_NAME_MAX_LEN 16
@@ -875,6 +885,12 @@ enum hl_server_type {
 *                                 application to use. Relevant for Gaudi2 and later.
 * @device_mem_alloc_default_page_size: default page size used in device memory allocation.
 * @revision_id: PCI revision ID of the ASIC.
+ * @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host.
+ * @rotator_enabled_mask: Bit-mask that represents which rotators are enabled.
+ *                        Relevant for Gaudi3 and later.
+ * @engine_core_interrupt_reg_addr: interrupt register address for engine core to use
+ *                                  in order to raise events toward FW.
+ * @reserved_dram_size: DRAM size reserved for driver and firmware.
 */
 struct hl_info_hw_ip_info {
	__u64 sram_base_address;
@@ -902,15 +918,20 @@ struct hl_info_hw_ip_info {
	__u64 dram_page_size;
	__u32 edma_enabled_mask;
	__u16 number_of_user_interrupts;
-	__u16 pad2;
-	__u64 reserved4;
+	__u8 reserved1;
+	__u8 reserved2;
+	__u64 reserved3;
	__u64 device_mem_alloc_default_page_size;
+	__u64 reserved4;
	__u64 reserved5;
-	__u64 reserved6;
-	__u32 reserved7;
+	__u32 reserved6;
+	__u8 reserved7;
+	__u8 reserved8;
	__u8 revision_id;
-	__u8 pad[2];
+	__u16 tpc_interrupt_id;
+	__u32 rotator_enabled_mask;
+	__u32 reserved9;
+	__u64 engine_core_interrupt_reg_addr;
+	__u64 reserved_dram_size;
 };
 
 struct hl_info_dram_usage {
@@ -1161,6 +1182,39 @@ struct hl_info_undefined_opcode_event {
	__u32 stream_id;
 };
 
+/**
+ * struct hl_info_hw_err_event - info about HW error
+ * @timestamp: timestamp of error occurrence
+ * @event_id: The async event ID (specific to each device type).
+ * @pad: size padding for u64 granularity.
+ */
+struct hl_info_hw_err_event {
+	__s64 timestamp;
+	__u16 event_id;
+	__u16 pad[3];
+};
+
+/* FW error definition for event_type in struct hl_info_fw_err_event */
+enum hl_info_fw_err_type {
+	HL_INFO_FW_HEARTBEAT_ERR,
+	HL_INFO_FW_REPORTED_ERR,
+};
+
+/**
+ * struct hl_info_fw_err_event - info about FW error
+ * @timestamp: time-stamp of error occurrence
+ * @err_type: The type of event as defined in hl_info_fw_err_type.
+ * @event_id: The async event ID (specific to each device type, applicable only when event type is
+ *            HL_INFO_FW_REPORTED_ERR).
+ * @pad: size padding for u64 granularity.
+ */
+struct hl_info_fw_err_event {
+	__s64 timestamp;
+	__u16 err_type;
+	__u16 event_id;
+	__u32 pad;
+};
+
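On the user side, these records are fetched with the same INFO ioctl pattern as the other opcodes in this header. A hedged sketch, assuming HL_IOCTL_INFO and struct hl_info_args behave as for the existing opcodes and that the device FD is already open (read_fw_error() is illustrative, not a reference implementation):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <drm/habanalabs_accel.h>

/* Illustrative: query the last reported FW error after an eventfd notification. */
static int read_fw_error(int fd)
{
	struct hl_info_fw_err_event fw_err;
	struct hl_info_args args;

	memset(&args, 0, sizeof(args));
	memset(&fw_err, 0, sizeof(fw_err));

	args.op = HL_INFO_FW_ERR_EVENT;
	args.return_pointer = (__u64)(uintptr_t)&fw_err;
	args.return_size = sizeof(fw_err);

	if (ioctl(fd, HL_IOCTL_INFO, &args))
		return -1;

	printf("FW error: type %u, event id %u, timestamp %lld\n",
	       fw_err.err_type, fw_err.event_id, (long long)fw_err.timestamp);
	return 0;
}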
 /**
 * struct hl_info_dev_memalloc_page_sizes - valid page sizes in device mem alloc information.
 * @page_order_bitmask: bitmap in which a set bit represents the order of the supported page size
@@ -1486,17 +1540,31 @@ struct hl_cs_chunk {
 */
 #define HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES 0x8000
 
+/*
+ * The engines CS is merged into the existing CS ioctls.
+ * Use it to control engines modes.
+ */
+#define HL_CS_FLAGS_ENGINES_COMMAND 0x10000
+
 #define HL_CS_STATUS_SUCCESS 0
 
 #define HL_MAX_JOBS_PER_CS 512
 
-/* HL_ENGINE_CORE_ values
+/*
+ * enum hl_engine_command - engine command
 *
- * HL_ENGINE_CORE_HALT: engine core halt
- * HL_ENGINE_CORE_RUN: engine core run
+ * @HL_ENGINE_CORE_HALT: engine core halt
+ * @HL_ENGINE_CORE_RUN: engine core run
+ * @HL_ENGINE_STALL: user engine/s stall
+ * @HL_ENGINE_RESUME: user engine/s resume
 */
-#define HL_ENGINE_CORE_HALT (1 << 0)
-#define HL_ENGINE_CORE_RUN (1 << 1)
+enum hl_engine_command {
+	HL_ENGINE_CORE_HALT = 1,
+	HL_ENGINE_CORE_RUN = 2,
+	HL_ENGINE_STALL = 3,
+	HL_ENGINE_RESUME = 4,
+	HL_ENGINE_COMMAND_MAX
+};
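For completeness, the new engine commands ride on the existing CS ioctl: user space passes an array of engine IDs plus one of the commands above and sets HL_CS_FLAGS_ENGINES_COMMAND. A hedged sketch, assuming HL_IOCTL_CS and the hl_cs_args/hl_cs_in layout of this header (stall_engines() and the caller-supplied engine IDs are illustrative):

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <drm/habanalabs_accel.h>

/* Illustrative: stall a set of user engines through the CS ioctl. */
static int stall_engines(int fd, const uint32_t *engine_ids, uint32_t num_engines)
{
	union hl_cs_args args;

	memset(&args, 0, sizeof(args));

	args.in.cs_flags = HL_CS_FLAGS_ENGINES_COMMAND;
	args.in.engines = (__u64)(uintptr_t)engine_ids;
	args.in.num_engines = num_engines;
	args.in.engine_command = HL_ENGINE_STALL;

	return ioctl(fd, HL_IOCTL_CS, &args);
}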
 struct hl_cs_in {
@@ -1520,6 +1588,18 @@ struct hl_cs_in {
			/* the core command to be sent towards engine cores */
			__u32 core_command;
		};
+
+		/* Valid only when HL_CS_FLAGS_ENGINES_COMMAND is set */
+		struct {
+			/* this holds address of array of uint32 for engines */
+			__u64 engines;
+
+			/* number of engines in engines array */
+			__u32 num_engines;
+
+			/* the engine command to be sent towards engines */
+			__u32 engine_command;
+		};
	};
 
	union {