mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2024-09-15 15:15:47 +00:00
accel/habanalabs: set device status 'malfunction' while in rmmod
hl_device_status() returns the status of an acquired device. If a device is going down (following an rmmod command), it should be marked as an unusable/malfunctioning device, and hence should not be acquired. However, this was not the case so far: a device going down would inaccurately return the 'in reset' status, allowing the user to acquire the device. This introduced a bug where, as part of a reset flow, the driver could not kill processes that had not run yet, and since those processes were not blocked from reacquiring the device, we would eventually get a new flow in which the driver attempts to kill all processes in a list that can never really become empty. Signed-off-by: Koby Elbaz <kelbaz@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
parent
e7b2902a33
commit
e4a97d6b62
1 changed files with 4 additions and 2 deletions
|
@ -315,7 +315,9 @@ enum hl_device_status hl_device_status(struct hl_device *hdev)
|
||||||
{
|
{
|
||||||
enum hl_device_status status;
|
enum hl_device_status status;
|
||||||
|
|
||||||
if (hdev->reset_info.in_reset) {
|
if (hdev->device_fini_pending) {
|
||||||
|
status = HL_DEVICE_STATUS_MALFUNCTION;
|
||||||
|
} else if (hdev->reset_info.in_reset) {
|
||||||
if (hdev->reset_info.in_compute_reset)
|
if (hdev->reset_info.in_compute_reset)
|
||||||
status = HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE;
|
status = HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE;
|
||||||
else
|
else
|
||||||
|
@ -343,9 +345,9 @@ bool hl_device_operational(struct hl_device *hdev,
|
||||||
*status = current_status;
|
*status = current_status;
|
||||||
|
|
||||||
switch (current_status) {
|
switch (current_status) {
|
||||||
|
case HL_DEVICE_STATUS_MALFUNCTION:
|
||||||
case HL_DEVICE_STATUS_IN_RESET:
|
case HL_DEVICE_STATUS_IN_RESET:
|
||||||
case HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE:
|
case HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE:
|
||||||
case HL_DEVICE_STATUS_MALFUNCTION:
|
|
||||||
case HL_DEVICE_STATUS_NEEDS_RESET:
|
case HL_DEVICE_STATUS_NEEDS_RESET:
|
||||||
return false;
|
return false;
|
||||||
case HL_DEVICE_STATUS_OPERATIONAL:
|
case HL_DEVICE_STATUS_OPERATIONAL:
|
||||||
|
|
Loading…
Reference in a new issue