mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2024-10-01 06:33:07 +00:00
drm/amd/amdgpu: add missing mutex lock to amdgpu_get_xgmi_hive() (v3)
v2: Move locks around in other functions so that this function can stand on its own. Also only hold the hive-specific lock for add/remove device instead of the driver-global lock, so you can't add/remove devices in parallel from one hive.
v3: Add reset_lock.
Acked-by: Shaoyun.liu <Shaoyun.liu@amd.com>
Signed-off-by: Tom St Denis <tom.stdenis@amd.com>
Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
4b9674e509
commit
22d6575b8d
3 changed files with 32 additions and 19 deletions
|
@ -3525,9 +3525,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||||
* by different nodes. No point also since the one node already executing
|
* by different nodes. No point also since the one node already executing
|
||||||
* reset will also reset all the other nodes in the hive.
|
* reset will also reset all the other nodes in the hive.
|
||||||
*/
|
*/
|
||||||
hive = amdgpu_get_xgmi_hive(adev);
|
hive = amdgpu_get_xgmi_hive(adev, 0);
|
||||||
if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
|
if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
|
||||||
!mutex_trylock(&hive->hive_lock))
|
!mutex_trylock(&hive->reset_lock))
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
/* Start with adev pre asic reset first for soft reset check.*/
|
/* Start with adev pre asic reset first for soft reset check.*/
|
||||||
|
@ -3606,7 +3606,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
|
if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
|
||||||
mutex_unlock(&hive->hive_lock);
|
mutex_unlock(&hive->reset_lock);
|
||||||
|
|
||||||
if (r)
|
if (r)
|
||||||
dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
|
dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
|
||||||
|
|
|
@ -40,26 +40,40 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
|
||||||
return &hive->device_list;
|
return &hive->device_list;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
|
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
struct amdgpu_hive_info *tmp;
|
struct amdgpu_hive_info *tmp;
|
||||||
|
|
||||||
if (!adev->gmc.xgmi.hive_id)
|
if (!adev->gmc.xgmi.hive_id)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
mutex_lock(&xgmi_mutex);
|
||||||
|
|
||||||
for (i = 0 ; i < hive_count; ++i) {
|
for (i = 0 ; i < hive_count; ++i) {
|
||||||
tmp = &xgmi_hives[i];
|
tmp = &xgmi_hives[i];
|
||||||
if (tmp->hive_id == adev->gmc.xgmi.hive_id)
|
if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
|
||||||
|
if (lock)
|
||||||
|
mutex_lock(&tmp->hive_lock);
|
||||||
|
mutex_unlock(&xgmi_mutex);
|
||||||
return tmp;
|
return tmp;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (i >= AMDGPU_MAX_XGMI_HIVE)
|
if (i >= AMDGPU_MAX_XGMI_HIVE) {
|
||||||
|
mutex_unlock(&xgmi_mutex);
|
||||||
return NULL;
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
/* initialize new hive if not exist */
|
/* initialize new hive if not exist */
|
||||||
tmp = &xgmi_hives[hive_count++];
|
tmp = &xgmi_hives[hive_count++];
|
||||||
tmp->hive_id = adev->gmc.xgmi.hive_id;
|
tmp->hive_id = adev->gmc.xgmi.hive_id;
|
||||||
INIT_LIST_HEAD(&tmp->device_list);
|
INIT_LIST_HEAD(&tmp->device_list);
|
||||||
mutex_init(&tmp->hive_lock);
|
mutex_init(&tmp->hive_lock);
|
||||||
|
mutex_init(&tmp->reset_lock);
|
||||||
|
if (lock)
|
||||||
|
mutex_lock(&tmp->hive_lock);
|
||||||
|
|
||||||
|
mutex_unlock(&xgmi_mutex);
|
||||||
|
|
||||||
return tmp;
|
return tmp;
|
||||||
}
|
}
|
||||||
|
@ -111,8 +125,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
mutex_lock(&xgmi_mutex);
|
hive = amdgpu_get_xgmi_hive(adev, 1);
|
||||||
hive = amdgpu_get_xgmi_hive(adev);
|
|
||||||
if (!hive) {
|
if (!hive) {
|
||||||
ret = -EINVAL;
|
ret = -EINVAL;
|
||||||
dev_err(adev->dev,
|
dev_err(adev->dev,
|
||||||
|
@ -147,8 +160,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mutex_unlock(&hive->hive_lock);
|
||||||
exit:
|
exit:
|
||||||
mutex_unlock(&xgmi_mutex);
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -159,15 +172,14 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
|
||||||
if (!adev->gmc.xgmi.supported)
|
if (!adev->gmc.xgmi.supported)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
mutex_lock(&xgmi_mutex);
|
hive = amdgpu_get_xgmi_hive(adev, 1);
|
||||||
|
|
||||||
hive = amdgpu_get_xgmi_hive(adev);
|
|
||||||
if (!hive)
|
if (!hive)
|
||||||
goto exit;
|
return;
|
||||||
|
|
||||||
if (!(hive->number_devices--))
|
if (!(hive->number_devices--)) {
|
||||||
mutex_destroy(&hive->hive_lock);
|
mutex_destroy(&hive->hive_lock);
|
||||||
|
mutex_destroy(&hive->reset_lock);
|
||||||
exit:
|
} else {
|
||||||
mutex_unlock(&xgmi_mutex);
|
mutex_unlock(&hive->hive_lock);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,10 +29,11 @@ struct amdgpu_hive_info {
|
||||||
struct list_head device_list;
|
struct list_head device_list;
|
||||||
struct psp_xgmi_topology_info topology_info;
|
struct psp_xgmi_topology_info topology_info;
|
||||||
int number_devices;
|
int number_devices;
|
||||||
struct mutex hive_lock;
|
struct mutex hive_lock,
|
||||||
|
reset_lock;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
|
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
|
||||||
int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
|
int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
|
||||||
int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
|
int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
|
||||||
void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
|
void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
|
||||||
|
|
Loading…
Reference in a new issue