drm/amd/amdgpu: add missing mutex lock to amdgpu_get_xgmi_hive() (v3)

v2: Move locks around in other functions so that this
function can stand on its own.  Also only hold the hive
specific lock for add/remove device instead of the driver
global lock so you can't add/remove devices in parallel from
one hive.

v3: add reset_lock

Acked-by:  Shaoyun.liu < Shaoyun.liu@amd.com>
Signed-off-by: Tom St Denis <tom.stdenis@amd.com>
Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Tom St Denis 2019-01-07 17:39:10 -05:00 committed by Alex Deucher
parent 4b9674e509
commit 22d6575b8d
3 changed files with 32 additions and 19 deletions

View File

@ -3525,9 +3525,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
* by different nodes. No point also since the one node already executing
* reset will also reset all the other nodes in the hive.
*/
hive = amdgpu_get_xgmi_hive(adev);
hive = amdgpu_get_xgmi_hive(adev, 0);
if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
!mutex_trylock(&hive->hive_lock))
!mutex_trylock(&hive->reset_lock))
return 0;
/* Start with adev pre asic reset first for soft reset check.*/
@ -3606,7 +3606,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
}
if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
mutex_unlock(&hive->hive_lock);
mutex_unlock(&hive->reset_lock);
if (r)
dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

View File

@ -40,26 +40,40 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
return &hive->device_list;
}
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
{
int i;
struct amdgpu_hive_info *tmp;
if (!adev->gmc.xgmi.hive_id)
return NULL;
mutex_lock(&xgmi_mutex);
for (i = 0 ; i < hive_count; ++i) {
tmp = &xgmi_hives[i];
if (tmp->hive_id == adev->gmc.xgmi.hive_id)
if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
if (lock)
mutex_lock(&tmp->hive_lock);
mutex_unlock(&xgmi_mutex);
return tmp;
}
}
if (i >= AMDGPU_MAX_XGMI_HIVE)
if (i >= AMDGPU_MAX_XGMI_HIVE) {
mutex_unlock(&xgmi_mutex);
return NULL;
}
/* initialize new hive if not exist */
tmp = &xgmi_hives[hive_count++];
tmp->hive_id = adev->gmc.xgmi.hive_id;
INIT_LIST_HEAD(&tmp->device_list);
mutex_init(&tmp->hive_lock);
mutex_init(&tmp->reset_lock);
if (lock)
mutex_lock(&tmp->hive_lock);
mutex_unlock(&xgmi_mutex);
return tmp;
}
@ -111,8 +125,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
return ret;
}
mutex_lock(&xgmi_mutex);
hive = amdgpu_get_xgmi_hive(adev);
hive = amdgpu_get_xgmi_hive(adev, 1);
if (!hive) {
ret = -EINVAL;
dev_err(adev->dev,
@ -147,8 +160,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
break;
}
mutex_unlock(&hive->hive_lock);
exit:
mutex_unlock(&xgmi_mutex);
return ret;
}
@ -159,15 +172,14 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
if (!adev->gmc.xgmi.supported)
return;
mutex_lock(&xgmi_mutex);
hive = amdgpu_get_xgmi_hive(adev);
hive = amdgpu_get_xgmi_hive(adev, 1);
if (!hive)
goto exit;
return;
if (!(hive->number_devices--))
if (!(hive->number_devices--)) {
mutex_destroy(&hive->hive_lock);
exit:
mutex_unlock(&xgmi_mutex);
mutex_destroy(&hive->reset_lock);
} else {
mutex_unlock(&hive->hive_lock);
}
}

View File

@ -29,10 +29,11 @@ struct amdgpu_hive_info {
struct list_head device_list;
struct psp_xgmi_topology_info topology_info;
int number_devices;
struct mutex hive_lock;
struct mutex hive_lock,
reset_lock;
};
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);