drm/amd/amdgpu: add missing mutex lock to amdgpu_get_xgmi_hive() (v3)

v2: Move locks around in other functions so that this
function can stand on its own.  Also hold only the
hive-specific lock, rather than the driver-global lock,
during add/remove device, so that devices cannot be added
to or removed from the same hive in parallel.

v3: add reset_lock

Acked-by: Shaoyun.liu <Shaoyun.liu@amd.com>
Signed-off-by: Tom St Denis <tom.stdenis@amd.com>
Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Tom St Denis 2019-01-07 17:39:10 -05:00 committed by Alex Deucher
parent 4b9674e509
commit 22d6575b8d
3 changed files with 32 additions and 19 deletions

View File

@ -3525,9 +3525,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
* by different nodes. No point also since the one node already executing * by different nodes. No point also since the one node already executing
* reset will also reset all the other nodes in the hive. * reset will also reset all the other nodes in the hive.
*/ */
hive = amdgpu_get_xgmi_hive(adev); hive = amdgpu_get_xgmi_hive(adev, 0);
if (hive && adev->gmc.xgmi.num_physical_nodes > 1 && if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
!mutex_trylock(&hive->hive_lock)) !mutex_trylock(&hive->reset_lock))
return 0; return 0;
/* Start with adev pre asic reset first for soft reset check.*/ /* Start with adev pre asic reset first for soft reset check.*/
@ -3606,7 +3606,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
} }
if (hive && adev->gmc.xgmi.num_physical_nodes > 1) if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
mutex_unlock(&hive->hive_lock); mutex_unlock(&hive->reset_lock);
if (r) if (r)
dev_info(adev->dev, "GPU reset end with ret = %d\n", r); dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

View File

@ -40,26 +40,40 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
return &hive->device_list; return &hive->device_list;
} }
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
{ {
int i; int i;
struct amdgpu_hive_info *tmp; struct amdgpu_hive_info *tmp;
if (!adev->gmc.xgmi.hive_id) if (!adev->gmc.xgmi.hive_id)
return NULL; return NULL;
mutex_lock(&xgmi_mutex);
for (i = 0 ; i < hive_count; ++i) { for (i = 0 ; i < hive_count; ++i) {
tmp = &xgmi_hives[i]; tmp = &xgmi_hives[i];
if (tmp->hive_id == adev->gmc.xgmi.hive_id) if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
if (lock)
mutex_lock(&tmp->hive_lock);
mutex_unlock(&xgmi_mutex);
return tmp; return tmp;
}
} }
if (i >= AMDGPU_MAX_XGMI_HIVE) if (i >= AMDGPU_MAX_XGMI_HIVE) {
mutex_unlock(&xgmi_mutex);
return NULL; return NULL;
}
/* initialize new hive if not exist */ /* initialize new hive if not exist */
tmp = &xgmi_hives[hive_count++]; tmp = &xgmi_hives[hive_count++];
tmp->hive_id = adev->gmc.xgmi.hive_id; tmp->hive_id = adev->gmc.xgmi.hive_id;
INIT_LIST_HEAD(&tmp->device_list); INIT_LIST_HEAD(&tmp->device_list);
mutex_init(&tmp->hive_lock); mutex_init(&tmp->hive_lock);
mutex_init(&tmp->reset_lock);
if (lock)
mutex_lock(&tmp->hive_lock);
mutex_unlock(&xgmi_mutex);
return tmp; return tmp;
} }
@ -111,8 +125,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
return ret; return ret;
} }
mutex_lock(&xgmi_mutex); hive = amdgpu_get_xgmi_hive(adev, 1);
hive = amdgpu_get_xgmi_hive(adev);
if (!hive) { if (!hive) {
ret = -EINVAL; ret = -EINVAL;
dev_err(adev->dev, dev_err(adev->dev,
@ -147,8 +160,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
break; break;
} }
mutex_unlock(&hive->hive_lock);
exit: exit:
mutex_unlock(&xgmi_mutex);
return ret; return ret;
} }
@ -159,15 +172,14 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
if (!adev->gmc.xgmi.supported) if (!adev->gmc.xgmi.supported)
return; return;
mutex_lock(&xgmi_mutex); hive = amdgpu_get_xgmi_hive(adev, 1);
hive = amdgpu_get_xgmi_hive(adev);
if (!hive) if (!hive)
goto exit; return;
if (!(hive->number_devices--)) if (!(hive->number_devices--)) {
mutex_destroy(&hive->hive_lock); mutex_destroy(&hive->hive_lock);
mutex_destroy(&hive->reset_lock);
exit: } else {
mutex_unlock(&xgmi_mutex); mutex_unlock(&hive->hive_lock);
}
} }

View File

@ -29,10 +29,11 @@ struct amdgpu_hive_info {
struct list_head device_list; struct list_head device_list;
struct psp_xgmi_topology_info topology_info; struct psp_xgmi_topology_info topology_info;
int number_devices; int number_devices;
struct mutex hive_lock; struct mutex hive_lock,
reset_lock;
}; };
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev); struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev); int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
int amdgpu_xgmi_add_device(struct amdgpu_device *adev); int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
void amdgpu_xgmi_remove_device(struct amdgpu_device *adev); void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);