cuda : fix vmm pool with multi GPU (#4620)
* cuda : fix vmm pool with multi GPU
* hip
* use recommended granularity instead of minimum
* better error checking
* fix mixtral
* use cudaMemcpy3DPeerAsync
* use cuda_pool_alloc in ggml_cuda_op_mul_mat
* consolidate error checking in ggml_cuda_set_device
* remove unnecessary inlines

ggml-ci

* style fixes
* only use vmm for the main device
* fix scratch buffer size, re-enable vmm pool for all devices
* remove unnecessary check id != g_main_device
This commit is contained in:
parent
de8e496437
commit
dc68f0054c
3 changed files with 243 additions and 246 deletions
3
ggml.c
3
ggml.c
|
@@ -4041,7 +4041,6 @@ static struct ggml_tensor * ggml_group_norm_impl(
|
|||
result->op = GGML_OP_GROUP_NORM;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
result->src[1] = NULL; // TODO: maybe store epsilon here?
|
||||
|
||||
return result;
|
||||
}
|
||||
|
@@ -5541,7 +5540,6 @@ static struct ggml_tensor * ggml_upscale_impl(
|
|||
result->op_params[0] = scale_factor;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
result->src[1] = NULL;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
@@ -5846,7 +5844,6 @@ struct ggml_tensor * ggml_get_rel_pos(
|
|||
result->op = GGML_OP_GET_REL_POS;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
result->src[1] = NULL;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue