OpenCL: Fix duplication of layers in VRAM and RAM, add GPU mul kernel (#1653)
* Use events instead of clFinish, where possible
* OpenCL: Don't load gpu layers into RAM, add mul_f32 kernel
* Reduce queueing overhead for contiguous tensors by using single mul kernel call
* Adapt to #1612 cl_mem malloc changes
* Reduce code duplication between cuda and opencl branches
* Improve implementation
This commit is contained in:
parent
d8bd0013e8
commit
dcb2ed4826
4 changed files with 210 additions and 40 deletions
7
ggml.c
7
ggml.c
|
@@ -8134,6 +8134,13 @@ static void ggml_compute_forward_mul_f32(
|
|||
}
|
||||
return;
|
||||
}
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
if (src1->backend == GGML_BACKEND_CL) {
|
||||
if (ith == 0) {
|
||||
ggml_cl_mul(src0, src1, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
const int64_t nr = ggml_nrows(src0);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue