diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index ee37b4053..fe0a5e678 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -1195,6 +1195,8 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
 }
 
 bool ggml_cl_get_data(struct ggml_tensor * tensor, size_t offset, size_t size, void * dst) {
-    CL_CHECK(clEnqueueReadBuffer(queue, tensor->data, true, offset, size, dst, 0, NULL, NULL));
+    CL_CHECK(clEnqueueReadBuffer(queue, (cl_mem)tensor->data, true, offset, size, dst, 0, NULL, NULL));
     CL_CHECK(clFinish(queue));
+
+    return true;
 }
diff --git a/llama.cpp b/llama.cpp
index 879041346..a9e1510d5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1742,9 +1742,6 @@ static bool llama_eval_internal(
         embedding_out.resize(n_embd);
 
         switch(embeddings->backend) {
-            case GGML_BACKEND_CPU:
-                memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
-                break;
 #if defined(GGML_USE_CUBLAS)
             case GGML_BACKEND_GPU:
             case GGML_BACKEND_GPU_SPLIT:
@@ -1753,9 +1750,13 @@ static bool llama_eval_internal(
 #elif defined(GGML_USE_CLBLAST)
             case GGML_BACKEND_GPU:
             case GGML_BACKEND_GPU_SPLIT:
-                ggml_cuda_get_data(embeddings, (n_embd*(N - 1)) * sizeof(float), n_embd * sizeof(float), embedding_out.data());
+                ggml_cl_get_data(embeddings, (n_embd*(N - 1)) * sizeof(float), n_embd * sizeof(float), embedding_out.data());
                 break;
 #endif
+            case GGML_BACKEND_CPU:
+            default:
+                memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
+                break;
         }
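
Note on the first hunk: ggml stores a tensor's device allocation in the generic `void * data` field of ggml_tensor, while clEnqueueReadBuffer expects a `cl_mem` handle, so the explicit cast is needed for the call to compile; the added `return true;` satisfies the function's bool signature. Below is a minimal standalone sketch of that read path, with an illustrative CL_CHECK stand-in and a hypothetical helper name (neither is the definition from ggml-opencl.cpp):

#include <CL/cl.h>
#include <cstdio>
#include <cstdlib>

// Illustrative stand-in for the CL_CHECK macro in ggml-opencl.cpp.
#define CL_CHECK(call)                                              \
    do {                                                            \
        cl_int err_ = (call);                                       \
        if (err_ != CL_SUCCESS) {                                   \
            fprintf(stderr, "OpenCL error %d at %s:%d\n",           \
                    err_, __FILE__, __LINE__);                      \
            exit(1);                                                \
        }                                                           \
    } while (0)

// Hypothetical helper mirroring the patched ggml_cl_get_data():
// `data` is a void * field (as in ggml_tensor), but for GPU-resident
// tensors it actually holds a cl_mem handle, hence the cast.
static bool read_device_buffer(cl_command_queue queue, void * data,
                               size_t offset, size_t size, void * dst) {
    CL_CHECK(clEnqueueReadBuffer(queue, (cl_mem) data, CL_TRUE,
                                 offset, size, dst, 0, NULL, NULL));
    CL_CHECK(clFinish(queue)); // redundant with the blocking read, but matches the patch
    return true;               // added by the patch so the bool signature is satisfied
}

On the second hunk: pairing GGML_BACKEND_CPU with `default:` after the preprocessor block keeps the memcpy path reachable in builds compiled without CUDA or CLBlast, and the CLBlast branch now calls ggml_cl_get_data rather than a CUDA accessor that does not exist in a CLBlast-only build.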