diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 0565571f4..bc5504376 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2481,3 +2481,14 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     func(tensor->src0, tensor->src1, tensor);
     return true;
 }
+
+bool ggml_cuda_get_data(struct ggml_tensor * tensor, size_t offset, size_t size, void * dst) {
+    // TODO: do we need to support GGML_BACKEND_GPU_SPLIT tensors here?
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+    char * src_ptr = (char *) extra->data_device[g_main_device];
+
+    CUDA_CHECK(cudaMemcpy(dst, src_ptr + offset, size, cudaMemcpyDeviceToHost));
+
+    return true;
+}
\ No newline at end of file
diff --git a/ggml-cuda.h b/ggml-cuda.h
index d32b44842..a75bf21a7 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -34,6 +34,8 @@ void ggml_cuda_set_scratch_size(size_t scratch_size);
 void ggml_cuda_free_scratch(void);
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+bool ggml_cuda_get_data(struct ggml_tensor * tensor, size_t offset, size_t size, void * dst);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index 5df922abd..ee37b4053 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -1193,3 +1193,9 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
     tensor->data = dst;
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
+
+bool ggml_cl_get_data(struct ggml_tensor * tensor, size_t offset, size_t size, void * dst) {
+    CL_CHECK(clEnqueueReadBuffer(queue, (cl_mem) tensor->data, true, offset, size, dst, 0, NULL, NULL));
+    CL_CHECK(clFinish(queue));
+    return true;
+}
diff --git a/ggml-opencl.h b/ggml-opencl.h
index a92b445c9..3c0d9788b 100644
--- a/ggml-opencl.h
+++ b/ggml-opencl.h
@@ -20,6 +20,8 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor);
 
 void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
 
+bool ggml_cl_get_data(struct ggml_tensor * tensor, size_t offset, size_t size, void * dst);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/llama.cpp b/llama.cpp
index b8bc0d821..879041346 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1740,7 +1740,25 @@ static bool llama_eval_internal(
         auto & embedding_out = lctx.embedding;
 
         embedding_out.resize(n_embd);
-        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
+        switch (embeddings->backend)
+        {
+        case GGML_BACKEND_CPU:
+            memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
+            break;
+#if defined(GGML_USE_CUBLAS)
+        case GGML_BACKEND_GPU:
+        case GGML_BACKEND_GPU_SPLIT:
+            ggml_cuda_get_data(embeddings, (n_embd*(N - 1)) * sizeof(float), n_embd * sizeof(float), embedding_out.data());
+            break;
+#elif defined(GGML_USE_CLBLAST)
+        case GGML_BACKEND_GPU:
+        case GGML_BACKEND_GPU_SPLIT:
+            ggml_cl_get_data(embeddings, (n_embd*(N - 1)) * sizeof(float), n_embd * sizeof(float), embedding_out.data());
+            break;
+#endif
+        }
+
+
     }
 
     if (mem_per_token == 0) {
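For context, here is a minimal sketch of how a caller outside llama.cpp might use the new getters to read a tensor row back from the device. The helper name read_last_embedding, its parameters, and the surrounding setup are hypothetical; only ggml_get_data, GGML_BACKEND_CPU, and the two functions added in this diff come from the source, and the backend dispatch simply mirrors the llama.cpp hunk above.

// Hypothetical helper (not part of this patch): copy the last row of an f32
// [n_embd, N] tensor into host memory, using the getters added in this diff.
#include <cstring>
#include <vector>
#include "ggml.h"
#if defined(GGML_USE_CUBLAS)
#include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#endif

static std::vector<float> read_last_embedding(struct ggml_tensor * embeddings, int n_embd, int N) {
    std::vector<float> out(n_embd);
    const size_t offset = (size_t) n_embd * (N - 1) * sizeof(float); // start of the last row
    const size_t size   = (size_t) n_embd * sizeof(float);           // one row of floats

    if (embeddings->backend == GGML_BACKEND_CPU) {
        // CPU tensors can be copied straight out of the graph buffer
        std::memcpy(out.data(), (const char *) ggml_get_data(embeddings) + offset, size);
    } else {
#if defined(GGML_USE_CUBLAS)
        ggml_cuda_get_data(embeddings, offset, size, out.data()); // device-to-host cudaMemcpy (this diff)
#elif defined(GGML_USE_CLBLAST)
        ggml_cl_get_data(embeddings, offset, size, out.data());   // blocking OpenCL read (this diff)
#endif
    }
    return out;
}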