Fix embedding when embedding layer on GPU
parent 254a7a7a5f
commit 77ab0c0f3d
5 changed files with 40 additions and 1 deletion
ggml-cuda.cu
@@ -2481,3 +2481,14 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     func(tensor->src0, tensor->src1, tensor);
     return true;
 }
+
+bool ggml_cuda_get_data(struct ggml_tensor * tensor, size_t offset, size_t size, void * dst) {
+    // TODO: do we need to support split tensors?
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+    char * src_ptr = (char *) extra->data_device[g_main_device];
+
+    CUDA_CHECK(cudaMemcpy(dst, src_ptr + offset, size, cudaMemcpyDeviceToHost));
+
+    return true;
+}
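Note: the new helper is a thin wrapper around a device-to-host cudaMemcpy at a byte offset into the tensor's device allocation. A minimal standalone sketch of that same pattern follows; the buffer names and sizes here are illustrative, not code from this commit.

// Standalone sketch of the D2H copy pattern ggml_cuda_get_data() wraps:
// copy `size` bytes, starting `offset` bytes into a device allocation,
// back to host memory. Names are hypothetical, not from the commit.
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    const size_t n = 1024, offset = 256, size = 128;
    char * d_buf = NULL;
    char * h_dst = (char *) malloc(size);

    cudaMalloc((void **) &d_buf, n);
    cudaMemset(d_buf, 7, n);

    // the same call the helper issues, minus ggml's extra/device bookkeeping
    cudaError_t err = cudaMemcpy(h_dst, d_buf + offset, size, cudaMemcpyDeviceToHost);
    printf("status: %s, first byte: %d\n", cudaGetErrorString(err), h_dst[0]);

    cudaFree(d_buf);
    free(h_dst);
    return 0;
}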
ggml-cuda.h
@@ -34,6 +34,8 @@ void ggml_cuda_set_scratch_size(size_t scratch_size);
 void ggml_cuda_free_scratch(void);
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 
+bool ggml_cuda_get_data(struct ggml_tensor * tensor, size_t offset, size_t size, void * dst);
+
 #ifdef __cplusplus
 }
 #endif
ggml-opencl.cpp
@@ -1193,3 +1193,9 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
     tensor->data = dst;
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
+
+bool ggml_cl_get_data(struct ggml_tensor * tensor, size_t offset, size_t size, void * dst) {
+    CL_CHECK(clEnqueueReadBuffer(queue, (cl_mem) tensor->data, true, offset, size, dst, 0, NULL, NULL));
+    CL_CHECK(clFinish(queue));
+    return true;
+}
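Note: a blocking read (third argument true/CL_TRUE) already guarantees the copy has completed before clEnqueueReadBuffer returns, so the trailing clFinish is defensive. A self-contained sketch of the same (offset, size) read contract on a standalone buffer, with hypothetical names not taken from this commit:

// Sketch of the blocking-read pattern ggml_cl_get_data() uses, on a
// standalone OpenCL buffer. All names here are illustrative.
#include <CL/cl.h>
#include <stdio.h>

int main(void) {
    cl_platform_id platform;
    cl_device_id   device;
    clGetPlatformIDs(1, &platform, NULL);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL);
    cl_context       ctx   = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
    cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, NULL);

    float src[4] = {1, 2, 3, 4}, dst[2] = {0};
    cl_mem buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                sizeof(src), src, NULL);

    // read 2 floats starting at byte offset sizeof(float)*2 -- mirrors the
    // (offset, size) arguments of ggml_cl_get_data()
    clEnqueueReadBuffer(queue, buf, CL_TRUE, sizeof(float)*2, sizeof(float)*2,
                        dst, 0, NULL, NULL);
    printf("%f %f\n", dst[0], dst[1]); // expected: 3.0 4.0

    clReleaseMemObject(buf);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
    return 0;
}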
ggml-opencl.h
@@ -20,6 +20,8 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor);
 
 void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
 
+bool ggml_cl_get_data(struct ggml_tensor * tensor, size_t offset, size_t size, void * dst);
+
 #ifdef __cplusplus
 }
 #endif
llama.cpp
@@ -1740,7 +1740,25 @@ static bool llama_eval_internal(
         auto & embedding_out = lctx.embedding;
 
         embedding_out.resize(n_embd);
-        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
+        switch(embeddings->backend)
+        {
+        case GGML_BACKEND_CPU:
+            memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
+            break;
+#if defined(GGML_USE_CUBLAS)
+        case GGML_BACKEND_GPU:
+        case GGML_BACKEND_GPU_SPLIT:
+            ggml_cuda_get_data(embeddings, (n_embd*(N - 1)) * sizeof(float), n_embd * sizeof(float), embedding_out.data());
+            break;
+#elif defined(GGML_USE_CLBLAST)
+        case GGML_BACKEND_GPU:
+        case GGML_BACKEND_GPU_SPLIT:
+            ggml_cl_get_data(embeddings, (n_embd*(N - 1)) * sizeof(float), n_embd * sizeof(float), embedding_out.data());
+            break;
+#endif
+        }
+
+
 }
 
     if (mem_per_token == 0) {
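Note: the offset (n_embd*(N - 1)) * sizeof(float) selects the last token's row of the embeddings tensor, and n_embd * sizeof(float) copies exactly one row, regardless of which backend holds the data. Downstream, callers read the result through llama.h's existing accessors; a minimal sketch, assuming a llama_context created with the embedding flag set and already evaluated:

// Sketch of consuming lctx.embedding (filled by the switch above) via the
// public API; assumes llama_eval() has already run on this context.
#include "llama.h"
#include <stdio.h>

static void print_embedding(struct llama_context * ctx) {
    const int     n_embd = llama_n_embd(ctx);
    const float * embd   = llama_get_embeddings(ctx);
    for (int i = 0; i < n_embd; i++) {
        printf("%f ", embd[i]);
    }
    printf("\n");
}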