Tweak GPU offload when skipping unused logits computations
parent 5553820d90
commit 3be6e8d36f
1 changed file with 9 additions and 5 deletions
llama.cpp | 14 +++++++++-----
@@ -2191,9 +2191,10 @@ static struct ggml_cgraph * llm_build_llama(
     //
     // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
     // in that case ggml_cuda_assign_buffers has no effect
-    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
-    offload_func_t offload_func_kq = llama_nop;
-    offload_func_t offload_func_v  = llama_nop;
+    offload_func_t offload_func_nr   = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq   = llama_nop;
+    offload_func_t offload_func_v    = llama_nop;
+    offload_func_t offload_func_skip = llama_nop;
 
 #ifdef GGML_USE_CUBLAS
     if (n_gpu_layers > n_layer) {
@@ -2205,6 +2206,9 @@ static struct ggml_cgraph * llm_build_llama(
     if (n_gpu_layers > n_layer + 2) {
         offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
     }
+    if (n_gpu_layers > 0) {
+        offload_func_skip = ggml_cuda_assign_buffers_no_alloc;
+    }
 #endif // GGML_USE_CUBLAS
 
     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
@@ -2288,11 +2292,11 @@ static struct ggml_cgraph * llm_build_llama(
             // otherwise for Metal we'd have to rebuild the concurrency list.
 
             cur = ggml_view_2d(ctx0, cur, n_embd, 1, cur->nb[1], (N - 1)*ggml_element_size(cur)*n_embd);
-            offload_func_nr(cur);
+            offload_func_skip(cur);
             ggml_set_name(cur, "cur-lastpos");
 
             inpSA = ggml_view_2d(ctx0, inpSA, n_embd, 1, inpSA->nb[1], (N - 1)*ggml_element_size(inpSA)*n_embd);
-            offload_func_nr(inpSA);
+            offload_func_skip(inpSA);
             ggml_set_name(inpSA, "inpSA-lastpos");
 
             n_past += N - 1;
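For context on the first two hunks: offloading in this file is driven by per-tensor hook pointers that default to the no-op llama_nop and are repointed at ggml_cuda_assign_buffers_no_alloc only when enough layers live on the GPU. The commit adds one more such hook, offload_func_skip, with a looser gate (n_gpu_layers > 0) than the non-repeating hook's n_gpu_layers > n_layer, so the last-position views in the third hunk are offloaded whenever any layer is. A minimal standalone sketch of that pattern, with assign_gpu_buffer as a hypothetical stand-in for the real CUDA buffer-assignment call:

    #include <stdio.h>

    struct ggml_tensor;                                   /* opaque stand-in for the real ggml type */
    typedef void (*offload_func_t)(struct ggml_tensor *);

    /* default hook: leave the tensor in host memory */
    static void llama_nop(struct ggml_tensor * t) { (void) t; }

    /* hypothetical stand-in for ggml_cuda_assign_buffers_no_alloc */
    static void assign_gpu_buffer(struct ggml_tensor * t) {
        (void) t;
        printf("tensor assigned to a VRAM buffer\n");
    }

    int main(void) {
        const int n_gpu_layers = 1;   /* only one layer offloaded ... */
        const int n_layer      = 32;  /* ... out of 32                */

        offload_func_t offload_func_nr   = llama_nop; /* old gate: needs full offload  */
        offload_func_t offload_func_skip = llama_nop; /* new gate: any offload at all  */

        if (n_gpu_layers > n_layer) { offload_func_nr   = assign_gpu_buffer; }
        if (n_gpu_layers > 0)       { offload_func_skip = assign_gpu_buffer; }

        offload_func_nr(NULL);   /* no-op here: 1 <= 32                */
        offload_func_skip(NULL); /* offloads: at least one GPU layer   */
        return 0;
    }

Before the change those views were gated by offload_func_nr, so with a partial offload they stayed on the host even when the surrounding layer tensors were in VRAM; the dedicated hook with its looser gate appears to be the point of the tweak.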
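The ggml_view_2d calls in the last hunk are what actually skip the unused logits: of the N token positions in the batch, only the row for the final position is kept, as a one-row view starting (N - 1) * element_size * n_embd bytes into the buffer. A plain-C illustration of that offset arithmetic on a raw float array (a simplified stand-in; the real code builds a ggml tensor view, not a pointer):

    #include <stdio.h>

    int main(void) {
        enum { n_embd = 4, N = 3 };  /* embedding width, tokens in the batch      */
        float cur[N][n_embd];        /* stand-in for the [n_embd, N] activations  */

        for (int i = 0; i < N; i++) {
            for (int j = 0; j < n_embd; j++) {
                cur[i][j] = 100.0f*i + j;
            }
        }

        /* mirrors the diff: (N - 1) * element size * n_embd bytes past the
           start of the buffer is the first element of the last position */
        size_t offset = (size_t)(N - 1) * sizeof(float) * n_embd;
        const float * last = (const float *)((const char *)&cur[0][0] + offset);

        for (int j = 0; j < n_embd; j++) {
            printf("%g ", last[j]);  /* prints: 200 201 202 203 */
        }
        printf("\n");
        return 0;
    }

Only that single row then flows into the output projection, so logits for the first N - 1 positions of the batch are never computed.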