Tweak GPU offload when skipping unused logits computations
This commit is contained in:
parent
5553820d90
commit
3be6e8d36f
1 changed file with 9 additions and 5 deletions
|
@@ -2194,6 +2194,7 @@ static struct ggml_cgraph * llm_build_llama(
|
||||||
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
||||||
offload_func_t offload_func_kq = llama_nop;
|
offload_func_t offload_func_kq = llama_nop;
|
||||||
offload_func_t offload_func_v = llama_nop;
|
offload_func_t offload_func_v = llama_nop;
|
||||||
|
offload_func_t offload_func_skip = llama_nop;
|
||||||
|
|
||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUBLAS
|
||||||
if (n_gpu_layers > n_layer) {
|
if (n_gpu_layers > n_layer) {
|
||||||
|
@@ -2205,6 +2206,9 @@ static struct ggml_cgraph * llm_build_llama(
|
||||||
if (n_gpu_layers > n_layer + 2) {
|
if (n_gpu_layers > n_layer + 2) {
|
||||||
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
|
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
|
||||||
}
|
}
|
||||||
|
if (n_gpu_layers > 0) {
|
||||||
|
offload_func_skip = ggml_cuda_assign_buffers_no_alloc;
|
||||||
|
}
|
||||||
#endif // GGML_USE_CUBLAS
|
#endif // GGML_USE_CUBLAS
|
||||||
|
|
||||||
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
||||||
|
@@ -2288,11 +2292,11 @@ static struct ggml_cgraph * llm_build_llama(
|
||||||
// otherwise for Metal we'd have to rebuild the concurrency list.
|
// otherwise for Metal we'd have to rebuild the concurrency list.
|
||||||
|
|
||||||
cur = ggml_view_2d(ctx0, cur, n_embd, 1, cur->nb[1], (N - 1)*ggml_element_size(cur)*n_embd);
|
cur = ggml_view_2d(ctx0, cur, n_embd, 1, cur->nb[1], (N - 1)*ggml_element_size(cur)*n_embd);
|
||||||
offload_func_nr(cur);
|
offload_func_skip(cur);
|
||||||
ggml_set_name(cur, "cur-lastpos");
|
ggml_set_name(cur, "cur-lastpos");
|
||||||
|
|
||||||
inpSA = ggml_view_2d(ctx0, inpSA, n_embd, 1, inpSA->nb[1], (N - 1)*ggml_element_size(inpSA)*n_embd);
|
inpSA = ggml_view_2d(ctx0, inpSA, n_embd, 1, inpSA->nb[1], (N - 1)*ggml_element_size(inpSA)*n_embd);
|
||||||
offload_func_nr(inpSA);
|
offload_func_skip(inpSA);
|
||||||
ggml_set_name(inpSA, "inpSA-lastpos");
|
ggml_set_name(inpSA, "inpSA-lastpos");
|
||||||
|
|
||||||
n_past += N - 1;
|
n_past += N - 1;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue