Check for the existence of Q/K/V/O attention bias tensors while loading LLaMA models

Tested with LLaMA 2 on both the CUDA and CPU backends.
This commit is contained in:
CausalLM 2023-12-02 00:56:48 +08:00 committed by GitHub
parent c48679a8e8
commit e192572d21
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -2785,10 +2785,29 @@ static void llm_load_tensors(
layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
try {
layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend);
} catch (const std::runtime_error& e) {
if (std::string(e.what()).find("not found") != std::string::npos) layer.bq = NULL; else throw;
}
try {
layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend);
} catch (const std::runtime_error& e) {
if (std::string(e.what()).find("not found") != std::string::npos) layer.bk = NULL; else throw;
}
try {
layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend);
} catch (const std::runtime_error& e) {
if (std::string(e.what()).find("not found") != std::string::npos) layer.bv = NULL; else throw;
}
try {
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
} catch (const std::runtime_error& e) {
if (std::string(e.what()).find("not found") != std::string::npos) layer.bo = NULL; else throw;
}
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
@ -2799,9 +2818,13 @@ static void llm_load_tensors(
if (backend == GGML_BACKEND_GPU) {
vram_weights +=
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bq) +
ggml_nbytes(layer.bk) + ggml_nbytes(layer.bv) + ggml_nbytes(layer.bo) +
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
(layer.bq ? ggml_nbytes(layer.bq) : 0) +
(layer.bk ? ggml_nbytes(layer.bk) : 0) +
(layer.bv ? ggml_nbytes(layer.bv) : 0) +
(layer.bo ? ggml_nbytes(layer.bo) : 0) +
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
}
}
} break;