Offload weights

JohannesGaessler 2023-06-17 22:27:55 +02:00
parent f75125615a
commit 0c916d2357


@@ -1144,19 +1144,22 @@ static void falcon_model_load_internal(
+        ggml_backend backend_norm;
         ggml_backend backend_output;
         if (n_gpu_layers > int(n_layer)) { // NOLINT
+            backend_norm = LLAMA_BACKEND_OFFLOAD;
             backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
         } else {
+            backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
         }

         // "output" tensor
         {
-            model.output_norm = ml->get_tensor("transformer.ln_f.weight", {n_embd}, GGML_BACKEND_CPU);
-            model.output_norm_b = ml->get_tensor("transformer.ln_f.bias", {n_embd}, GGML_BACKEND_CPU);
-            model.lm_head = ml->get_tensor("lm_head.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+            model.output_norm = ml->get_tensor("transformer.ln_f.weight", {n_embd}, backend_norm);
+            model.output_norm_b = ml->get_tensor("transformer.ln_f.bias", {n_embd}, backend_norm);
+            model.lm_head = ml->get_tensor("lm_head.weight", {n_embd, n_vocab}, backend_output);
         }

         const int i_gpu_start = n_layer - n_gpu_layers;
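
The first hunk introduces backend_norm alongside the existing backend_output: the final layernorm is only offloaded once every repeating layer already fits on the GPU (n_gpu_layers > n_layer), and in that case the large lm_head matrix goes to the split backend so its rows can be distributed across all available GPUs. Below is a minimal standalone sketch of that selection rule; the ggml backend enum is stubbed out, and the OFFLOAD macros are assumed to expand to the GPU backends as they do in llama.cpp CUDA builds (an assumption, not part of this diff):

#include <cstdio>

// Stub of ggml's backend enum; the real definitions live in ggml.h.
enum ggml_backend {
    GGML_BACKEND_CPU,
    GGML_BACKEND_GPU,       // tensor resides whole on the main GPU
    GGML_BACKEND_GPU_SPLIT, // tensor rows are split across all GPUs
};

// Assumption: in CUDA builds these llama.cpp macros map to the GPU
// backends; without GPU support they fall back to GGML_BACKEND_CPU.
#define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT

int main() {
    const int n_layer      = 60; // illustrative layer count
    const int n_gpu_layers = 61; // > n_layer: norm and lm_head offload too

    ggml_backend backend_norm;
    ggml_backend backend_output;
    if (n_gpu_layers > n_layer) { // same test as in the hunk above
        backend_norm   = LLAMA_BACKEND_OFFLOAD;       // small tensor, one GPU
        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; // large matrix, row-split
    } else {
        backend_norm   = GGML_BACKEND_CPU;
        backend_output = GGML_BACKEND_CPU;
    }

    std::printf("norm=%d output=%d\n", (int) backend_norm, (int) backend_output);
    return 0;
}

Requesting one layer more than the model actually has is presumably the intended way to trigger this branch from the command line, mirroring how upstream llama.cpp treats an over-sized layer count as "also offload the output tensors".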
@@ -1183,11 +1186,11 @@ static void falcon_model_load_internal(
                 layer.input_layernorm_b = ml->get_tensor("transformer.h." + str_i +".input_layernorm.bias", {n_embd}, backend);
             }

-            layer.query_key_value = ml->get_tensor("transformer.h." + str_i +".self_attention.query_key_value.weight", {n_embd, (n_head_kv * 2 + n_head) * head_dim}, GGML_BACKEND_CPU);
-            layer.wo = ml->get_tensor("transformer.h." + str_i +".self_attention.dense.weight", {n_embd, n_embd}, GGML_BACKEND_CPU);
-            layer.ffn_up = ml->get_tensor("transformer.h."+str_i + ".mlp.dense_h_to_4h.weight", {n_embd, n_ff}, GGML_BACKEND_CPU); // before gelu
-            layer.ffn_down = ml->get_tensor("transformer.h."+str_i + ".mlp.dense_4h_to_h.weight", {n_ff, n_embd}, GGML_BACKEND_CPU); // after gelu
+            layer.query_key_value = ml->get_tensor("transformer.h." + str_i +".self_attention.query_key_value.weight", {n_embd, (n_head_kv * 2 + n_head) * head_dim}, backend_split);
+            layer.wo = ml->get_tensor("transformer.h." + str_i +".self_attention.dense.weight", {n_embd, n_embd}, backend_split);
+            layer.ffn_up = ml->get_tensor("transformer.h."+str_i + ".mlp.dense_h_to_4h.weight", {n_embd, n_ff}, backend_split); // before gelu
+            layer.ffn_down = ml->get_tensor("transformer.h."+str_i + ".mlp.dense_4h_to_h.weight", {n_ff, n_embd}, backend_split); // after gelu

             if (backend == GGML_BACKEND_GPU) {
                 // llama:
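
The second hunk applies the same idea to the repeating layers: the four large matrices (fused QKV, attention dense, and the two MLP projections) now honour backend_split instead of being pinned to the CPU, while the small layernorm tensors keep using the per-layer backend, since only big matrices benefit from being sharded across devices. Which layers offload at all is governed by the i_gpu_start line kept as context in the first hunk; the cut-off rule sketched below (layer i offloads when i >= i_gpu_start) is assumed from upstream llama.cpp, as only the i_gpu_start computation itself appears in this diff:

#include <cstdio>

enum ggml_backend { GGML_BACKEND_CPU, GGML_BACKEND_GPU };

int main() {
    const int n_layer      = 6; // illustrative; real Falcon models have more
    const int n_gpu_layers = 2; // e.g. from an --n-gpu-layers style option

    // Matches `const int i_gpu_start = n_layer - n_gpu_layers;` above:
    // the last n_gpu_layers layers go to the GPU, the rest stay on the CPU.
    const int i_gpu_start = n_layer - n_gpu_layers;

    for (int i = 0; i < n_layer; ++i) {
        const ggml_backend backend =
            i < i_gpu_start ? GGML_BACKEND_CPU : GGML_BACKEND_GPU;
        std::printf("layer %d -> %s\n", i,
                    backend == GGML_BACKEND_GPU ? "GPU" : "CPU");
    }
    return 0;
}

With these illustrative values, layers 0..3 stay on the CPU and layers 4..5 are offloaded, which is why the loading loop can decide each tensor's placement from a single integer comparison.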