From ac61d7c25f2044b2b4cae5623d342cf3413a502f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 5 Jul 2024 10:14:28 +0300
Subject: [PATCH] llama : use int for layer iterators [no ci]

---
 src/llama.cpp | 43 ++++++++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 88db609da..764f76b8f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5840,8 +5840,8 @@ static bool llm_load_tensors(
     model.main_gpu     = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
 
-    const int64_t n_layer     = hparams.n_layer;
-    const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
+    const int n_layer     = hparams.n_layer;
+    const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
     bool use_mmap_buffer = true;
 
     // there is very little benefit to offloading the input layer, so always keep it on the CPU
@@ -5851,7 +5851,7 @@ static bool llm_load_tensors(
     model.buft_layer.resize(n_layer);
 
     // assign cpu layers
-    for (int64_t i = 0; i < i_gpu_start; ++i) {
+    for (int i = 0; i < i_gpu_start; ++i) {
         model.buft_layer[i] = llama_default_buffer_type_cpu(true);
     }
 
@@ -5881,7 +5881,7 @@ static bool llm_load_tensors(
 
         // assign the repeating layers to the devices according to the splits
        int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
-        for (int64_t i = i_gpu_start; i < n_layer; ++i) {
+        for (int i = i_gpu_start; i < n_layer; ++i) {
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
             model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
         }
@@ -5901,7 +5901,7 @@ static bool llm_load_tensors(
             split_buft = llama_default_buffer_type_offload(model, main_gpu);
         }
         // assign the repeating layers
-        for (int64_t i = i_gpu_start; i < n_layer; ++i) {
+        for (int i = i_gpu_start; i < n_layer; ++i) {
             model.buft_layer[i] = {
                 split_buft,
                 llama_default_buffer_type_offload(model, main_gpu)
@@ -5924,7 +5924,7 @@ static bool llm_load_tensors(
     buft_layer_count[model.buft_input.buft_matrix]++;
     buft_layer_count[model.buft_output.buft]++;
     buft_layer_count[model.buft_output.buft_matrix]++;
-    for (int64_t i = 0; i < n_layer; ++i) {
+    for (int i = 0; i < n_layer; ++i) {
         buft_layer_count[model.buft_layer[i].buft]++;
         buft_layer_count[model.buft_layer[i].buft_matrix]++;
     }
@@ -5954,6 +5954,7 @@ static bool llm_load_tensors(
 
     // create tensors for the weights
     {
+        // note: cast to int64_t since we will use these for the tensor dimensions
        const int64_t n_head        = hparams.n_head();
        const int64_t n_head_kv     = hparams.n_head_kv();
        const int64_t n_embd        = hparams.n_embd;
@@ -6680,8 +6681,8 @@ static bool llm_load_tensors(
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context* ctx_layer = ctx_for_layer(i);
-                        ggml_context* ctx_split = ctx_for_layer_split(i);
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
 
                         auto & layer = model.layers[i];
 
@@ -6867,7 +6868,7 @@ static bool llm_load_tensors(
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
 
-                    for (uint32_t i = 0; i < n_layer; ++i) {
+                    for (int i = 0; i < n_layer; ++i) {
                         ggml_context * ctx_layer = ctx_for_layer(i);
                         ggml_context * ctx_split = ctx_for_layer_split(i);
 
@@ -6894,7 +6895,7 @@ static bool llm_load_tensors(
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
 
-                    for (uint32_t i = 0; i < n_layer; ++i) {
+                    for (int i = 0; i < n_layer; ++i) {
                         ggml_context * ctx_layer = ctx_for_layer(i);
                         ggml_context * ctx_split = ctx_for_layer_split(i);
 
@@ -7139,6 +7140,7 @@ static bool llm_load_tensors(
             case LLM_ARCH_GPTNEOX:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
                     // output
                     {
                         model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
@@ -7177,8 +7179,9 @@ static bool llm_load_tensors(
 
                     // output
                     {
-                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
                         // if output is NULL, init from the input tok embed
                         if (model.output == NULL) {
                             model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
@@ -7232,7 +7235,7 @@ static bool llm_load_tensors(
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
                     }
 
-                    for (uint32_t i = 0; i < n_layer; ++i) {
+                    for (int i = 0; i < n_layer; ++i) {
                         ggml_context * ctx_layer = ctx_for_layer(i);
                         ggml_context * ctx_split = ctx_for_layer_split(i);
 
@@ -7242,17 +7245,19 @@ static bool llm_load_tensors(
                         if (!is_lite) {
                             layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
                         }
+
                         layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
 
                         if (!is_lite) {
-                            layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A,      "weight", i), {n_embd, q_lora_rank});
-                            layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B,      "weight", i), {q_lora_rank, n_head * n_embd_head_k});
+                            layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
+                            layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k});
                         } else {
-                            layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,        "weight", i), {n_embd, n_embd_k_gqa});
+                            layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
                         }
-                        layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope});
-                        layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {              n_head * (                      n_embd_head_v), n_embd});
+
+                        layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
+                        layer.wkv_b     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)});
+                        layer.wo        = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd});
 
                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
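
Note (reviewer sketch, not part of the patch): the loop changed in the hunk at old line 5881 picks a device for each repeating layer by searching the normalized cumulative split points with std::upper_bound. The standalone program below reproduces only that selection logic under assumed values; pick_device, the hard-coded splits and the layer counts are illustrative stand-ins rather than llama.cpp API (the real code passes the resulting index to llama_default_buffer_type_offload).

// Illustrative sketch only: mirrors the std::upper_bound device selection in
// llm_load_tensors; pick_device and the constants below are hypothetical.
#include <algorithm>
#include <cstdio>
#include <vector>

// Return the device index for layer i, given normalized cumulative split
// points (e.g. {0.5f, 1.0f} for an even two-device split).
static int pick_device(const std::vector<float> & splits, int i, int i_gpu_start, int act_gpu_layers) {
    const float frac = float(i - i_gpu_start) / act_gpu_layers;
    return int(std::upper_bound(splits.begin(), splits.end(), frac) - splits.begin());
}

int main() {
    const int n_layer        = 8;                   // repeating layers in the model
    const int n_gpu_layers   = 6;                   // layers requested on the devices
    const int i_gpu_start    = std::max(n_layer - n_gpu_layers, 0);
    const int act_gpu_layers = std::min(n_gpu_layers, n_layer + 1);

    const std::vector<float> splits = {0.5f, 1.0f}; // two devices, even split

    // layers 0..i_gpu_start-1 stay on the CPU; the rest are spread over the devices
    for (int i = i_gpu_start; i < n_layer; ++i) {
        printf("layer %d -> device %d\n", i, pick_device(splits, i, i_gpu_start, act_gpu_layers));
    }
    return 0;
}

With these assumed values, layers 0-1 stay on the CPU, layers 2-4 map to device 0 and layers 5-7 to device 1.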