llama : use int for layer iterators [no ci]

This commit is contained in:
Georgi Gerganov 2024-07-05 10:14:28 +03:00
parent dc91715b44
commit ac61d7c25f
No known key found for this signature in database
GPG key ID: 449E073F9DC10735

View file

@ -5840,8 +5840,8 @@ static bool llm_load_tensors(
model.main_gpu = main_gpu;
model.n_gpu_layers = n_gpu_layers;
const int64_t n_layer = hparams.n_layer;
const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
const int n_layer = hparams.n_layer;
const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
bool use_mmap_buffer = true;
// there is very little benefit to offloading the input layer, so always keep it on the CPU
@ -5851,7 +5851,7 @@ static bool llm_load_tensors(
model.buft_layer.resize(n_layer);
// assign cpu layers
for (int64_t i = 0; i < i_gpu_start; ++i) {
for (int i = 0; i < i_gpu_start; ++i) {
model.buft_layer[i] = llama_default_buffer_type_cpu(true);
}
@ -5881,7 +5881,7 @@ static bool llm_load_tensors(
// assign the repeating layers to the devices according to the splits
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
for (int i = i_gpu_start; i < n_layer; ++i) {
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
}
@ -5901,7 +5901,7 @@ static bool llm_load_tensors(
split_buft = llama_default_buffer_type_offload(model, main_gpu);
}
// assign the repeating layers
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
for (int i = i_gpu_start; i < n_layer; ++i) {
model.buft_layer[i] = {
split_buft,
llama_default_buffer_type_offload(model, main_gpu)
@ -5924,7 +5924,7 @@ static bool llm_load_tensors(
buft_layer_count[model.buft_input.buft_matrix]++;
buft_layer_count[model.buft_output.buft]++;
buft_layer_count[model.buft_output.buft_matrix]++;
for (int64_t i = 0; i < n_layer; ++i) {
for (int i = 0; i < n_layer; ++i) {
buft_layer_count[model.buft_layer[i].buft]++;
buft_layer_count[model.buft_layer[i].buft_matrix]++;
}
@ -5954,6 +5954,7 @@ static bool llm_load_tensors(
// create tensors for the weights
{
// note: cast to int64_t since we will use these for the tensor dimensions
const int64_t n_head = hparams.n_head();
const int64_t n_head_kv = hparams.n_head_kv();
const int64_t n_embd = hparams.n_embd;
@ -6680,8 +6681,8 @@ static bool llm_load_tensors(
}
for (int i = 0; i < n_layer; ++i) {
ggml_context* ctx_layer = ctx_for_layer(i);
ggml_context* ctx_split = ctx_for_layer_split(i);
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
@ -6867,7 +6868,7 @@ static bool llm_load_tensors(
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
for (uint32_t i = 0; i < n_layer; ++i) {
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);
@ -6894,7 +6895,7 @@ static bool llm_load_tensors(
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
for (uint32_t i = 0; i < n_layer; ++i) {
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);
@ -7139,6 +7140,7 @@ static bool llm_load_tensors(
case LLM_ARCH_GPTNEOX:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
// output
{
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
@ -7177,8 +7179,9 @@ static bool llm_load_tensors(
// output
{
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (model.output == NULL) {
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
@ -7232,7 +7235,7 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}
for (uint32_t i = 0; i < n_layer; ++i) {
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);
@ -7242,17 +7245,19 @@ static bool llm_load_tensors(
if (!is_lite) {
layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
}
layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
if (!is_lite) {
layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k});
layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k});
} else {
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
}
layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope});
layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd});
layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});