llama : use int for layer iterators [no ci]
parent dc91715b44
commit ac61d7c25f
1 changed file with 24 additions and 19 deletions
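For context, a minimal self-contained sketch of the convention this commit moves toward (illustrative names only, not the actual llm_load_tensors code or llama.cpp declarations): per-layer loop indices and counts use plain int, while values that feed ggml tensor shapes stay int64_t, since ggml tensor dimensions are 64-bit.

// Minimal sketch, not the real llm_load_tensors(): the hparams fields and
// buffer-type values below are illustrative stand-ins.
#include <algorithm>
#include <cstdint>
#include <vector>

struct hparams_t {
    uint32_t n_layer = 32;   // number of transformer layers
    int64_t  n_embd  = 4096; // embedding size, used as a tensor dimension
};

int main() {
    hparams_t hparams;
    const int n_gpu_layers = 20;

    // layer iterators and counts: plain int
    const int n_layer     = hparams.n_layer;
    const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, 0);

    std::vector<int> buft_layer(n_layer);
    for (int i = 0; i < i_gpu_start; ++i) {
        buft_layer[i] = 0; // stand-in for a CPU buffer type
    }
    for (int i = i_gpu_start; i < n_layer; ++i) {
        buft_layer[i] = 1; // stand-in for an offload (GPU) buffer type
    }

    // tensor dimensions stay int64_t, since ggml tensor shapes are 64-bit
    const int64_t n_embd = hparams.n_embd;
    (void) n_embd;
    return 0;
}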
@@ -5840,8 +5840,8 @@ static bool llm_load_tensors(
     model.main_gpu = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
 
-    const int64_t n_layer = hparams.n_layer;
-    const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
+    const int n_layer = hparams.n_layer;
+    const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
     bool use_mmap_buffer = true;
 
     // there is very little benefit to offloading the input layer, so always keep it on the CPU
@@ -5851,7 +5851,7 @@ static bool llm_load_tensors(
     model.buft_layer.resize(n_layer);
 
     // assign cpu layers
-    for (int64_t i = 0; i < i_gpu_start; ++i) {
+    for (int i = 0; i < i_gpu_start; ++i) {
         model.buft_layer[i] = llama_default_buffer_type_cpu(true);
     }
 
@@ -5881,7 +5881,7 @@ static bool llm_load_tensors(
 
         // assign the repeating layers to the devices according to the splits
         int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
-        for (int64_t i = i_gpu_start; i < n_layer; ++i) {
+        for (int i = i_gpu_start; i < n_layer; ++i) {
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
             model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
         }
@@ -5901,7 +5901,7 @@ static bool llm_load_tensors(
             split_buft = llama_default_buffer_type_offload(model, main_gpu);
         }
         // assign the repeating layers
-        for (int64_t i = i_gpu_start; i < n_layer; ++i) {
+        for (int i = i_gpu_start; i < n_layer; ++i) {
            model.buft_layer[i] = {
                split_buft,
                llama_default_buffer_type_offload(model, main_gpu)
@@ -5924,7 +5924,7 @@ static bool llm_load_tensors(
     buft_layer_count[model.buft_input.buft_matrix]++;
     buft_layer_count[model.buft_output.buft]++;
     buft_layer_count[model.buft_output.buft_matrix]++;
-    for (int64_t i = 0; i < n_layer; ++i) {
+    for (int i = 0; i < n_layer; ++i) {
         buft_layer_count[model.buft_layer[i].buft]++;
         buft_layer_count[model.buft_layer[i].buft_matrix]++;
     }
@@ -5954,6 +5954,7 @@ static bool llm_load_tensors(
 
    // create tensors for the weights
    {
+        // note: cast to int64_t since we will use these for the tensor dimensions
        const int64_t n_head = hparams.n_head();
        const int64_t n_head_kv = hparams.n_head_kv();
        const int64_t n_embd = hparams.n_embd;
@@ -6680,8 +6681,8 @@ static bool llm_load_tensors(
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
-                        ggml_context* ctx_layer = ctx_for_layer(i);
-                        ggml_context* ctx_split = ctx_for_layer_split(i);
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
 
                         auto & layer = model.layers[i];
 
@@ -6867,7 +6868,7 @@ static bool llm_load_tensors(
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
 
-                    for (uint32_t i = 0; i < n_layer; ++i) {
+                    for (int i = 0; i < n_layer; ++i) {
                         ggml_context * ctx_layer = ctx_for_layer(i);
                         ggml_context * ctx_split = ctx_for_layer_split(i);
 
@@ -6894,7 +6895,7 @@ static bool llm_load_tensors(
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
 
-                    for (uint32_t i = 0; i < n_layer; ++i) {
+                    for (int i = 0; i < n_layer; ++i) {
                         ggml_context * ctx_layer = ctx_for_layer(i);
                         ggml_context * ctx_split = ctx_for_layer_split(i);
 
@@ -7139,6 +7140,7 @@ static bool llm_load_tensors(
             case LLM_ARCH_GPTNEOX:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
                     // output
                     {
                         model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
@@ -7177,8 +7179,9 @@ static bool llm_load_tensors(
 
                     // output
                     {
-                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
                         // if output is NULL, init from the input tok embed
                         if (model.output == NULL) {
                             model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
@@ -7232,7 +7235,7 @@ static bool llm_load_tensors(
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
                     }
 
-                    for (uint32_t i = 0; i < n_layer; ++i) {
+                    for (int i = 0; i < n_layer; ++i) {
                         ggml_context * ctx_layer = ctx_for_layer(i);
                         ggml_context * ctx_split = ctx_for_layer_split(i);
 
@@ -7242,17 +7245,19 @@ static bool llm_load_tensors(
                         if (!is_lite) {
                             layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
                         }
+
                         layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
 
                         if (!is_lite) {
-                            layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
-                            layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k});
+                            layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
+                            layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k});
                         } else {
-                            layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
+                            layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
                         }
-                        layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope});
-                        layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd});
+
+                        layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
+                        layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd});
 
                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
 