llama : code style fixes + progress print fix
parent ffe9652bc1
commit f67bc3c363

2 changed files with 41 additions and 34 deletions
.gitignore (vendored): 31 changes
@@ -1,5 +1,15 @@
-*.o
-*.a
+/*.o
+/*.a
+/*.sh
+/*.log
+/*.org
+
+/ppl-*.txt
+/qnt-*.txt
+/perf-*.txt
+
+*.bin
+
 .DS_Store
 .build/
 .cache/
@@ -21,8 +31,9 @@ build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
 
+prompts/
 models/*
-*.bin
+wikitext-2-raw/
 
 /main
 /quantize
@@ -33,6 +44,7 @@ models/*
 /benchmark-matmult
 /vdot
 /Pipfile
+/libllama.so
 
 build-info.h
 arm_neon.h
@@ -43,17 +55,4 @@ __pycache__
 zig-out/
 zig-cache/
 
-ppl-*.txt
-qnt-*.txt
-perf-*.txt
-
 examples/jeopardy/results.txt
-
-/prompts
-*.sh
-*.log
-*.py
-*.txt
-/wikitext-2-raw/
-*.org
-/libllama.so
llama.cpp: 24 changes
@@ -1011,20 +1011,27 @@ static void llama_model_load_internal(
 
         model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
         model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
 
+        // "output" tensor
+        {
             ggml_backend backend_output;
             if (n_gpu_layers > int(n_layer)) {
                 backend_output = GGML_BACKEND_CUDA;
             } else {
                 backend_output = GGML_BACKEND_CPU;
             }
+
             model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+        }
 
+        const int i_gpu_start = n_layer - n_gpu_layers;
+
         model.layers.resize(n_layer);
-        const int i_gpu_start = n_layer - n_gpu_layers;
         for (uint32_t i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
             const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : GGML_BACKEND_CUDA;
+
+            auto & layer = model.layers[i];
+
             std::string layers_i = "layers." + std::to_string(i);
 
             layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
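Note on the hunk above: the only functional reordering is that i_gpu_start is now computed before the layer loop; the assignment rule itself is unchanged, the first n_layer - n_gpu_layers layers get the CPU backend and the remaining layers get the CUDA backend. A minimal standalone sketch of that split, using made-up layer counts and stand-in enum values rather than the real ggml definitions:

    #include <cstdio>

    // Illustrative stand-ins for the ggml backend enum used in llama.cpp.
    enum ggml_backend { GGML_BACKEND_CPU, GGML_BACKEND_CUDA };

    int main() {
        const int n_layer      = 32; // hypothetical model depth
        const int n_gpu_layers = 20; // hypothetical --n-gpu-layers value

        // Same split rule as in llama_model_load_internal:
        // layers [0, i_gpu_start) stay on the CPU, the rest go to CUDA.
        const int i_gpu_start = n_layer - n_gpu_layers;

        for (int i = 0; i < n_layer; ++i) {
            const ggml_backend backend = i < i_gpu_start ? GGML_BACKEND_CPU : GGML_BACKEND_CUDA;
            printf("layer %2d -> %s\n", i, backend == GGML_BACKEND_CUDA ? "CUDA" : "CPU");
        }
        return 0;
    }

With these example values, i_gpu_start is 12, so the sketch prints CPU for layers 0..11 and CUDA for layers 12..31.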
@@ -1039,10 +1046,12 @@ static void llama_model_load_internal(
             layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
             layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
             layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
+
             if (backend == GGML_BACKEND_CUDA) {
-                vram_total += ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk)
-                    + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm)
-                    + ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+                vram_total +=
+                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
             }
         }
     }
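The vram_total hunk above only re-wraps the accumulation; the quantity summed is still the byte size (ggml_nbytes) of every weight tensor in a CUDA-assigned layer. A rough standalone sketch of that per-layer accounting, assuming 2 bytes per element (f16) for every tensor and made-up 7B-style shapes instead of real ggml tensors:

    #include <cstdio>
    #include <cstdint>

    // Hypothetical stand-in for ggml_nbytes(): bytes of a 2-D tensor at 2 bytes/element.
    static size_t nbytes_f16(int64_t ne0, int64_t ne1) {
        return (size_t)(ne0 * ne1) * 2;
    }

    int main() {
        // Made-up dimensions in the spirit of a 7B LLaMA layer.
        const int64_t n_embd = 4096, n_ff = 11008;

        size_t vram_layer = 0;
        vram_layer += nbytes_f16(n_embd, 1);      // attention_norm (counted once here)
        vram_layer += nbytes_f16(n_embd, n_embd); // wq
        vram_layer += nbytes_f16(n_embd, n_embd); // wk
        vram_layer += nbytes_f16(n_embd, n_embd); // wv
        vram_layer += nbytes_f16(n_embd, n_embd); // wo
        vram_layer += nbytes_f16(n_embd, n_ff);   // w1
        vram_layer += nbytes_f16(n_ff, n_embd);   // w2
        vram_layer += nbytes_f16(n_embd, n_ff);   // w3
        printf("approx. VRAM per offloaded layer: %.1f MiB\n", vram_layer / (1024.0 * 1024.0));
        return 0;
    }

Under these assumptions the sketch reports roughly 386 MiB per offloaded layer.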
@@ -2192,7 +2201,7 @@ struct llama_context * llama_init_from_file(
             unsigned * cur_percentage_p = (unsigned *) ctx;
             unsigned percentage = (unsigned) (100 * progress);
             while (percentage > *cur_percentage_p) {
-                ++*cur_percentage_p;
+                *cur_percentage_p = percentage;
                 fprintf(stderr, ".");
                 fflush(stderr);
                 if (percentage >= 100) {
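The progress-print fix above makes *cur_percentage_p jump straight to the latest percentage instead of being incremented once per loop iteration, so at most one dot is printed per callback invocation that made progress, rather than one dot per percentage point crossed. A small standalone sketch of the two behaviors, with a fake loading loop in place of the real callback plumbing:

    #include <cstdio>

    // Old behavior: one dot per percentage point crossed.
    static void print_progress_old(unsigned percentage, unsigned * cur) {
        while (percentage > *cur) {
            ++*cur;
            fputc('.', stderr);
        }
    }

    // New behavior: at most one dot per call that made progress.
    static void print_progress_new(unsigned percentage, unsigned * cur) {
        while (percentage > *cur) {
            *cur = percentage;
            fputc('.', stderr);
        }
    }

    int main() {
        unsigned cur_old = 0, cur_new = 0;
        // Fake loader that reports progress in 10% jumps.
        for (unsigned p = 10; p <= 100; p += 10) {
            print_progress_old(p, &cur_old); // prints 10 dots per call
            print_progress_new(p, &cur_new); // prints 1 dot per call
        }
        fputc('\n', stderr);
        return 0;
    }

With 10% jumps, the old version prints one hundred dots in total while the new one prints ten.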
@@ -2442,8 +2451,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             }
             size_t idx = model_loader->tensors_map.name_to_idx[base_name];
             llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-            base_t = model_loader->get_tensor(
-                base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
+            base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
             lt.data = (uint8_t *) lt.ggml_tensor->data;
             model_loader->load_data_for(lt);
             lt.ggml_tensor->data = lt.data;