llama : code style fixes + progress print fix

Georgi Gerganov 2023-05-20 12:29:08 +03:00
parent ffe9652bc1
commit f67bc3c363
2 changed files with 41 additions and 34 deletions

.gitignore

@@ -1,5 +1,15 @@
-*.o
-*.a
+/*.o
+/*.a
+/*.sh
+/*.log
+/*.org
+/ppl-*.txt
+/qnt-*.txt
+/perf-*.txt
+*.bin
 .DS_Store
 .build/
 .cache/
@@ -21,8 +31,9 @@ build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
 
+prompts/
 models/*
-*.bin
+wikitext-2-raw/
 
 /main
 /quantize
@@ -33,6 +44,7 @@ models/*
 /benchmark-matmult
 /vdot
 /Pipfile
+/libllama.so
 
 build-info.h
 arm_neon.h
@@ -43,17 +55,4 @@ __pycache__
 zig-out/
 zig-cache/
 
-ppl-*.txt
-qnt-*.txt
-perf-*.txt
-
 examples/jeopardy/results.txt
-/prompts
-*.sh
-*.log
-*.py
-*.txt
-/wikitext-2-raw/
-*.org
-
-/libllama.so
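
For what it's worth, the pattern moves above are mostly re-anchoring: a leading `/` pins a pattern to the repository root (so `/*.log` ignores only top-level logs, where the removed `*.log` matched anywhere in the tree), and a trailing `/` as in `prompts/` matches directories only.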

llama.cpp

@@ -1011,20 +1011,27 @@ static void llama_model_load_internal(
         model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
         model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
 
+        // "output" tensor
+        {
             ggml_backend backend_output;
             if (n_gpu_layers > int(n_layer)) {
                 backend_output = GGML_BACKEND_CUDA;
             } else {
                 backend_output = GGML_BACKEND_CPU;
             }
+
             model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+        }
+
+        const int i_gpu_start = n_layer - n_gpu_layers;
 
         model.layers.resize(n_layer);
-        const int i_gpu_start = n_layer - n_gpu_layers;
         for (uint32_t i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
             const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : GGML_BACKEND_CUDA;
+
+            auto & layer = model.layers[i];
+
             std::string layers_i = "layers." + std::to_string(i);
 
             layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
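
The reordering in this hunk is purely cosmetic; the offload rule is unchanged. With `i_gpu_start = n_layer - n_gpu_layers`, the first `i_gpu_start` layers stay on the CPU, the remaining layers go to the GPU, and the output tensor is offloaded only when `n_gpu_layers` exceeds `n_layer`. A minimal self-contained sketch of that split (the enum and the values are made up for illustration; in llama.cpp the real type is `ggml_backend` from ggml):

```cpp
#include <cstdio>

// Stand-in for ggml's backend enum (illustration only).
enum backend_t { BACKEND_CPU, BACKEND_CUDA };

int main() {
    const int n_layer      = 32; // hypothetical model depth (e.g. LLaMA-7B)
    const int n_gpu_layers = 20; // hypothetical user request

    // Same rule as the patch: the last n_gpu_layers layers are offloaded.
    const int i_gpu_start = n_layer - n_gpu_layers;

    for (int i = 0; i < n_layer; ++i) {
        const backend_t backend = i < i_gpu_start ? BACKEND_CPU : BACKEND_CUDA;
        printf("layer %2d -> %s\n", i, backend == BACKEND_CUDA ? "GPU" : "CPU");
    }

    // The output tensor moves to the GPU only if all layers already fit there.
    printf("output   -> %s\n", n_gpu_layers > n_layer ? "GPU" : "CPU");
    return 0;
}
```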
@@ -1039,10 +1046,12 @@ static void llama_model_load_internal(
             layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
             layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
             layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
+
             if (backend == GGML_BACKEND_CUDA) {
-                vram_total += ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk)
-                            + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm)
-                            + ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+                vram_total +=
+                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
             }
         }
     }
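
The rewrapped sum is also behavior-neutral: for every offloaded layer it accumulates the byte sizes of that layer's weight tensors into `vram_total` via `ggml_nbytes`, which returns a tensor's size in bytes. A rough sketch of the same accounting pattern, with a hypothetical `tensor_info` record and made-up sizes in place of real `ggml_tensor` objects:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical stand-in: in llama.cpp the sizes come from ggml_nbytes(),
// which derives them from each tensor's shape and quantization type.
struct tensor_info { const char * name; int64_t n_bytes; };

int main() {
    // Made-up per-tensor sizes for a single quantized transformer layer.
    const std::vector<tensor_info> layer = {
        {"attention_norm",       16 * 1024},
        {"wq", 9 * 1024 * 1024}, {"wk", 9 * 1024 * 1024},
        {"wv", 9 * 1024 * 1024}, {"wo", 9 * 1024 * 1024},
        {"w1", 24 * 1024 * 1024}, {"w2", 24 * 1024 * 1024},
        {"w3", 24 * 1024 * 1024},
    };

    int64_t vram_total = 0;
    for (const auto & t : layer) {
        vram_total += t.n_bytes; // same accumulation pattern as the patch
    }
    printf("~%lld MiB of VRAM per offloaded layer\n",
           (long long) (vram_total / (1024 * 1024)));
    return 0;
}
```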
@@ -2192,7 +2201,7 @@ struct llama_context * llama_init_from_file(
             unsigned * cur_percentage_p = (unsigned *) ctx;
             unsigned percentage = (unsigned) (100 * progress);
             while (percentage > *cur_percentage_p) {
-                ++*cur_percentage_p;
+                *cur_percentage_p = percentage;
                 fprintf(stderr, ".");
                 fflush(stderr);
                 if (percentage >= 100) {
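
This hunk is the progress-print fix from the commit title. The old body advanced the shared counter one step per iteration, so a callback that jumped several percentage points printed one dot per point, and once `percentage` hit 100 the `if` inside the loop printed a newline on every catch-up iteration. Assigning `percentage` directly makes the body run at most once per callback: one dot per report and a single trailing newline. A small standalone reproduction of the fixed callback (the driver loop and its progress values are made up for illustration):

```cpp
#include <cstdio>
#include <initializer_list>

// Same logic as the fixed default progress callback: ctx points at the last
// percentage that was reported. Setting (rather than incrementing) the
// counter means a jump in progress prints a single dot, and the newline at
// 100% is printed exactly once.
static void progress_callback(float progress, void * ctx) {
    unsigned * cur_percentage_p = (unsigned *) ctx;
    unsigned percentage = (unsigned) (100 * progress);
    while (percentage > *cur_percentage_p) {
        *cur_percentage_p = percentage;
        fprintf(stderr, ".");
        fflush(stderr);
        if (percentage >= 100) {
            fprintf(stderr, "\n");
        }
    }
}

int main() {
    unsigned cur_percentage = 0;
    // Simulate a loader that reports progress in coarse steps.
    for (float p : {0.05f, 0.35f, 0.80f, 1.00f}) {
        progress_callback(p, &cur_percentage);
    }
    return 0;
}
```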
@@ -2442,8 +2451,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         }
         size_t idx = model_loader->tensors_map.name_to_idx[base_name];
         llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-        base_t = model_loader->get_tensor(
-            base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
+        base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
         lt.data = (uint8_t *) lt.ggml_tensor->data;
         model_loader->load_data_for(lt);
         lt.ggml_tensor->data = lt.data;