From d7b7484f74d486f77feb4c0b7af7e1718ed91651 Mon Sep 17 00:00:00 2001
From: eiery <19350831+eiery@users.noreply.github.com>
Date: Fri, 23 Jun 2023 04:38:01 -0400
Subject: [PATCH 01/12] Add OpenLLaMA instructions to the README (#1954)
* add openllama to readme
---
README.md | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/README.md b/README.md
index ace588606..b09498be6 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
Quantization
Interactive mode
Instruction mode with Alpaca
+ Using OpenLLaMA
Using GPT4All
Using Pygmalion 7B & Metharme 7B
Obtaining the Facebook LLaMA original model and Stanford Alpaca model data
@@ -543,6 +544,13 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
>
```
+### Using [OpenLLaMA](https://github.com/openlm-research/open_llama)
+
+OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights.
+
+- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face.
+- Convert the model to ggml FP16 format using `python convert.py <path to OpenLLaMA model>`
+
### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
- Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
From 527b6fba1d237befb324fd846bda7418c0fa394d Mon Sep 17 00:00:00 2001
From: Didzis Gosko
Date: Sat, 24 Jun 2023 11:47:58 +0300
Subject: [PATCH 02/12] llama : make model stateless and context stateful
(llama_state) (#1797)
* llama : make model stateless and context stateful
* llama : minor cleanup
* llama : update internal API declaration
* Apply suggestions from code review
fix style
Co-authored-by: Georgi Gerganov
* Missing model memory release
* Fix style
* Add deprecated warning for public API function llama_init_from_file
* Update public API use cases: move away from deprecated llama_init_from_file
* Deprecate public API function llama_apply_lora_from_file
---------
Co-authored-by: Georgi Gerganov
---
examples/common.cpp | 24 ++-
examples/common.h | 3 +-
examples/embedding/embedding.cpp | 6 +-
examples/main/main.cpp | 8 +-
examples/perplexity/perplexity.cpp | 6 +-
examples/quantize-stats/quantize-stats.cpp | 15 +-
examples/save-load-state/save-load-state.cpp | 29 ++-
examples/server/server.cpp | 9 +-
examples/simple/simple.cpp | 8 +-
.../train-text-from-scratch.cpp | 5 +-
llama.cpp | 172 ++++++++++++------
llama.h | 35 +++-
tests/test-tokenizer-0.cpp | 16 +-
13 files changed, 244 insertions(+), 92 deletions(-)
diff --git a/examples/common.cpp b/examples/common.cpp
index fed24e027..6ac484555 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -536,7 +536,7 @@ std::vector llama_tokenize(struct llama_context * ctx, const std::s
return res;
}
-struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
@@ -552,25 +552,33 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
lparams.logits_all = params.perplexity;
lparams.embedding = params.embedding;
- llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
-
- if (lctx == NULL) {
+ llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
+ if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
- return NULL;
+ return std::make_tuple(nullptr, nullptr);
+ }
+
+ llama_context * lctx = llama_new_context_with_model(model, lparams);
+ if (lctx == NULL) {
+ fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+ llama_free_model(model);
+ return std::make_tuple(nullptr, nullptr);
}
if (!params.lora_adapter.empty()) {
- int err = llama_apply_lora_from_file(lctx,
+ int err = llama_model_apply_lora_from_file(model,
params.lora_adapter.c_str(),
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
params.n_threads);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
- return NULL;
+ llama_free(lctx);
+ llama_free_model(model);
+ return std::make_tuple(nullptr, nullptr);
}
}
- return lctx;
+ return std::make_tuple(model, lctx);
}
void console_init(console_state & con_st) {
diff --git a/examples/common.h b/examples/common.h
index 6c2953cb2..713320179 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -9,6 +9,7 @@
#include
#include
#include
+#include <tuple>
#if !defined (_WIN32)
#include
@@ -95,7 +96,7 @@ std::vector llama_tokenize(struct llama_context * ctx, const std::s
// Model utils
//
-struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
//
// Console utils
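For reference, a minimal caller-side sketch of the new helper (assuming the patched `common.h`/`llama.h` above and a `gpt_params` filled in elsewhere; the example programs in the hunks below are migrated along exactly these lines):

```cpp
// Minimal sketch: consuming the tuple-returning helper and releasing
// the context before the model it borrows from.
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <tuple>

int run(const gpt_params & params) {
    llama_model   * model;
    llama_context * ctx;

    // the helper now returns both objects; a null model signals failure
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == NULL) {
        fprintf(stderr, "failed to load model '%s'\n", params.model.c_str());
        return 1;
    }

    // ... tokenize, evaluate, sample ...

    llama_free(ctx);        // free the context first
    llama_free_model(model); // then the model it references
    return 0;
}
```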
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 860f99f67..369eac1d1 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -37,11 +37,12 @@ int main(int argc, char ** argv) {
llama_init_backend();
+ llama_model * model;
llama_context * ctx;
// load the model
- ctx = llama_init_from_gpt_params(params);
- if (ctx == NULL) {
+ std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
@@ -90,6 +91,7 @@ int main(int argc, char ** argv) {
llama_print_timings(ctx);
llama_free(ctx);
+ llama_free_model(model);
return 0;
}
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 941312f9c..c1e6bf126 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -107,12 +107,13 @@ int main(int argc, char ** argv) {
llama_init_backend();
+ llama_model * model;
llama_context * ctx;
g_ctx = &ctx;
// load the model and apply lora adapter, if any
- ctx = llama_init_from_gpt_params(params);
- if (ctx == NULL) {
+ std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
@@ -139,6 +140,7 @@ int main(int argc, char ** argv) {
llama_print_timings(ctx);
llama_free(ctx);
+ llama_free_model(model);
return 0;
}
@@ -147,6 +149,7 @@ int main(int argc, char ** argv) {
if (params.export_cgraph) {
llama_eval_export(ctx, "llama.ggml");
llama_free(ctx);
+ llama_free_model(model);
return 0;
}
@@ -666,6 +669,7 @@ int main(int argc, char ** argv) {
llama_print_timings(ctx);
llama_free(ctx);
+ llama_free_model(model);
return 0;
}
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index ae8cfe0af..b59f5971e 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -149,11 +149,12 @@ int main(int argc, char ** argv) {
llama_init_backend();
+ llama_model * model;
llama_context * ctx;
// load the model and apply lora adapter, if any
- ctx = llama_init_from_gpt_params(params);
- if (ctx == NULL) {
+ std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
@@ -169,6 +170,7 @@ int main(int argc, char ** argv) {
llama_print_timings(ctx);
llama_free(ctx);
+ llama_free_model(model);
return 0;
}
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 6b8018ee2..9cea472de 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -320,6 +320,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "Loading model\n");
const int64_t t_main_start_us = ggml_time_us();
+ llama_model * model;
llama_context * ctx;
{
@@ -330,10 +331,18 @@ int main(int argc, char ** argv) {
lparams.f16_kv = false;
lparams.use_mlock = false;
- ctx = llama_init_from_file(params.model.c_str(), lparams);
+ model = llama_load_model_from_file(params.model.c_str(), lparams);
+
+ if (model == NULL) {
+ fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+ return 1;
+ }
+
+ ctx = llama_new_context_with_model(model, lparams);
if (ctx == NULL) {
- fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+ fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+ llama_free_model(model);
return 1;
}
}
@@ -357,6 +366,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
"this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
llama_free(ctx);
+ llama_free_model(model);
return 1;
}
included_layers++;
@@ -415,6 +425,7 @@ int main(int argc, char ** argv) {
llama_free(ctx);
+ llama_free_model(model);
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index da4d37ad0..4c8688503 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -35,12 +35,22 @@ int main(int argc, char ** argv) {
auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
// init
- auto ctx = llama_init_from_file(params.model.c_str(), lparams);
+ auto model = llama_load_model_from_file(params.model.c_str(), lparams);
+ if (model == nullptr) {
+ return 1;
+ }
+ auto ctx = llama_new_context_with_model(model, lparams);
+ if (ctx == nullptr) {
+ llama_free_model(model);
+ return 1;
+ }
auto tokens = std::vector<llama_token>(params.n_ctx);
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
if (n_prompt_tokens < 1) {
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
+ llama_free(ctx);
+ llama_free_model(model);
return 1;
}
@@ -84,6 +94,8 @@ int main(int argc, char ** argv) {
printf("%s", next_token_str);
if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+ llama_free(ctx);
+ llama_free_model(model);
return 1;
}
n_past += 1;
@@ -91,23 +103,27 @@ int main(int argc, char ** argv) {
printf("\n\n");
- // free old model
+ // free old context
llama_free(ctx);
- // load new model
- auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
+ // make new context
+ auto ctx2 = llama_new_context_with_model(model, lparams);
// Load state (rng, logits, embedding and kv_cache) from file
{
FILE *fp_read = fopen("dump_state.bin", "rb");
if (state_size != llama_get_state_size(ctx2)) {
fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+ llama_free(ctx2);
+ llama_free_model(model);
return 1;
}
const size_t ret = fread(state_mem, 1, state_size, fp_read);
if (ret != state_size) {
fprintf(stderr, "\n%s : failed to read state\n", __func__);
+ llama_free(ctx2);
+ llama_free_model(model);
return 1;
}
@@ -138,6 +154,8 @@ int main(int argc, char ** argv) {
printf("%s", next_token_str);
if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+ llama_free(ctx2);
+ llama_free_model(model);
return 1;
}
n_past += 1;
@@ -145,5 +163,8 @@ int main(int argc, char ** argv) {
printf("\n\n");
+ llama_free(ctx2);
+ llama_free_model(model);
+
return 0;
}
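The split between a stateless `llama_model` and a stateful `llama_context` is what this example exercises: all mutable state (rng, logits, embeddings, kv cache) lives in the context and can be snapshotted and replayed onto a fresh context over the same loaded model. A condensed sketch of that flow (model path is a placeholder and error handling is trimmed):

```cpp
// Condensed sketch of the save/restore flow above (assumes the model file
// exists; error handling trimmed for brevity).
#include "llama.h"

#include <cstdint>
#include <vector>

int main() {
    llama_context_params lparams = llama_context_default_params();

    llama_model   * model = llama_load_model_from_file("models/7B/ggml-model.bin", lparams);
    llama_context * ctx   = llama_new_context_with_model(model, lparams);

    // ... llama_eval(ctx, ...) a few tokens ...

    // snapshot the per-context state (rng, logits, embedding, kv cache)
    std::vector<uint8_t> state(llama_get_state_size(ctx));
    llama_copy_state_data(ctx, state.data());

    // the context can be discarded and rebuilt on the same, still-loaded model
    llama_free(ctx);
    llama_context * ctx2 = llama_new_context_with_model(model, lparams);
    llama_set_state_data(ctx2, state.data());

    // ... continue generating from the restored state ...

    llama_free(ctx2);
    llama_free_model(model);
    return 0;
}
```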
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index c0984aadb..de22d3013 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -115,6 +115,7 @@ struct llama_server_context {
std::vector<llama_token> embd;
std::vector<llama_token> last_n_tokens;
+ llama_model * model = nullptr;
llama_context * ctx = nullptr;
gpt_params params;
@@ -130,6 +131,10 @@ struct llama_server_context {
llama_free(ctx);
ctx = nullptr;
}
+ if (model) {
+ llama_free_model(model);
+ model = nullptr;
+ }
}
void rewind() {
@@ -150,8 +155,8 @@ struct llama_server_context {
bool loadModel(const gpt_params & params_) {
params = params_;
- ctx = llama_init_from_gpt_params(params);
- if (ctx == nullptr) {
+ std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ if (model == nullptr) {
LOG_ERROR("unable to load model", { { "model", params_.model } });
return false;
}
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 76f991cdc..fc45c9340 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -68,11 +68,12 @@ int main(int argc, char ** argv)
llama_init_backend();
- llama_context * ctx ;
+ llama_model * model;
+ llama_context * ctx;
- ctx = llama_init_from_gpt_params( params );
+ std::tie(model, ctx) = llama_init_from_gpt_params( params );
- if ( ctx == NULL )
+ if ( model == NULL )
{
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
return 1;
@@ -170,6 +171,7 @@ int main(int argc, char ** argv)
} // wend of main loop
llama_free( ctx );
+ llama_free_model( model );
return 0;
}
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 7ec85951a..61c829e5c 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -3054,7 +3054,8 @@ int main(int argc, char ** argv) {
struct llama_context_params llama_params = llama_context_default_params();
llama_params.vocab_only = true;
- struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, llama_params);
+ struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params);
+ struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
struct llama_vocab vocab;
{
@@ -3395,6 +3396,8 @@ int main(int argc, char ** argv) {
delete[] compute_addr;
delete[] compute_buf_0;
delete[] compute_buf_1;
+ llama_free(lctx);
+ llama_free_model(lmodel);
ggml_free(model.ctx);
return 0;
diff --git a/llama.cpp b/llama.cpp
index e597f5048..a528eef4a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -182,6 +182,19 @@ struct llama_kv_cache {
}
};
+struct llama_vocab {
+ using id = int32_t;
+ using token = std::string;
+
+ struct token_score {
+ token tok;
+ float score;
+ };
+
+ std::unordered_map<token, id> token_to_id;
+ std::vector<token_score> id_to_token;
+};
+
struct llama_model {
e_model type = MODEL_UNKNOWN;
@@ -198,10 +211,6 @@ struct llama_model {
// context
struct ggml_context * ctx = NULL;
- // key + value cache for the self attention
- // TODO: move to llama_state
- struct llama_kv_cache kv_self;
-
// the model memory buffer
llama_ctx_buffer buf;
@@ -215,6 +224,11 @@ struct llama_model {
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
+ int64_t t_load_us = 0;
+ int64_t t_start_us = 0;
+
+ llama_vocab vocab;
+
~llama_model() {
if (ctx) {
ggml_free(ctx);
@@ -233,24 +247,11 @@ struct llama_model {
}
};
-struct llama_vocab {
- using id = int32_t;
- using token = std::string;
-
- struct token_score {
- token tok;
- float score;
- };
-
- std::unordered_map<token, id> token_to_id;
- std::vector<token_score> id_to_token;
-};
-
struct llama_context {
+ llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+
std::mt19937 rng;
- int64_t t_load_us = 0;
- int64_t t_start_us = 0;
bool has_evaluated_once = false;
int64_t t_sample_us = 0;
@@ -261,8 +262,16 @@ struct llama_context {
int32_t n_eval = 0; // number of eval calls
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
- llama_model model;
- llama_vocab vocab;
+ const llama_model & model;
+ const llama_vocab & vocab;
+
+ bool model_owner = false;
+
+ int64_t t_load_us;
+ int64_t t_start_us;
+
+ // key + value cache for the self attention
+ struct llama_kv_cache kv_self;
size_t mem_per_token = 0;
@@ -1033,7 +1042,8 @@ static const char *llama_model_type_name(e_model type) {
static void llama_model_load_internal(
const std::string & fname,
- llama_context & lctx,
+ llama_model & model,
+ llama_vocab & vocab,
int n_ctx,
int n_batch,
int n_gpu_layers,
@@ -1047,12 +1057,11 @@ static void llama_model_load_internal(
llama_progress_callback progress_callback,
void * progress_callback_user_data) {
- lctx.t_start_us = ggml_time_us();
+ model.t_start_us = ggml_time_us();
std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
- lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
- auto & model = lctx.model;
+ vocab = std::move(ml->file_loaders.at(0)->vocab);
model.hparams = ml->file_loaders.at(0)->hparams;
model.n_gpu_layers = n_gpu_layers;
llama_file_version file_version = ml->file_loaders.at(0)->file_version;
@@ -1122,15 +1131,15 @@ static void llama_model_load_internal(
// create the ggml context
{
- lctx.model.buf.resize(ctx_size);
+ model.buf.resize(ctx_size);
if (use_mlock) {
- lctx.model.mlock_buf.init(lctx.model.buf.addr);
- lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
+ model.mlock_buf.init(model.buf.addr);
+ model.mlock_buf.grow_to(model.buf.size);
}
struct ggml_init_params params = {
- /*.mem_size =*/ lctx.model.buf.size,
- /*.mem_buffer =*/ lctx.model.buf.addr,
+ /*.mem_size =*/ model.buf.size,
+ /*.mem_buffer =*/ model.buf.addr,
/*.no_alloc =*/ ml->use_mmap,
};
@@ -1311,7 +1320,7 @@ static void llama_model_load_internal(
}
#endif
- ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
if (progress_callback) {
progress_callback(1.0f, progress_callback_user_data);
@@ -1321,12 +1330,13 @@ static void llama_model_load_internal(
// loading time will be recalculate after the first eval, so
// we take page faults deferred by mmap() into consideration
- lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+ model.t_load_us = ggml_time_us() - model.t_start_us;
}
static bool llama_model_load(
const std::string & fname,
- llama_context & lctx,
+ llama_model & model,
+ llama_vocab & vocab,
int n_ctx,
int n_batch,
int n_gpu_layers,
@@ -1340,7 +1350,7 @@ static bool llama_model_load(
llama_progress_callback progress_callback,
void *progress_callback_user_data) {
try {
- llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+ llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
return true;
} catch (const std::exception & err) {
@@ -1378,7 +1388,7 @@ static bool llama_eval_internal(
const auto & model = lctx.model;
const auto & hparams = model.hparams;
- const auto & kv_self = model.kv_self;
+ const auto & kv_self = lctx.kv_self;
LLAMA_ASSERT(!!kv_self.ctx);
@@ -1726,7 +1736,7 @@ static bool llama_eval_internal(
//memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
// update kv token count
- lctx.model.kv_self.n = n_past + N;
+ lctx.kv_self.n = n_past + N;
// extract logits
{
@@ -2634,12 +2644,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// interface implementation
//
-struct llama_context * llama_init_from_file(
+struct llama_model * llama_load_model_from_file(
const char * path_model,
struct llama_context_params params) {
ggml_time_init();
- llama_context * ctx = new llama_context;
+ llama_model * model = new llama_model;
+
+ ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+ if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+ params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+ params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+ delete model;
+ fprintf(stderr, "%s: failed to load model\n", __func__);
+ return nullptr;
+ }
+
+ return model;
+}
+
+void llama_free_model(struct llama_model * model) {
+ delete model;
+}
+
+struct llama_context * llama_new_context_with_model(
+ struct llama_model * model,
+ struct llama_context_params params) {
+
+ if (!model) {
+ return nullptr;
+ }
+
+ llama_context * ctx = new llama_context(*model, model->vocab);
if (params.seed < 0) {
params.seed = time(NULL);
@@ -2667,24 +2704,16 @@ struct llama_context * llama_init_from_file(
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
- params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
- params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
- fprintf(stderr, "%s: failed to load model\n", __func__);
- llama_free(ctx);
- return nullptr;
- }
-
// reserve memory for context buffers
if (!params.vocab_only) {
- if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
+ if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
llama_free(ctx);
return nullptr;
}
{
- const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
+ const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
}
@@ -2736,8 +2765,8 @@ struct llama_context * llama_init_from_file(
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
@@ -2748,7 +2777,23 @@ struct llama_context * llama_init_from_file(
return ctx;
}
+struct llama_context * llama_init_from_file(
+ const char * path_model,
+ struct llama_context_params params) {
+
+ struct llama_model * model = llama_load_model_from_file(path_model, params);
+ if (!model) {
+ return nullptr;
+ }
+ struct llama_context * ctx = llama_new_context_with_model(model, params);
+ ctx->model_owner = true;
+ return ctx;
+}
+
void llama_free(struct llama_context * ctx) {
+ if (ctx->model_owner) {
+ delete &ctx->model;
+ }
delete ctx;
}
@@ -2765,11 +2810,9 @@ int llama_model_quantize(
}
}
-int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
- auto & model = ctx->model;
-
const int64_t t_start_lora_us = ggml_time_us();
auto fin = std::ifstream(path_lora, std::ios::binary);
@@ -3012,7 +3055,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
try {
- return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+ return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+ } catch (const std::exception & err) {
+ fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+ return 1;
+ }
+}
+
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+ try {
+ return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
} catch (const std::exception & err) {
fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
return 1;
@@ -3020,7 +3072,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
}
int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
- return ctx->model.kv_self.n;
+ return ctx->kv_self.n;
}
#define LLAMA_MAX_RNG_STATE (64*1024)
@@ -3045,7 +3097,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
const size_t s_embedding = ctx->embedding.size() * sizeof(float);
const size_t s_kv_size = sizeof(size_t);
const size_t s_kv_ntok = sizeof(int);
- const size_t s_kv = ctx->model.kv_self.buf.size;
+ const size_t s_kv = ctx->kv_self.buf.size;
const size_t s_total = (
+ s_rng_size
@@ -3111,7 +3163,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
// copy kv cache
{
- const auto & kv_self = ctx->model.kv_self;
+ const auto & kv_self = ctx->kv_self;
const auto & hparams = ctx->model.hparams;
const int n_layer = hparams.n_layer;
const int n_embd = hparams.n_embd;
@@ -3215,7 +3267,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
// set kv cache
{
- const auto & kv_self = ctx->model.kv_self;
+ const auto & kv_self = ctx->kv_self;
const auto & hparams = ctx->model.hparams;
const int n_layer = hparams.n_layer;
const int n_embd = hparams.n_embd;
@@ -3259,7 +3311,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
ggml_free(cpy_ctx);
}
- ctx->model.kv_self.n = kv_ntok;
+ ctx->kv_self.n = kv_ntok;
}
const size_t nread = inp - src;
@@ -3506,6 +3558,6 @@ const char * llama_print_system_info(void) {
}
// For internal test use
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
return ctx->model.tensors_by_name;
}
diff --git a/llama.h b/llama.h
index 0de530d45..a833a7f4d 100644
--- a/llama.h
+++ b/llama.h
@@ -26,6 +26,14 @@
# define LLAMA_API
#endif
+#ifdef __GNUC__
+# define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+# define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+# define DEPRECATED(func, hint) func
+#endif
+
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
@@ -53,6 +61,7 @@ extern "C" {
// TODO: show sample usage
//
+ struct llama_model;
struct llama_context;
typedef int llama_token;
@@ -136,12 +145,23 @@ extern "C" {
LLAMA_API int64_t llama_time_us();
+ LLAMA_API struct llama_model * llama_load_model_from_file(
+ const char * path_model,
+ struct llama_context_params params);
+
+ LLAMA_API void llama_free_model(struct llama_model * model);
+
+ LLAMA_API struct llama_context * llama_new_context_with_model(
+ struct llama_model * model,
+ struct llama_context_params params);
+
// Various functions for loading a ggml llama model.
// Allocate (almost) all memory needed for the model.
// Return NULL on failure
- LLAMA_API struct llama_context * llama_init_from_file(
+ LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
const char * path_model,
- struct llama_context_params params);
+ struct llama_context_params params),
+ "please use llama_load_model_from_file combined with llama_new_context_with_model instead");
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
@@ -158,8 +178,15 @@ extern "C" {
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
// will be applied on top of the previous one
// Returns 0 on success
- LLAMA_API int llama_apply_lora_from_file(
+ LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
struct llama_context * ctx,
+ const char * path_lora,
+ const char * path_base_model,
+ int n_threads),
+ "please use llama_model_apply_lora_from_file instead");
+
+ LLAMA_API int llama_model_apply_lora_from_file(
+ const struct llama_model * model,
const char * path_lora,
const char * path_base_model,
int n_threads);
@@ -310,7 +337,7 @@ extern "C" {
#include
struct ggml_tensor;
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
#endif
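In short, the public API now separates model loading from context creation, keeping `llama_init_from_file` only as a deprecated shim. A small migration sketch (the model and LoRA paths are placeholders):

```cpp
// Migration sketch (paths are placeholders). The one-shot llama_init_from_file
// still compiles, but now emits a deprecation warning.
#include "llama.h"

int main() {
    llama_context_params params = llama_context_default_params();

    // old (deprecated):
    //   llama_context * ctx = llama_init_from_file("models/7B/ggml-model.bin", params);

    // new: load the weights once, then create one or more contexts on top
    llama_model * model = llama_load_model_from_file("models/7B/ggml-model.bin", params);
    if (!model) return 1;

    // optional, replaces the deprecated llama_apply_lora_from_file on a context:
    // llama_model_apply_lora_from_file(model, "lora-adapter.bin", NULL, /*n_threads*/ 4);

    llama_context * ctx = llama_new_context_with_model(model, params);
    if (!ctx) { llama_free_model(model); return 1; }

    // ... inference ...

    llama_free(ctx);         // contexts created this way do not own the model,
    llama_free_model(model); // so the model is freed explicitly
    return 0;
}
```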
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index ab1538a0c..20abe7100 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -28,6 +28,7 @@ int main(int argc, char **argv) {
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
+ llama_model * model;
llama_context * ctx;
// load the vocab
@@ -36,10 +37,18 @@ int main(int argc, char **argv) {
lparams.vocab_only = true;
- ctx = llama_init_from_file(fname.c_str(), lparams);
+ model = llama_load_model_from_file(fname.c_str(), lparams);
+
+ if (model == NULL) {
+ fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+ return 1;
+ }
+
+ ctx = llama_new_context_with_model(model, lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+ llama_free_model(model);
return 1;
}
}
@@ -48,6 +57,8 @@ int main(int argc, char **argv) {
if (n_vocab != 32000) {
fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
+ llama_free_model(model);
+ llama_free(ctx);
return 2;
}
@@ -77,10 +88,13 @@ int main(int argc, char **argv) {
}
fprintf(stderr, "\n");
+ llama_free_model(model);
+ llama_free(ctx);
return 3;
}
}
+ llama_free_model(model);
llama_free(ctx);
return 0;
From b061ba9e2a7a2c335a200df8c11aed5e31e4ccbb Mon Sep 17 00:00:00 2001
From: Alex Renda
Date: Sat, 24 Jun 2023 03:15:01 -0700
Subject: [PATCH 03/12] llama : fix top-p sampling to match the canonical
definition (#1953)
* Fix top-p sampling to match the standard definition (smallest set that has probability mass at least p, not largest set with probability mass less than p)
* top-p: correct gt to gte
* add test for correct top-p behavior
---
llama.cpp | 7 ++++---
tests/test-sampling.cpp | 1 +
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index a528eef4a..ac22a48f8 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2015,9 +2015,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
for (size_t i = 0; i < candidates->size; ++i) {
cum_sum += candidates->data[i].p;
- // Check if the running sum is greater than p or if we have kept at least min_keep tokens
- if (cum_sum > p && i >= min_keep) {
- last_idx = i;
+ // Check if the running sum is at least p or if we have kept at least min_keep tokens
+ // we set the last index to i+1 to indicate that the current iterate should be included in the set
+ if (cum_sum >= p && i + 1 >= min_keep) {
+ last_idx = i + 1;
break;
}
}
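The corrected rule keeps the smallest prefix of the descending-sorted candidates whose cumulative probability reaches p, never fewer than min_keep tokens. A standalone illustration of that rule (not the llama.cpp API), checked against the probabilities used in the new test case below:

```cpp
// Standalone illustration of the corrected top-p rule: keep the smallest
// prefix of the sorted probabilities whose cumulative sum is >= p.
#include <cassert>
#include <cstddef>
#include <vector>

static size_t top_p_keep(const std::vector<float> & sorted_p, float p, size_t min_keep) {
    size_t last_idx = sorted_p.size();
    float cum_sum = 0.0f;
    for (size_t i = 0; i < sorted_p.size(); ++i) {
        cum_sum += sorted_p[i];
        if (cum_sum >= p && i + 1 >= min_keep) {
            last_idx = i + 1; // include the current token in the kept set
            break;
        }
    }
    return last_idx;
}

int main() {
    const std::vector<float> probs = {0.4f, 0.3f, 0.2f, 0.1f};
    // new rule: {0.4, 0.3, 0.2} is the smallest set with mass >= 0.8 -> keep 3 tokens;
    // the old "cum_sum > p, last_idx = i" logic kept only {0.4, 0.3} (mass 0.7 < p) here
    assert(top_p_keep(probs, 0.8f, 1) == 3);
    assert(top_p_keep(probs, 0.7f, 1) == 2);
    return 0;
}
```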
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 5d693f7b5..64f9455d7 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -181,6 +181,7 @@ int main(void) {
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f);
+ test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1);
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
From 235b610d650cbfed6dbd5d671f750d35fc18cd7d Mon Sep 17 00:00:00 2001
From: Alberto <57916483+albbus-stack@users.noreply.github.com>
Date: Sat, 24 Jun 2023 12:32:13 +0200
Subject: [PATCH 04/12] readme : fixed termux instructions (#1973)
---
README.md | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index b09498be6..10462c6b0 100644
--- a/README.md
+++ b/README.md
@@ -680,12 +680,13 @@ Upon completion of the aforementioned steps, you will have successfully compiled
```
GGML_OPENCL_PLATFORM=0
GGML_OPENCL_DEVICE=0
-export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH
-./main (...)
+export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
```
For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
+Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
+
### Docker
#### Prerequisites
From 11da1a85cd69af84b5861134738c7e9e20907470 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 24 Jun 2023 13:38:18 +0300
Subject: [PATCH 05/12] readme : fix whitespaces
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 10462c6b0..6aa6ce319 100644
--- a/README.md
+++ b/README.md
@@ -685,7 +685,7 @@ export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
-Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
+Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
### Docker
From f2c754e1c38936fdde74e4848ac468a696eb73c6 Mon Sep 17 00:00:00 2001
From: slaren
Date: Sat, 24 Jun 2023 12:57:18 +0200
Subject: [PATCH 06/12] ggml : improve ggml_graph_dump_dot, add
ggml_format_name (#1978)
* Improve ggml_graph_dump_dot, add ggml_format_name
* add more automatic names to view ops
* fix name of copies
---
ggml.c | 135 ++++++++++++++++++++++++++++++++++++++++-----------------
ggml.h | 1 +
2 files changed, 97 insertions(+), 39 deletions(-)
diff --git a/ggml.c b/ggml.c
index 4319683f5..ef9e8585d 100644
--- a/ggml.c
+++ b/ggml.c
@@ -24,6 +24,7 @@
#include
#include
#include
+#include <stdarg.h>
#ifdef GGML_USE_METAL
#include
@@ -4734,10 +4735,19 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam
return tensor;
}
+struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
+ va_list args;
+ va_start(args, fmt);
+ vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
+ va_end(args);
+ return tensor;
+}
+
struct ggml_tensor * ggml_view_tensor(
struct ggml_context * ctx,
const struct ggml_tensor * src) {
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+ ggml_format_name(result, "%s (view)", src->name);
result->nb[0] = src->nb[0];
result->nb[1] = src->nb[1];
@@ -5899,6 +5909,11 @@ struct ggml_tensor * ggml_cpy_impl(
// make a view of the destination
struct ggml_tensor * result = ggml_view_tensor(ctx, b);
+ if (strlen(b->name) > 0) {
+ ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
+ } else {
+ ggml_format_name(result, "%s (copy)", a->name);
+ }
result->op = GGML_OP_CPY;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5935,6 +5950,7 @@ struct ggml_tensor * ggml_cont_impl(
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+ ggml_format_name(result, "%s (cont)", a->name);
result->op = GGML_OP_CONT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5978,6 +5994,7 @@ struct ggml_tensor * ggml_reshape(
}
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
+ ggml_format_name(result, "%s (reshaped)", a->name);
result->op = GGML_OP_RESHAPE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6002,6 +6019,7 @@ struct ggml_tensor * ggml_reshape_1d(
const int64_t ne[1] = { ne0 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
+ ggml_format_name(result, "%s (reshaped)", a->name);
result->op = GGML_OP_RESHAPE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6027,6 +6045,7 @@ struct ggml_tensor * ggml_reshape_2d(
const int64_t ne[2] = { ne0, ne1 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
+ ggml_format_name(result, "%s (reshaped)", a->name);
result->op = GGML_OP_RESHAPE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6053,6 +6072,7 @@ struct ggml_tensor * ggml_reshape_3d(
const int64_t ne[3] = { ne0, ne1, ne2 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
+ ggml_format_name(result, "%s (reshaped)", a->name);
result->op = GGML_OP_RESHAPE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6081,6 +6101,7 @@ struct ggml_tensor * ggml_reshape_4d(
const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
+ ggml_format_name(result, "%s (reshaped)", a->name);
result->op = GGML_OP_RESHAPE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6105,10 +6126,12 @@ struct ggml_tensor * ggml_view_1d(
}
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
+ ggml_format_name(result, "%s (view)", a->name);
ggml_scratch_save(ctx);
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ ggml_set_name(offs, "offset");
memcpy(offs->data, &offset, 2*sizeof(int32_t));
ggml_scratch_load(ctx);
@@ -6141,10 +6164,12 @@ struct ggml_tensor * ggml_view_2d(
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
+ ggml_format_name(result, "%s (view)", a->name);
ggml_scratch_save(ctx);
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ ggml_set_name(offs, "offset");
memcpy(offs->data, &offset, 2*sizeof(int32_t));
ggml_scratch_load(ctx);
@@ -6183,10 +6208,12 @@ struct ggml_tensor * ggml_view_3d(
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
+ ggml_format_name(result, "%s (view)", a->name);
ggml_scratch_save(ctx);
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ ggml_set_name(offs, "offset");
memcpy(offs->data, &offset, 2*sizeof(int32_t));
ggml_scratch_load(ctx);
@@ -6227,10 +6254,12 @@ struct ggml_tensor * ggml_view_4d(
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
+ ggml_format_name(result, "%s (view)", a->name);
ggml_scratch_save(ctx);
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ ggml_set_name(offs, "offset");
memcpy(offs->data, &offset, 2*sizeof(int32_t));
ggml_scratch_load(ctx);
@@ -6276,6 +6305,7 @@ struct ggml_tensor * ggml_permute(
}
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+ ggml_format_name(result, "%s (permuted)", a->name);
int ne[GGML_MAX_DIMS];
int nb[GGML_MAX_DIMS];
@@ -6335,6 +6365,7 @@ struct ggml_tensor * ggml_transpose(
}
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+ ggml_format_name(result, "%s (transposed)", a->name);
result->ne[0] = a->ne[1];
result->ne[1] = a->ne[0];
@@ -16004,7 +16035,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
if (strlen(node->name) == 0) {
- snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
+ ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
}
cgraph->leafs[cgraph->n_leafs] = node;
@@ -16013,7 +16044,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
if (strlen(node->name) == 0) {
- snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
+ ggml_format_name(node, "node_%d", cgraph->n_nodes);
}
cgraph->nodes[cgraph->n_nodes] = node;
@@ -17397,6 +17428,26 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr
return NULL;
}
+static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
+ struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
+ struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
+ fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
+ gparent0 ? (void *) gparent0 : (void *) parent,
+ gparent0 ? "g" : "x",
+ gparent ? (void *) gparent : (void *) node,
+ gparent ? "g" : "x",
+ gparent ? "empty" : "vee",
+ gparent ? "dashed" : "solid",
+ label);
+}
+
+static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
+ fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
+ (void *) parent, "x",
+ (void *) node, "x",
+ label);
+}
+
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
char color[16];
@@ -17432,7 +17483,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
(void *) node, color);
if (strlen(node->name) > 0) {
- fprintf(fp, "%s |", node->name);
+ fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
+ } else {
+ fprintf(fp, "(%s)|", ggml_type_name(node->type));
}
if (node->n_dims == 2) {
@@ -17441,7 +17494,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
}
-
if (node->grad) {
fprintf(fp, " | %s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
} else {
@@ -17460,18 +17512,29 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
(void *) node, color);
if (strlen(node->name) > 0) {
- fprintf(fp, "%s | ", node->name);
+ fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
+ } else {
+ fprintf(fp, "(%s)|", ggml_type_name(node->type));
}
- if (ggml_nelements(node) == 1) {
- if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
- fprintf(fp, "%d", ggml_get_i32_1d(node, 0));
+
+ fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
+ if (ggml_nelements(node) < 5) {
+ fprintf(fp, " | (");
+ for (int j = 0; j < ggml_nelements(node); j++) {
+ if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
+ fprintf(fp, "%d", ggml_get_i32_1d(node, j));
+ }
+ else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
+ fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
+ }
+ else {
+ fprintf(fp, "#");
+ }
+ if (j < ggml_nelements(node) - 1) {
+ fprintf(fp, ", ");
+ }
}
- else {
- fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0));
- }
- }
- else {
- fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
+ fprintf(fp, ")");
}
fprintf(fp, "\"; ]\n");
}
@@ -17479,30 +17542,20 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
for (int i = 0; i < gb->n_nodes; i++) {
struct ggml_tensor * node = gb->nodes[i];
- struct ggml_tensor * parent = ggml_graph_get_parent(gb, node);
-
if (node->src0) {
- struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0);
-
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n",
- parent0 ? (void *) parent0 : (void *) node->src0,
- parent0 ? "g" : "x",
- parent ? (void *) parent : (void *) node,
- parent ? "g" : "x",
- parent ? "empty" : "vee",
- parent ? "dashed" : "solid");
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x");
}
if (node->src1) {
- struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1);
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y");
+ }
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n",
- parent1 ? (void *) parent1 : (void *) node->src1,
- parent1 ? "g" : "x",
- parent ? (void *) parent : (void *) node,
- parent ? "g" : "x",
- parent ? "empty" : "vee",
- parent ? "dashed" : "solid");
+ for (int j = 0; j < GGML_MAX_OPT; j++) {
+ if (node->opt[j]) {
+ char label[16];
+ snprintf(label, sizeof(label), "opt %d", j);
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label);
+ }
}
}
@@ -17510,15 +17563,19 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
struct ggml_tensor * node = gb->leafs[i];
if (node->src0) {
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n",
- (void *) node->src0, "x",
- (void *) node, "x");
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x");
}
if (node->src1) {
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"y\"; ]\n",
- (void *) node->src1, "x",
- (void *) node, "x");
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y");
+ }
+
+ for (int j = 0; j < GGML_MAX_OPT; j++) {
+ if (node->opt[j]) {
+ char label[16];
+ snprintf(label, sizeof(label), "opt %d", j);
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label);
+ }
}
}
diff --git a/ggml.h b/ggml.h
index 18c78551f..4b6b72845 100644
--- a/ggml.h
+++ b/ggml.h
@@ -563,6 +563,7 @@ extern "C" {
GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
+ GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
//
// operations on tensors with backpropagation
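A small sketch of how the new helper is used (buffer size and tensor shapes are arbitrary): names set with `ggml_format_name` propagate into the automatic "(view)", "(reshaped)", "(transposed)" labels added above, which is what makes the dot dumps readable.

```cpp
// Minimal sketch of ggml_format_name and the automatic view/transpose names
// (memory size and tensor shapes chosen arbitrarily for illustration).
#include "ggml.h"

#include <cstdio>

int main() {
    struct ggml_init_params ip = { 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 8);
    ggml_format_name(w, "wq_%d", 0);                     // printf-style naming

    struct ggml_tensor * v = ggml_view_tensor(ctx, w);   // name becomes "wq_0 (view)"
    struct ggml_tensor * t = ggml_transpose(ctx, w);     // name becomes "wq_0 (transposed)"

    printf("%s\n%s\n", ggml_get_name(v), ggml_get_name(t));

    ggml_free(ctx);
    return 0;
}
```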
From c943d823c14cef33092205ca3944de6fdf7abf99 Mon Sep 17 00:00:00 2001
From: AN Long
Date: Sat, 24 Jun 2023 19:02:06 +0800
Subject: [PATCH 07/12] convert : fix invalid params in write_vocab_only
(#1975)
---
convert.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/convert.py b/convert.py
index de6c39c67..e340d2273 100644
--- a/convert.py
+++ b/convert.py
@@ -998,9 +998,9 @@ class OutputFile:
def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
of = OutputFile(fname_out)
params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0,
- n_head=1, n_layer=0, file_type=GGMLFileType.AllF32)
+ n_head=1, n_layer=0)
of = OutputFile(fname_out)
- of.write_file_header(params)
+ of.write_file_header(params, file_type=GGMLFileType.AllF32)
of.write_vocab(vocab)
of.fout.close()
From fdd18609113862dc6eb34dfc44a093d54c59ff1f Mon Sep 17 00:00:00 2001
From: Rowan Hart
Date: Sat, 24 Jun 2023 04:07:08 -0700
Subject: [PATCH 08/12] flake : fix ggml-metal.metal path and run nixfmt
(#1974)
---
flake.nix | 50 ++++++++++++++++++++++++++------------------------
1 file changed, 26 insertions(+), 24 deletions(-)
diff --git a/flake.nix b/flake.nix
index bba3d71f7..cebb47b94 100644
--- a/flake.nix
+++ b/flake.nix
@@ -9,27 +9,33 @@
inherit (pkgs.stdenv) isAarch64 isDarwin;
inherit (pkgs.lib) optionals;
isM1 = isAarch64 && isDarwin;
- osSpecific =
- if isM1 then with pkgs.darwin.apple_sdk_11_0.frameworks; [ Accelerate MetalKit MetalPerformanceShaders MetalPerformanceShadersGraph ]
- else if isDarwin then with pkgs.darwin.apple_sdk.frameworks; [ Accelerate CoreGraphics CoreVideo ]
- else [ ];
- pkgs = import nixpkgs {
- inherit system;
- };
- llama-python = pkgs.python310.withPackages (ps: with ps; [
- numpy
- sentencepiece
- ]);
- in
- {
+ osSpecific = if isM1 then
+ with pkgs.darwin.apple_sdk_11_0.frameworks; [
+ Accelerate
+ MetalKit
+ MetalPerformanceShaders
+ MetalPerformanceShadersGraph
+ ]
+ else if isDarwin then
+ with pkgs.darwin.apple_sdk.frameworks; [
+ Accelerate
+ CoreGraphics
+ CoreVideo
+ ]
+ else
+ [ ];
+ pkgs = import nixpkgs { inherit system; };
+ llama-python =
+ pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]);
+ in {
packages.default = pkgs.stdenv.mkDerivation {
name = "llama.cpp";
src = ./.;
- postPatch =
- if isM1 then ''
- substituteInPlace ./ggml-metal.m \
- --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/ggml-metal.metal\";"
- '' else "";
+ postPatch = if isM1 then ''
+ substituteInPlace ./ggml-metal.m \
+ --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+ '' else
+ "";
nativeBuildInputs = with pkgs; [ cmake ];
buildInputs = osSpecific;
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [
@@ -62,11 +68,7 @@
};
apps.default = self.apps.${system}.llama;
devShells.default = pkgs.mkShell {
- packages = with pkgs; [
- cmake
- llama-python
- ] ++ osSpecific;
+ packages = with pkgs; [ cmake llama-python ] ++ osSpecific;
};
- }
- );
+ });
}
From 65bdd52a867539691007f85c5508146d507f72c1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 24 Jun 2023 19:40:18 +0300
Subject: [PATCH 09/12] tests : sync test-grad0 from ggml
---
tests/test-grad0.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/tests/test-grad0.c b/tests/test-grad0.c
index c8c2c0f71..b5a499c1d 100644
--- a/tests/test-grad0.c
+++ b/tests/test-grad0.c
@@ -1,3 +1,4 @@
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
#include "ggml.h"
#include
@@ -5,6 +6,10 @@
#include
#include
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
#define MAX_NARGS 3
#undef MIN
@@ -197,8 +202,23 @@ bool check_gradient(
float max_error_abs,
float max_error_rel) {
+ static int n_threads = -1;
+ if (n_threads < 0) {
+ n_threads = GGML_DEFAULT_N_THREADS;
+
+ const char *env = getenv("GGML_N_THREADS");
+ if (env) {
+ n_threads = atoi(env);
+ }
+
+ printf("GGML_N_THREADS = %d\n", n_threads);
+ }
+
struct ggml_cgraph gf = ggml_build_forward (f);
+ gf.n_threads = n_threads;
+
struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
+ gb.n_threads = n_threads;
ggml_graph_compute(ctx0, &gf);
ggml_graph_reset (&gf);
From 5ec8dd5a3c6a9a109351d2257bb9d53869bd0a94 Mon Sep 17 00:00:00 2001
From: Robyn
Date: Sun, 25 Jun 2023 04:10:29 +1000
Subject: [PATCH 10/12] #1869 Fix null reference errors when training from
scratch with CUDA (#1907)
* #1869 Fix null reference errors when training from scratch with CUDA build
Calling ggml_compute_forward when node->src0 was null was causing train-text-from-scratch.exe to terminate unexpectedly.
* ggml : do not dereference src0 if NULL
---------
Co-authored-by: Georgi Gerganov
---
ggml-cuda.cu | 2 +-
ggml.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 36a251ecc..010682edb 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2635,7 +2635,7 @@ void ggml_cuda_free_scratch() {
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
ggml_cuda_func_t func;
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
- || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT
+ || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
|| (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
switch (tensor->op) {
diff --git a/ggml.c b/ggml.c
index ef9e8585d..7104be01b 100644
--- a/ggml.c
+++ b/ggml.c
@@ -14911,7 +14911,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
if (skip_cpu) {
return;
}
- GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
+ GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU);
GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
#endif // GGML_USE_CUBLAS
From e65ca7e14ac76c4046091da39d41a9017abaa9b3 Mon Sep 17 00:00:00 2001
From: sjinzh
Date: Sun, 25 Jun 2023 13:45:44 +0800
Subject: [PATCH 11/12] zig : upgrade build system support (#1981)
* upgrade zig build system support
* zig : add new line at the end of the file
---------
Co-authored-by: Georgi Gerganov
---
build.zig | 87 +++++++++++++++++++++++++++----------------------------
1 file changed, 42 insertions(+), 45 deletions(-)
diff --git a/build.zig b/build.zig
index 306127ffe..49c159ebf 100644
--- a/build.zig
+++ b/build.zig
@@ -1,61 +1,58 @@
const std = @import("std");
+// Zig Version: 0.11.0-dev.3379+629f0d23b
pub fn build(b: *std.build.Builder) void {
const target = b.standardTargetOptions(.{});
- const optimize = b.standardReleaseOptions();
- const want_lto = b.option(bool, "lto", "Want -fLTO");
-
- const lib = b.addStaticLibrary("llama", null);
- lib.want_lto = want_lto;
- lib.setTarget(target);
- lib.setBuildMode(optimize);
+ const optimize = b.standardOptimizeOption(.{});
+ const lib = b.addStaticLibrary(.{
+ .name = "llama",
+ .target = target,
+ .optimize = optimize,
+ });
+ lib.linkLibC();
lib.linkLibCpp();
lib.addIncludePath(".");
- lib.addIncludePath("examples");
+ lib.addIncludePath("./examples");
lib.addCSourceFiles(&.{
"ggml.c",
}, &.{"-std=c11"});
lib.addCSourceFiles(&.{
"llama.cpp",
}, &.{"-std=c++11"});
- lib.install();
+ b.installArtifact(lib);
- const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };
+ const examples = .{
+ "main",
+ "baby-llama",
+ "embedding",
+ // "metal",
+ "perplexity",
+ "quantize",
+ "quantize-stats",
+ "save-load-state",
+ // "server",
+ "simple",
+ "train-text-from-scratch",
+ };
- const exe = build_example("main", build_args);
- _ = build_example("quantize", build_args);
- _ = build_example("perplexity", build_args);
- _ = build_example("embedding", build_args);
-
- // create "zig build run" command for ./main
-
- const run_cmd = exe.run();
- run_cmd.step.dependOn(b.getInstallStep());
- if (b.args) |args| {
- run_cmd.addArgs(args);
+ inline for (examples) |example_name| {
+ const exe = b.addExecutable(.{
+ .name = example_name,
+ .target = target,
+ .optimize = optimize,
+ });
+ exe.addIncludePath(".");
+ exe.addIncludePath("./examples");
+ exe.addCSourceFiles(&.{
+ std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}),
+ "examples/common.cpp",
+ }, &.{"-std=c++11"});
+ exe.linkLibrary(lib);
+ b.installArtifact(exe);
+ const run_cmd = b.addRunArtifact(exe);
+ run_cmd.step.dependOn(b.getInstallStep());
+ if (b.args) |args| run_cmd.addArgs(args);
+ const run_step = b.step("run_" ++ example_name, "Run the app");
+ run_step.dependOn(&run_cmd.step);
}
-
- const run_step = b.step("run", "Run the app");
- run_step.dependOn(&run_cmd.step);
-}
-
-fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
- const b = args.b;
- const lib = args.lib;
- const want_lto = args.want_lto;
-
- const exe = b.addExecutable(name, null);
- exe.want_lto = want_lto;
- lib.setTarget(args.target);
- lib.setBuildMode(args.optimize);
- exe.addIncludePath(".");
- exe.addIncludePath("examples");
- exe.addCSourceFiles(&.{
- std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
- "examples/common.cpp",
- }, &.{"-std=c++11"});
- exe.linkLibrary(lib);
- exe.install();
-
- return exe;
}
From 66a2555ba6cab954c56d653b29c27bfbbacfbfb1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 25 Jun 2023 09:07:03 +0300
Subject: [PATCH 12/12] readme : add Azure CI discussion link
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index 6aa6ce319..3a71e16db 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
**Hot topics:**
+- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729