Merge branch 'master' into concedo_experimental

# Conflicts:
#	README.md
#	build.zig
#	flake.nix
#	tests/test-grad0.c
#	tests/test-sampling.cpp
#	tests/test-tokenizer-0.cpp
Committed by Concedo on 2023-06-25 17:01:15 +08:00
Commit d2034ced7b
19 changed files with 346 additions and 149 deletions

View file

@@ -998,9 +998,9 @@ class OutputFile:
     def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of = OutputFile(fname_out)
         params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0,
-                        n_head=1, n_layer=0, file_type=GGMLFileType.AllF32)
+                        n_head=1, n_layer=0)
         of = OutputFile(fname_out)
-        of.write_file_header(params)
+        of.write_file_header(params, file_type=GGMLFileType.AllF32)
         of.write_vocab(vocab)
         of.fout.close()

View file

@@ -536,7 +536,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
     return res;
 }
 
-struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();
 
     lparams.n_ctx        = params.n_ctx;
@@ -552,25 +552,33 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     lparams.logits_all = params.perplexity;
     lparams.embedding  = params.embedding;
 
-    llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
-
-    if (lctx == NULL) {
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
+    if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return NULL;
+        return std::make_tuple(nullptr, nullptr);
+    }
+
+    llama_context * lctx = llama_new_context_with_model(model, lparams);
+    if (lctx == NULL) {
+        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        llama_free_model(model);
+        return std::make_tuple(nullptr, nullptr);
     }
 
     if (!params.lora_adapter.empty()) {
-        int err = llama_apply_lora_from_file(lctx,
+        int err = llama_model_apply_lora_from_file(model,
                                              params.lora_adapter.c_str(),
                                              params.lora_base.empty() ? NULL : params.lora_base.c_str(),
                                              params.n_threads);
         if (err != 0) {
             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            return NULL;
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
         }
     }
 
-    return lctx;
+    return std::make_tuple(model, lctx);
 }
 
 void console_init(console_state & con_st) {

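The helper above now hands back both the model and the context; a minimal caller-side sketch of the consuming pattern (the run_prompt wrapper is hypothetical, the calls are the ones this commit introduces):

// Hypothetical caller sketch mirroring how the examples below consume the
// tuple-returning llama_init_from_gpt_params helper.
#include <cstdio>
#include <tuple>

#include "common.h"
#include "llama.h"

static int run_prompt(const gpt_params & params) {
    llama_model   * model = nullptr;
    llama_context * ctx   = nullptr;

    // The helper now loads the weights and creates the context in two steps internally.
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model '%s'\n", params.model.c_str());
        return 1;
    }

    // ... tokenize, llama_eval(), sampling ...

    // Free the context before the model that backs it.
    llama_free(ctx);
    llama_free_model(model);
    return 0;
}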
View file

@@ -9,6 +9,7 @@
 #include <random>
 #include <thread>
 #include <unordered_map>
+#include <tuple>
 
 #if !defined (_WIN32)
 #include <stdio.h>
@@ -95,7 +96,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
 // Model utils
 //
 
-struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
 
 //
 // Console utils

View file

@@ -37,11 +37,12 @@ int main(int argc, char ** argv) {
 
     llama_init_backend();
 
+    llama_model * model;
     llama_context * ctx;
 
     // load the model
-    ctx = llama_init_from_gpt_params(params);
-    if (ctx == NULL) {
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }
@@ -90,6 +91,7 @@ int main(int argc, char ** argv) {
 
     llama_print_timings(ctx);
     llama_free(ctx);
+    llama_free_model(model);
 
     return 0;
 }

View file

@@ -107,12 +107,13 @@ int main(int argc, char ** argv) {
 
     llama_init_backend();
 
+    llama_model * model;
     llama_context * ctx;
     g_ctx = &ctx;
 
     // load the model and apply lora adapter, if any
-    ctx = llama_init_from_gpt_params(params);
-    if (ctx == NULL) {
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }
@@ -139,6 +140,7 @@ int main(int argc, char ** argv) {
 
         llama_print_timings(ctx);
         llama_free(ctx);
+        llama_free_model(model);
 
         return 0;
     }
@@ -147,6 +149,7 @@ int main(int argc, char ** argv) {
     if (params.export_cgraph) {
         llama_eval_export(ctx, "llama.ggml");
         llama_free(ctx);
+        llama_free_model(model);
 
         return 0;
     }
@@ -666,6 +669,7 @@ int main(int argc, char ** argv) {
 
     llama_print_timings(ctx);
     llama_free(ctx);
+    llama_free_model(model);
 
     return 0;
 }

View file

@@ -149,11 +149,12 @@ int main(int argc, char ** argv) {
 
     llama_init_backend();
 
+    llama_model * model;
     llama_context * ctx;
 
     // load the model and apply lora adapter, if any
-    ctx = llama_init_from_gpt_params(params);
-    if (ctx == NULL) {
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }
@@ -169,6 +170,7 @@ int main(int argc, char ** argv) {
 
     llama_print_timings(ctx);
     llama_free(ctx);
+    llama_free_model(model);
 
     return 0;
 }

View file

@@ -320,6 +320,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "Loading model\n");
 
     const int64_t t_main_start_us = ggml_time_us();
+    llama_model * model;
     llama_context * ctx;
 
     {
@@ -330,10 +331,18 @@ int main(int argc, char ** argv) {
         lparams.f16_kv     = false;
         lparams.use_mlock  = false;
 
-        ctx = llama_init_from_file(params.model.c_str(), lparams);
+        model = llama_load_model_from_file(params.model.c_str(), lparams);
 
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+
+        ctx = llama_new_context_with_model(model, lparams);
 
         if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+            llama_free_model(model);
             return 1;
         }
     }
@@ -357,6 +366,7 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
                 "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
             llama_free(ctx);
+            llama_free_model(model);
             return 1;
         }
         included_layers++;
@@ -415,6 +425,7 @@ int main(int argc, char ** argv) {
 
 
     llama_free(ctx);
+    llama_free_model(model);
     // report timing
     {
         const int64_t t_main_end_us = ggml_time_us();

View file

@@ -35,12 +35,22 @@ int main(int argc, char ** argv) {
     auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
 
     // init
-    auto ctx = llama_init_from_file(params.model.c_str(), lparams);
+    auto model = llama_load_model_from_file(params.model.c_str(), lparams);
+    if (model == nullptr) {
+        return 1;
+    }
+
+    auto ctx = llama_new_context_with_model(model, lparams);
+    if (ctx == nullptr) {
+        llama_free_model(model);
+        return 1;
+    }
     auto tokens = std::vector<llama_token>(params.n_ctx);
     auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
 
     if (n_prompt_tokens < 1) {
         fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
+        llama_free(ctx);
+        llama_free_model(model);
         return 1;
     }
@@ -84,6 +94,8 @@ int main(int argc, char ** argv) {
         printf("%s", next_token_str);
         if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_free(ctx);
+            llama_free_model(model);
             return 1;
         }
         n_past += 1;
@@ -91,23 +103,27 @@ int main(int argc, char ** argv) {
 
     printf("\n\n");
 
-    // free old model
+    // free old context
     llama_free(ctx);
 
-    // load new model
-    auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
+    // make new context
+    auto ctx2 = llama_new_context_with_model(model, lparams);
 
     // Load state (rng, logits, embedding and kv_cache) from file
     {
         FILE *fp_read = fopen("dump_state.bin", "rb");
         if (state_size != llama_get_state_size(ctx2)) {
             fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+            llama_free(ctx2);
+            llama_free_model(model);
             return 1;
         }
 
         const size_t ret = fread(state_mem, 1, state_size, fp_read);
         if (ret != state_size) {
             fprintf(stderr, "\n%s : failed to read state\n", __func__);
+            llama_free(ctx2);
+            llama_free_model(model);
             return 1;
         }
@@ -138,6 +154,8 @@ int main(int argc, char ** argv) {
         printf("%s", next_token_str);
         if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_free(ctx2);
+            llama_free_model(model);
             return 1;
         }
         n_past += 1;
@@ -145,5 +163,8 @@ int main(int argc, char ** argv) {
 
     printf("\n\n");
 
+    llama_free(ctx2);
+    llama_free_model(model);
+
     return 0;
 }

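Because the weights now live in a standalone llama_model, this example can drop and recreate contexts without reloading the file; a minimal sketch of that pattern, with an arbitrary model path argument:

// Sketch: reuse one loaded model across two contexts, as the save/load-state
// example above now does.
#include "llama.h"

static int reuse_model(const char * model_path) {
    llama_context_params lparams = llama_context_default_params();

    llama_model * model = llama_load_model_from_file(model_path, lparams);
    if (model == nullptr) {
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, lparams);
    // ... evaluate a prompt, save state with llama_copy_state_data(ctx, ...) ...
    llama_free(ctx);                 // frees the context and its KV cache only

    llama_context * ctx2 = llama_new_context_with_model(model, lparams);
    // ... restore state with llama_set_state_data(ctx2, ...) and keep generating ...
    llama_free(ctx2);

    llama_free_model(model);         // the weights are released once, at the end
    return 0;
}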
View file

@@ -115,6 +115,7 @@ struct llama_server_context {
     std::vector<llama_token> embd;
     std::vector<llama_token> last_n_tokens;
 
+    llama_model * model = nullptr;
     llama_context * ctx = nullptr;
     gpt_params params;
 
@@ -130,6 +131,10 @@ struct llama_server_context {
             llama_free(ctx);
             ctx = nullptr;
         }
+        if (model) {
+            llama_free_model(model);
+            model = nullptr;
+        }
     }
 
     void rewind() {
@@ -150,8 +155,8 @@ struct llama_server_context {
     bool loadModel(const gpt_params & params_) {
         params = params_;
 
-        ctx = llama_init_from_gpt_params(params);
-        if (ctx == nullptr) {
+        std::tie(model, ctx) = llama_init_from_gpt_params(params);
+        if (model == nullptr) {
             LOG_ERROR("unable to load model", { { "model", params_.model } });
             return false;
         }

View file

@@ -68,11 +68,12 @@ int main(int argc, char ** argv)
 
     llama_init_backend();
 
+    llama_model * model;
     llama_context * ctx;
 
-    ctx = llama_init_from_gpt_params( params );
-    if ( ctx == NULL )
+    std::tie(model, ctx) = llama_init_from_gpt_params( params );
+    if ( model == NULL )
     {
         fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
         return 1;
@@ -170,6 +171,7 @@ int main(int argc, char ** argv)
     } // wend of main loop
 
     llama_free( ctx );
+    llama_free_model( model );
 
     return 0;
 }

View file

@@ -3054,7 +3054,8 @@ int main(int argc, char ** argv) {
     struct llama_context_params llama_params = llama_context_default_params();
     llama_params.vocab_only = true;
 
-    struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, llama_params);
+    struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params);
+    struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
 
     struct llama_vocab vocab;
     {
@@ -3395,6 +3396,8 @@ int main(int argc, char ** argv) {
     delete[] compute_addr;
     delete[] compute_buf_0;
     delete[] compute_buf_1;
+    llama_free(lctx);
+    llama_free_model(lmodel);
     ggml_free(model.ctx);
 
     return 0;

View file

@@ -2635,7 +2635,7 @@ void ggml_cuda_free_scratch() {
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT
+        || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
         || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
 
     switch (tensor->op) {

ggml.c (137 lines changed)
View file

@@ -24,6 +24,7 @@
 #include <stdio.h>
 #include <float.h>
 #include <limits.h>
+#include <stdarg.h>
 
 #ifdef GGML_USE_METAL
 #include <unistd.h>
@@ -4734,10 +4735,19 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam
     return tensor;
 }
 
+struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
+    va_end(args);
+    return tensor;
+}
+
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
         const struct ggml_tensor * src) {
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+    ggml_format_name(result, "%s (view)", src->name);
 
     result->nb[0] = src->nb[0];
     result->nb[1] = src->nb[1];
@@ -5899,6 +5909,11 @@ struct ggml_tensor * ggml_cpy_impl(
 
     // make a view of the destination
     struct ggml_tensor * result = ggml_view_tensor(ctx, b);
+    if (strlen(b->name) > 0) {
+        ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
+    } else {
+        ggml_format_name(result, "%s (copy)", a->name);
+    }
 
     result->op   = GGML_OP_CPY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5935,6 +5950,7 @@ struct ggml_tensor * ggml_cont_impl(
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    ggml_format_name(result, "%s (cont)", a->name);
 
     result->op   = GGML_OP_CONT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5978,6 +5994,7 @@ struct ggml_tensor * ggml_reshape(
     }
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6002,6 +6019,7 @@ struct ggml_tensor * ggml_reshape_1d(
 
     const int64_t ne[1] = { ne0 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6027,6 +6045,7 @@ struct ggml_tensor * ggml_reshape_2d(
 
     const int64_t ne[2] = { ne0, ne1 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6053,6 +6072,7 @@ struct ggml_tensor * ggml_reshape_3d(
 
     const int64_t ne[3] = { ne0, ne1, ne2 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6081,6 +6101,7 @@ struct ggml_tensor * ggml_reshape_4d(
 
     const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6105,10 +6126,12 @@ struct ggml_tensor * ggml_view_1d(
     }
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6141,10 +6164,12 @@ struct ggml_tensor * ggml_view_2d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6183,10 +6208,12 @@ struct ggml_tensor * ggml_view_3d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6227,10 +6254,12 @@ struct ggml_tensor * ggml_view_4d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6276,6 +6305,7 @@ struct ggml_tensor * ggml_permute(
     }
 
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    ggml_format_name(result, "%s (permuted)", a->name);
 
     int ne[GGML_MAX_DIMS];
     int nb[GGML_MAX_DIMS];
@@ -6335,6 +6365,7 @@ struct ggml_tensor * ggml_transpose(
     }
 
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    ggml_format_name(result, "%s (transposed)", a->name);
 
     result->ne[0] = a->ne[1];
     result->ne[1] = a->ne[0];
@@ -14880,7 +14911,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     if (skip_cpu) {
         return;
     }
-    GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
+    GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU);
     GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
 #endif // GGML_USE_CUBLAS
@@ -16004,7 +16035,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
 
         if (strlen(node->name) == 0) {
-            snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
+            ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
         }
 
         cgraph->leafs[cgraph->n_leafs] = node;
@@ -16013,7 +16044,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
 
         if (strlen(node->name) == 0) {
-            snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
+            ggml_format_name(node, "node_%d", cgraph->n_nodes);
         }
 
         cgraph->nodes[cgraph->n_nodes] = node;
@@ -17397,6 +17428,26 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr
     return NULL;
 }
 
+static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label)  {
+    struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
+    struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
+
+    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
+            gparent0 ? (void *) gparent0 : (void *) parent,
+            gparent0 ? "g" : "x",
+            gparent ? (void *) gparent : (void *) node,
+            gparent ? "g" : "x",
+            gparent ? "empty" : "vee",
+            gparent ? "dashed" : "solid",
+            label);
+}
+
+static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label)  {
+    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
+            (void *) parent, "x",
+            (void *) node, "x",
+            label);
+}
+
 void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
     char color[16];
@@ -17432,7 +17483,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
                 (void *) node, color);
 
         if (strlen(node->name) > 0) {
-            fprintf(fp, "%s |", node->name);
+            fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
+        } else {
+            fprintf(fp, "(%s)|", ggml_type_name(node->type));
         }
 
         if (node->n_dims == 2) {
@@ -17441,7 +17494,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
         }
 
-
         if (node->grad) {
             fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
         } else {
@@ -17460,18 +17512,29 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
                 (void *) node, color);
 
         if (strlen(node->name) > 0) {
-            fprintf(fp, "%s | ", node->name);
+            fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
+        } else {
+            fprintf(fp, "(%s)|", ggml_type_name(node->type));
         }
-        if (ggml_nelements(node) == 1) {
-            if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
-                fprintf(fp, "%d", ggml_get_i32_1d(node, 0));
-            }
-            else {
-                fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0));
-            }
-        }
-        else {
-            fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
+
+        fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
+        if (ggml_nelements(node) < 5) {
+            fprintf(fp, " | (");
+            for (int j = 0; j < ggml_nelements(node); j++) {
+                if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
+                    fprintf(fp, "%d", ggml_get_i32_1d(node, j));
+                }
+                else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
+                    fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
+                }
+                else {
+                    fprintf(fp, "#");
+                }
+                if (j < ggml_nelements(node) - 1) {
+                    fprintf(fp, ", ");
+                }
+            }
+            fprintf(fp, ")");
         }
         fprintf(fp, "\"; ]\n");
     }
@@ -17479,30 +17542,20 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
     for (int i = 0; i < gb->n_nodes; i++) {
         struct ggml_tensor * node = gb->nodes[i];
 
-        struct ggml_tensor * parent = ggml_graph_get_parent(gb, node);
-
         if (node->src0) {
-            struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0);
-
-            fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n",
-                    parent0 ? (void *) parent0 : (void *) node->src0,
-                    parent0 ? "g" : "x",
-                    parent ? (void *) parent : (void *) node,
-                    parent ? "g" : "x",
-                    parent ? "empty" : "vee",
-                    parent ? "dashed" : "solid");
+            ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x");
         }
 
         if (node->src1) {
-            struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1);
-
-            fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n",
-                    parent1 ? (void *) parent1 : (void *) node->src1,
-                    parent1 ? "g" : "x",
-                    parent ? (void *) parent : (void *) node,
-                    parent ? "g" : "x",
-                    parent ? "empty" : "vee",
-                    parent ? "dashed" : "solid");
+            ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y");
+        }
+
+        for (int j = 0; j < GGML_MAX_OPT; j++) {
+            if (node->opt[j]) {
+                char label[16];
+                snprintf(label, sizeof(label), "opt %d", j);
+                ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label);
+            }
         }
     }
@@ -17510,15 +17563,19 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
         struct ggml_tensor * node = gb->leafs[i];
 
         if (node->src0) {
-            fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n",
-                    (void *) node->src0, "x",
-                    (void *) node, "x");
+            ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x");
         }
 
         if (node->src1) {
-            fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ label = \"y\"; ]\n",
-                    (void *) node->src1, "x",
-                    (void *) node, "x");
+            ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y");
+        }
+
+        for (int j = 0; j < GGML_MAX_OPT; j++) {
+            if (node->opt[j]) {
+                char label[16];
+                snprintf(label, sizeof(label), "opt %d", j);
+                ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label);
+            }
         }
     }

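The new ggml_format_name helper and the typed labels in ggml_graph_dump_dot make the Graphviz output easier to read; a small usage sketch (tensor shapes, names, and the output filename are arbitrary):

// Sketch: printf-style tensor naming plus a graph dump to Graphviz.
#include "ggml.h"

static void dump_small_graph(void) {
    struct ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    ggml_format_name(a, "a_%dx%d", 4, 4);                 // like ggml_set_name, but with formatting

    struct ggml_tensor * b = ggml_transpose(ctx, a);      // now auto-named "a_4x4 (transposed)"
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, ggml_cont(ctx, b));

    struct ggml_cgraph gf = ggml_build_forward(c);
    ggml_graph_dump_dot(&gf, NULL, "graph.dot");          // render with: dot -Tpng graph.dot -o graph.png

    ggml_free(ctx);
}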
ggml.h (1 line changed)
View file

@@ -563,6 +563,7 @@ extern "C" {
 
     GGML_API const char *         ggml_get_name(const struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
+    GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
 
     //
     // operations on tensors with backpropagation

View file

@@ -78,6 +78,7 @@ static std::vector<int> smartcontext;
 static std::vector<std::string> stop_sequence;
 static std::vector<llama_token_data> top_picks;
 static int remaining_tokens = 0;
+static int stopper_unused_tokens = 0;
 static std::string concat_output = "";
 
 inline bool IsNanCheck(float f)
@@ -759,6 +760,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 
 bool gpttype_generate_abort()
 {
+    stopper_unused_tokens = remaining_tokens;
     remaining_tokens = 0;
     return true;
 }
@@ -899,7 +901,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     current_context_tokens.resize(n_past);
 
     remaining_tokens = params.n_predict;
-    int stopper_unused_tokens = 0;
+    stopper_unused_tokens = 0;
     int input_consumed = 0;
     std::mt19937 rng(params.seed);
     concat_output = "";

File diff suppressed because one or more lines are too long

View file

@@ -225,7 +225,7 @@ maxhordectx = 1024
 maxhordelen = 256
 modelbusy = False
 defaultport = 5001
-KcppVersion = "1.32.3"
+KcppVersion = "1.33"
 showdebug = True
 
 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):

llama.cpp (177 lines changed)
View file

@@ -182,6 +182,19 @@ struct llama_kv_cache {
     }
 };
 
+struct llama_vocab {
+    using id    = int32_t;
+    using token = std::string;
+
+    struct token_score {
+        token tok;
+        float score;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_score> id_to_token;
+};
+
 struct llama_model {
     e_model type = MODEL_UNKNOWN;
@@ -198,10 +211,6 @@ struct llama_model {
     // context
     struct ggml_context * ctx = NULL;
 
-    // key + value cache for the self attention
-    // TODO: move to llama_state
-    struct llama_kv_cache kv_self;
-
     // the model memory buffer
     llama_ctx_buffer buf;
@@ -215,6 +224,11 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
+    int64_t t_load_us = 0;
+    int64_t t_start_us = 0;
+
+    llama_vocab vocab;
+
     ~llama_model() {
         if (ctx) {
             ggml_free(ctx);
@@ -233,24 +247,11 @@ struct llama_model {
     }
 };
 
-struct llama_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    struct token_score {
-        token tok;
-        float score;
-    };
-
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
-};
-
 struct llama_context {
+    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+
     std::mt19937 rng;
 
-    int64_t t_load_us = 0;
-    int64_t t_start_us = 0;
     bool has_evaluated_once = false;
 
     int64_t t_sample_us = 0;
@@ -261,8 +262,16 @@ struct llama_context {
     int32_t n_eval   = 0; // number of eval calls
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
 
-    llama_model model;
-    llama_vocab vocab;
+    const llama_model & model;
+    const llama_vocab & vocab;
+
+    bool model_owner = false;
+
+    int64_t t_load_us;
+    int64_t t_start_us;
+
+    // key + value cache for the self attention
+    struct llama_kv_cache kv_self;
 
     size_t mem_per_token = 0;
@@ -1033,7 +1042,8 @@ static const char *llama_model_type_name(e_model type) {
 
 static void llama_model_load_internal(
         const std::string & fname,
-        llama_context & lctx,
+        llama_model & model,
+        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1047,12 +1057,11 @@ static void llama_model_load_internal(
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
 
-    lctx.t_start_us = ggml_time_us();
+    model.t_start_us = ggml_time_us();
 
     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
 
-    lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
-    auto & model = lctx.model;
+    vocab = std::move(ml->file_loaders.at(0)->vocab);
     model.hparams = ml->file_loaders.at(0)->hparams;
     model.n_gpu_layers = n_gpu_layers;
     llama_file_version file_version = ml->file_loaders.at(0)->file_version;
@@ -1122,15 +1131,15 @@ static void llama_model_load_internal(
 
     // create the ggml context
     {
-        lctx.model.buf.resize(ctx_size);
+        model.buf.resize(ctx_size);
         if (use_mlock) {
-            lctx.model.mlock_buf.init(lctx.model.buf.addr);
-            lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
+            model.mlock_buf.init(model.buf.addr);
+            model.mlock_buf.grow_to(model.buf.size);
         }
 
         struct ggml_init_params params = {
-            /*.mem_size   =*/ lctx.model.buf.size,
-            /*.mem_buffer =*/ lctx.model.buf.addr,
+            /*.mem_size   =*/ model.buf.size,
+            /*.mem_buffer =*/ model.buf.addr,
             /*.no_alloc   =*/ ml->use_mmap,
         };
@@ -1311,7 +1320,7 @@ static void llama_model_load_internal(
     }
 #endif
 
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
 
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
@@ -1321,12 +1330,13 @@ static void llama_model_load_internal(
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
-    lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+    model.t_load_us = ggml_time_us() - model.t_start_us;
 }
 
 static bool llama_model_load(
         const std::string & fname,
-        llama_context & lctx,
+        llama_model & model,
+        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1340,7 +1350,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1378,7 +1388,7 @@ static bool llama_eval_internal(
 
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
 
-    const auto & kv_self = model.kv_self;
+    const auto & kv_self = lctx.kv_self;
 
     LLAMA_ASSERT(!!kv_self.ctx);
@@ -1726,7 +1736,7 @@ static bool llama_eval_internal(
     //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
 
     // update kv token count
-    lctx.model.kv_self.n = n_past + N;
+    lctx.kv_self.n = n_past + N;
 
     // extract logits
     {
@@ -2005,9 +2015,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
     for (size_t i = 0; i < candidates->size; ++i) {
         cum_sum += candidates->data[i].p;
 
-        // Check if the running sum is greater than p or if we have kept at least min_keep tokens
-        if (cum_sum > p && i >= min_keep) {
-            last_idx = i;
+        // Check if the running sum is at least p or if we have kept at least min_keep tokens
+        // we set the last index to i+1 to indicate that the current iterate should be included in the set
+        if (cum_sum >= p && i + 1 >= min_keep) {
+            last_idx = i + 1;
             break;
         }
     }
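The top-p change above keeps the token whose probability pushes the cumulative sum over the threshold and counts it toward min_keep; a standalone illustration of the corrected cutoff rule (the values and the loop are illustrative, not the library code):

// Illustration of the corrected top-p cutoff with made-up probabilities.
#include <cstdio>
#include <vector>

int main() {
    // Sorted probabilities; with p = 0.7 the first two tokens survive, because
    // 0.5 + 0.3 >= 0.7 and the token that crosses the threshold is kept.
    std::vector<float> probs = {0.5f, 0.3f, 0.15f, 0.05f};
    const float  p        = 0.7f;
    const size_t min_keep = 1;

    size_t last_idx = probs.size();
    float  cum_sum  = 0.0f;
    for (size_t i = 0; i < probs.size(); ++i) {
        cum_sum += probs[i];
        // last_idx = i + 1 so the element that reached the threshold is included.
        if (cum_sum >= p && i + 1 >= min_keep) {
            last_idx = i + 1;
            break;
        }
    }

    printf("kept %zu of %zu tokens\n", last_idx, probs.size());   // prints: kept 2 of 4 tokens
    return 0;
}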
@@ -2634,12 +2645,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 // interface implementation
 //
 
-struct llama_context * llama_init_from_file(
+struct llama_model * llama_load_model_from_file(
                              const char * path_model,
             struct llama_context_params   params) {
     ggml_time_init();
 
-    llama_context * ctx = new llama_context;
+    llama_model * model = new llama_model;
+
+    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+        delete model;
+        fprintf(stderr, "%s: failed to load model\n", __func__);
+        return nullptr;
+    }
+
+    return model;
+}
+
+void llama_free_model(struct llama_model * model) {
+    delete model;
+}
+
+struct llama_context * llama_new_context_with_model(
+                 struct llama_model * model,
+        struct llama_context_params   params) {
+
+    if (!model) {
+        return nullptr;
+    }
+
+    llama_context * ctx = new llama_context(*model, model->vocab);
 
     if (params.seed < 0) {
         params.seed = time(NULL);
@@ -2667,24 +2705,16 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
-                          params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-                          params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
-        fprintf(stderr, "%s: failed to load model\n", __func__);
-        llama_free(ctx);
-        return nullptr;
-    }
-
     // reserve memory for context buffers
     if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
+        if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
            llama_free(ctx);
            return nullptr;
        }
 
        {
-            const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
+            const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
            fprintf(stderr, "%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
        }
@@ -2737,7 +2767,7 @@ struct llama_context * llama_init_from_file(
 
        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
@@ -2748,7 +2778,23 @@ struct llama_context * llama_init_from_file(
     return ctx;
 }
 
+struct llama_context * llama_init_from_file(
+                             const char * path_model,
+            struct llama_context_params   params) {
+
+    struct llama_model * model = llama_load_model_from_file(path_model, params);
+    if (!model) {
+        return nullptr;
+    }
+    struct llama_context * ctx = llama_new_context_with_model(model, params);
+    ctx->model_owner = true;
+    return ctx;
+}
+
 void llama_free(struct llama_context * ctx) {
+    if (ctx->model_owner) {
+        delete &ctx->model;
+    }
     delete ctx;
 }
@@ -2765,11 +2811,9 @@ int llama_model_quantize(
     }
 }
 
-int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
     fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
-    auto & model = ctx->model;
-
     const int64_t t_start_lora_us = ggml_time_us();
 
     auto fin = std::ifstream(path_lora, std::ios::binary);
@@ -3012,7 +3056,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+        return 1;
+    }
+}
+
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
     } catch (const std::exception & err) {
         fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
@@ -3020,7 +3073,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
     }
 }
 
 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-    return ctx->model.kv_self.n;
+    return ctx->kv_self.n;
 }
 
 #define LLAMA_MAX_RNG_STATE (64*1024)
@@ -3045,7 +3098,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     const size_t s_embedding       = ctx->embedding.size() * sizeof(float);
     const size_t s_kv_size         = sizeof(size_t);
     const size_t s_kv_ntok         = sizeof(int);
-    const size_t s_kv              = ctx->model.kv_self.buf.size;
+    const size_t s_kv              = ctx->kv_self.buf.size;
     const size_t s_total = (
         + s_rng_size
@@ -3111,7 +3164,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
     // copy kv cache
     {
-        const auto & kv_self = ctx->model.kv_self;
+        const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
         const int    n_layer = hparams.n_layer;
         const int    n_embd  = hparams.n_embd;
@@ -3215,7 +3268,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
     // set kv cache
     {
-        const auto & kv_self = ctx->model.kv_self;
+        const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
         const int    n_layer = hparams.n_layer;
         const int    n_embd  = hparams.n_embd;
@@ -3259,7 +3312,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         ggml_free(cpy_ctx);
     }
 
-    ctx->model.kv_self.n = kv_ntok;
+    ctx->kv_self.n = kv_ntok;
 }
 
     const size_t nread = inp - src;
@@ -3506,6 +3559,6 @@ const char * llama_print_system_info(void) {
 }
 
 // For internal test use
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }

llama.h (35 lines changed)
View file

@@ -26,6 +26,14 @@
 #    define LLAMA_API
 #endif
 
+#ifdef __GNUC__
+#    define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+#    define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+#    define DEPRECATED(func, hint) func
+#endif
+
 #define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
 #define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
@@ -53,6 +61,7 @@ extern "C" {
     // TODO: show sample usage
     //
 
+    struct llama_model;
     struct llama_context;
 
     typedef int llama_token;
@@ -136,12 +145,23 @@ extern "C" {
 
     LLAMA_API int64_t llama_time_us();
 
+    LLAMA_API struct llama_model * llama_load_model_from_file(
+                             const char * path_model,
+            struct llama_context_params   params);
+
+    LLAMA_API void llama_free_model(struct llama_model * model);
+
+    LLAMA_API struct llama_context * llama_new_context_with_model(
+                     struct llama_model * model,
+            struct llama_context_params   params);
+
     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
-    LLAMA_API struct llama_context * llama_init_from_file(
+    LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
                              const char * path_model,
-            struct llama_context_params   params);
+            struct llama_context_params   params),
+            "please use llama_load_model_from_file combined with llama_new_context_with_model instead");
 
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
@@ -158,8 +178,15 @@ extern "C" {
     // The model needs to be reloaded before applying a new adapter, otherwise the adapter
     // will be applied on top of the previous one
     // Returns 0 on success
-    LLAMA_API int llama_apply_lora_from_file(
+    LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
             struct llama_context * ctx,
+                      const char * path_lora,
+                      const char * path_base_model,
+                             int   n_threads),
+            "please use llama_model_apply_lora_from_file instead");
+
+    LLAMA_API int llama_model_apply_lora_from_file(
+            const struct llama_model * model,
                       const char * path_lora,
                       const char * path_base_model,
                              int   n_threads);
@@ -310,7 +337,7 @@ extern "C" {
 #include <string>
 struct ggml_tensor;
 
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
 
 #endif
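Taken together, the header now steers embedders from the deprecated single-call entry points to the split model/context API; a hedged migration sketch, assuming an application that previously used llama_init_from_file and llama_apply_lora_from_file (paths and thread count are placeholders):

// Migration sketch based on the declarations above; it follows the order used by
// the common llama_init_from_gpt_params helper changed earlier in this commit:
// load model, create context, apply LoRA.
#include <cstdio>
#include "llama.h"

static int load_with_lora(const char * model_path, const char * lora_path) {
    llama_context_params params = llama_context_default_params();

    // Before: llama_context * ctx = llama_init_from_file(model_path, params);
    llama_model * model = llama_load_model_from_file(model_path, params);
    if (model == NULL) {
        fprintf(stderr, "failed to load '%s'\n", model_path);
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, params);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    // Before: llama_apply_lora_from_file(ctx, lora_path, NULL, n_threads);
    if (llama_model_apply_lora_from_file(model, lora_path, NULL, /*n_threads=*/4) != 0) {
        llama_free(ctx);
        llama_free_model(model);
        return 1;
    }

    // ... use ctx ...

    llama_free(ctx);          // does not free the model: model_owner is only set by llama_init_from_file
    llama_free_model(model);
    return 0;
}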