llama, main: save state incrementally

Evan Jones 2023-05-03 02:09:19 -04:00
parent 799fdc1b5d
commit 3f30da38ad
4 changed files with 210 additions and 117 deletions


@@ -140,9 +140,12 @@ int main(int argc, char ** argv) {
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');
 
-    std::string path_session = params.path_session;
-    std::vector<llama_token> session_tokens;
+    // tokenize the prompt
+    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+
+    // restore prompt from saved session
+    const std::string path_session = params.path_session;
+    int n_matching_session_tokens = 0;
 
     if (!path_session.empty()) {
         fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
@@ -151,7 +154,7 @@ int main(int argc, char ** argv) {
         if (fp != NULL) {
             std::fclose(fp);
 
-            session_tokens.resize(params.n_ctx);
+            std::vector<llama_token> session_tokens(embd_inp.size());
             size_t n_token_count_out = 0;
             if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
                 fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
@@ -159,15 +162,28 @@ int main(int argc, char ** argv) {
             }
 
             session_tokens.resize(n_token_count_out);
-            fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
+
+            // find matching input prefix from saved session
+            for (llama_token id : session_tokens) {
+                if (n_matching_session_tokens >= (int) embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
+                    break;
+                }
+                n_matching_session_tokens++;
+            }
+            if (n_matching_session_tokens >= (int) embd_inp.size()) {
+                fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
+            } else if (n_matching_session_tokens < (int) (embd_inp.size() / 2)) {
+                fprintf(stderr, "%s: warning: session file has low similarity to prompt (%d / %zu tokens); will mostly be reevaluated\n",
+                    __func__, n_matching_session_tokens, embd_inp.size());
+            } else {
+                fprintf(stderr, "%s: session file matches %d / %zu tokens of prompt\n",
+                    __func__, n_matching_session_tokens, embd_inp.size());
+            }
         } else {
             fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
         }
     }
 
-    // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
-
     const int n_ctx = llama_n_ctx(ctx);
 
     if ((int) embd_inp.size() > n_ctx - 4) {
@@ -175,25 +191,6 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    // debug message about similarity of saved session, if applicable
-    size_t n_matching_session_tokens = 0;
-    if (session_tokens.size()) {
-        for (llama_token id : session_tokens) {
-            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
-                break;
-            }
-            n_matching_session_tokens++;
-        }
-        if (n_matching_session_tokens >= embd_inp.size()) {
-            fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
-        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
-            fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
-                __func__, n_matching_session_tokens, embd_inp.size());
-        } else {
-            fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
-                __func__, n_matching_session_tokens, embd_inp.size());
-        }
-    }
-
     // number of tokens to keep when resetting context
     if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
@@ -283,16 +280,11 @@ int main(int argc, char ** argv) {
     bool is_antiprompt = false;
     bool input_echo = true;
 
-    // HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
-    // if we loaded a session with at least 75% similarity. It's currently just used to speed up the
-    // initial prompt so it doesn't need to be an exact match.
-    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);
-
     int n_past             = 0;
     int n_remain           = params.n_predict;
     int n_consumed         = 0;
     int n_session_consumed = 0;
+    int n_session_write_past = 0;
 
     // the first thing we will do is to output the prompt, so set color accordingly
     set_console_color(con_st, CONSOLE_COLOR_PROMPT);
@@ -306,7 +298,8 @@ int main(int argc, char ** argv) {
             // if we run out of context:
             // - take the n_keep first tokens from the original prompt (via n_past)
             // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-            if (n_past + (int) embd.size() > n_ctx) {
+            bool needs_swap = n_past + (int) embd.size() > n_ctx;
+            if (needs_swap) {
                 const int n_left = n_past - params.n_keep;
 
                 n_past = params.n_keep;
@@ -314,9 +307,6 @@ int main(int argc, char ** argv) {
                 // insert n_left/2 tokens at the start of embd from last_n_tokens
                 embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
 
-                // stop saving session if we run out of context
-                path_session = "";
-
                 //printf("\n---\n");
                 //printf("resetting: '");
                 //for (int i = 0; i < (int) embd.size(); i++) {
@@ -326,27 +316,12 @@ int main(int argc, char ** argv) {
                 //printf("\n---\n");
             }
 
-            // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
-            // REVIEW
-            if (n_session_consumed < (int) session_tokens.size()) {
-                size_t i = 0;
-                for ( ; i < embd.size(); i++) {
-                    if (embd[i] != session_tokens[n_session_consumed]) {
-                        session_tokens.resize(n_session_consumed);
-                        break;
-                    }
-
-                    n_past++;
-                    n_session_consumed++;
-
-                    if (n_session_consumed >= (int) session_tokens.size()) {
-                        ++i;
-                        break;
-                    }
-                }
-                if (i > 0) {
-                    embd.erase(embd.begin(), embd.begin() + i);
-                }
+            // skip evaluation of tokens in the input prefix that matched session
+            if (n_session_consumed < n_matching_session_tokens) {
+                int n_skip = std::min((int) embd.size(), n_matching_session_tokens - n_session_consumed);
+                embd.erase(embd.begin(), embd.begin() + n_skip);
+                n_session_consumed += n_skip;
+                n_past += n_skip;
             }
 
             // evaluate tokens in batches
@@ -363,14 +338,42 @@ int main(int argc, char ** argv) {
                 n_past += n_eval;
             }
 
-            if (embd.size() > 0 && !path_session.empty()) {
-                session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
-                n_session_consumed = session_tokens.size();
+            // save session after context swap
+            if (!path_session.empty() && needs_swap) {
+                int n_tokens = n_past - params.n_keep;
+                if (!llama_append_session_file(
+                        ctx, path_session.c_str(), params.n_keep,
+                        last_n_tokens.data() + last_n_tokens.size() - n_tokens, n_tokens)) {
+                    fprintf(stderr, "%s: error: unable to write to session file '%s'\n",
+                        __func__, path_session.c_str());
+                    return 1;
+                }
+                n_session_write_past = n_past;
             }
         }
 
         embd.clear();
 
+        // save prompt evaluation state to session file
+        if (!path_session.empty() && !n_session_write_past && (int) embd_inp.size() <= n_consumed) {
+            if (!llama_init_session_file(ctx, path_session.c_str())) {
+                fprintf(stderr, "%s: error: unable to start session file '%s'\n",
+                    __func__, path_session.c_str());
+                return 1;
+            }
+            if (!llama_append_session_file(
+                    ctx, path_session.c_str(), 0,
+                    last_n_tokens.data() + last_n_tokens.size() - n_past, n_past)) {
+                fprintf(stderr, "%s: error: unable to write to session file '%s'\n",
+                    __func__, path_session.c_str());
+                return 1;
+            }
+            n_session_write_past = n_past;
+        }
+
         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
             // out of user input, sample next token
             const float temp = params.temp;
@@ -387,12 +390,6 @@ int main(int argc, char ** argv) {
             const float mirostat_eta = params.mirostat_eta;
             const bool penalize_nl = params.penalize_nl;
 
-            // optionally save the session on first sample (for faster prompt loading next time)
-            if (!path_session.empty() && need_to_save_session) {
-                need_to_save_session = false;
-                llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
-            }
-
             llama_token id = 0;
 
             {
@@ -608,6 +605,20 @@ int main(int argc, char ** argv) {
         }
     }
 
+    if (!path_session.empty()) {
+        int n_session_remain = n_past - n_session_write_past;
+        fprintf(stderr, "\n%s: saving remaining state (%d tokens) to session file '%s'",
+            __func__, n_session_remain, path_session.c_str());
+        if (!llama_append_session_file(
+                ctx, path_session.c_str(), n_session_write_past,
+                last_n_tokens.data() + last_n_tokens.size() - embd.size() - n_session_remain,
+                n_session_remain)) {
+            fprintf(stderr, "%s: error: unable to write to session file '%s'\n",
+                __func__, path_session.c_str());
+            return 1;
+        }
+    }
+
     llama_print_timings(ctx);
     llama_free(ctx);
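
Condensed, the flow these main.cpp changes implement is: start a session file once the prompt has been evaluated, then append only the newly evaluated tokens (and the corresponding KV-cache slice) on each later write. The sketch below is illustrative rather than part of the patch; it assumes an already-initialized llama_context named ctx and a vector named tokens holding everything evaluated so far, and it uses only the session functions introduced by this commit.

    #include <string>
    #include <vector>

    #include "llama.h"

    // Minimal sketch of the incremental save flow (assumptions: ctx is an initialized
    // llama_context, tokens holds every token evaluated so far, n_written tracks how
    // many of them have already been written to the session file).
    static bool save_session_incrementally(llama_context * ctx,
                                           const std::string & path,
                                           const std::vector<llama_token> & tokens,
                                           int & n_written) {
        if (n_written == 0) {
            // first write: create/truncate the session file (magic, version, hparams)
            if (!llama_init_session_file(ctx, path.c_str())) {
                return false;
            }
        }
        const int n_new = (int) tokens.size() - n_written;
        if (n_new > 0) {
            // append only the delta: the new tokens plus the state starting at offset n_written
            if (!llama_append_session_file(ctx, path.c_str(), n_written,
                                           tokens.data() + n_written, (size_t) n_new)) {
                return false;
            }
            n_written = (int) tokens.size();
        }
        return true;
    }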


@@ -58,7 +58,7 @@ int main(int argc, char ** argv) {
     // Save state (rng, logits, embedding and kv_cache) to file
     {
         FILE *fp_write = fopen("dump_state.bin", "wb");
-        llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
+        llama_copy_state_data(ctx, state_mem, 0); // could also copy directly to memory mapped file
         fwrite(state_mem, 1, state_size, fp_write);
         fclose(fp_write);
     }

llama.cpp

@@ -2436,7 +2436,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
 }
 
 // Copies the state to the specified destination address
-size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest, int n_token_offset) {
     uint8_t * out = dest;
 
     // copy rng
@@ -2494,29 +2494,36 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
         memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
         memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+        memcpy(out, &n_token_offset, sizeof(n_token_offset)); out += sizeof(n_token_offset);
 
-        if (kv_size) {
+        LLAMA_ASSERT(n_token_offset <= kv_ntok);
+        if (kv_size && n_token_offset < kv_ntok) {
+            const int n_tokens = kv_ntok - n_token_offset;
             const size_t elt_size = ggml_element_size(kv_self.k);
             char buffer[4096];
             ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
             ggml_cgraph gf{};
             gf.n_threads = 1;
 
-            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, n_tokens, n_layer);
             kout3d->data = out;
             out += ggml_nbytes(kout3d);
 
-            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, n_tokens, n_embd, n_layer);
             vout3d->data = out;
             out += ggml_nbytes(vout3d);
 
             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                n_embd, kv_ntok, n_layer,
-                elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+                n_embd, n_tokens, n_layer,
+                elt_size*n_embd, elt_size*n_embd*n_ctx,
+                elt_size*n_embd*n_token_offset);
 
             ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-                kv_ntok, n_embd, n_layer,
-                elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+                n_tokens, n_embd, n_layer,
+                elt_size*n_ctx, elt_size*n_ctx*n_embd,
+                elt_size*n_token_offset);
 
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
@@ -2593,34 +2600,42 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
         size_t kv_size;
         int kv_ntok;
+        int n_token_offset;
 
         memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
         memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+        memcpy(&n_token_offset, in, sizeof(n_token_offset)); in += sizeof(n_token_offset);
 
-        if (kv_size) {
+        LLAMA_ASSERT(n_token_offset <= kv_ntok);
+        if (kv_size && n_token_offset < kv_ntok) {
             LLAMA_ASSERT(kv_self.buf.size == kv_size);
+            const int n_tokens = kv_ntok - n_token_offset;
 
             const size_t elt_size = ggml_element_size(kv_self.k);
             char buffer[4096];
             ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
             ggml_cgraph gf{};
             gf.n_threads = 1;
 
-            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, n_tokens, n_layer);
             kin3d->data = (void *) in;
             in += ggml_nbytes(kin3d);
 
-            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, n_tokens, n_embd, n_layer);
             vin3d->data = (void *) in;
             in += ggml_nbytes(vin3d);
 
             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                n_embd, kv_ntok, n_layer,
-                elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+                n_embd, n_tokens, n_layer,
+                elt_size*n_embd, elt_size*n_embd*n_ctx,
+                elt_size*n_embd*n_token_offset);
 
             ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-                kv_ntok, n_embd, n_layer,
-                elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+                n_tokens, n_embd, n_layer,
+                elt_size*n_ctx, elt_size*n_ctx*n_embd,
+                elt_size*n_token_offset);
 
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
@@ -2638,7 +2653,12 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
     return nread;
 }
 
-bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+bool llama_load_session_file(
+        struct llama_context * ctx,
+        const char * path_session,
+        llama_token * tokens_out,
+        size_t n_token_capacity,
+        size_t * n_token_count_out) {
     llama_file file(path_session, "rb");
 
     // sanity checks
@@ -2660,39 +2680,70 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
         }
     }
 
-    // load the prompt
-    {
-        const uint32_t n_token_count = file.read_u32();
-        if (n_token_count > n_token_capacity) {
-            fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
-            return false;
-        }
-
-        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
-        *n_token_count_out = n_token_count;
-    }
-
-    // restore the context state
-    {
-        const size_t n_state_size_cur = file.size - file.tell();
-        const size_t n_state_size_max = llama_get_state_size(ctx);
-
-        if (n_state_size_cur > n_state_size_max) {
-            fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
-            return false;
-        }
-
-        std::vector<uint8_t> state_data(n_state_size_max);
-        file.read_raw(state_data.data(), n_state_size_cur);
-
-        llama_set_state_data(ctx, state_data.data());
-    }
+    const size_t n_state_size_max = llama_get_state_size(ctx);
+    size_t n_token_count = 0;
+
+    std::vector<uint8_t> state_data(n_state_size_max);
+
+    // read N segments of (tokens + state), until end or tokens_out filled
+    while (file.size > file.tell()) {
+        // load the prompt/tokens
+        const uint32_t n_token_segment = file.read_u32();
+        const size_t n_token_read =
+            std::min((size_t) n_token_segment, n_token_capacity - n_token_count);
+        file.read_raw(tokens_out + n_token_count, sizeof(llama_token) * n_token_read);
+        n_token_count += n_token_read;
+        if (n_token_segment > n_token_read) {
+            const size_t n_token_extra = n_token_segment - n_token_read;
+            file.seek(sizeof(llama_token) * n_token_extra, SEEK_CUR);
+        }
+        LLAMA_ASSERT(n_token_count <= n_token_capacity);
+
+        // restore the context state
+        {
+            size_t n_state_size_cur;
+            file.read_raw(&n_state_size_cur, sizeof(n_state_size_cur));
+
+            if (n_state_size_cur > n_state_size_max) {
+                fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
+                return false;
+            }
+
+            file.read_raw(state_data.data(), n_state_size_cur);
+            llama_set_state_data(ctx, state_data.data());
+        }
+
+        if (n_token_count == n_token_capacity) {
+            // the logits for this segment apply to the last token; if we didn't read a full
+            // segment, move back one token to force an eval to get accurate logits
+            if (n_token_read < n_token_segment) {
+                n_token_count--;
+            }
+            break;
+        }
+    }
+
+    *n_token_count_out = n_token_count;
 
     return true;
 }
 
-bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+bool llama_save_session_file(
+        struct llama_context * ctx,
+        const char * path_session,
+        const llama_token * tokens,
+        size_t n_token_count) {
+    return (
+        llama_init_session_file(ctx, path_session) &&
+        llama_append_session_file(ctx, path_session, 0, tokens, n_token_count)
+    );
+}
+
+bool llama_init_session_file(struct llama_context * ctx, const char * path_session) {
     llama_file file(path_session, "wb");
 
     file.write_u32(LLAMA_SESSION_MAGIC);
@@ -2700,6 +2751,17 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
     file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
 
+    return true;
+}
+
+bool llama_append_session_file(
+        struct llama_context * ctx,
+        const char * path_session,
+        int n_token_offset,
+        const llama_token * tokens,
+        size_t n_token_count) {
+    llama_file file(path_session, "ab");
+
     // save the prompt
     file.write_u32((uint32_t) n_token_count);
     file.write_raw(tokens, sizeof(llama_token) * n_token_count);
@@ -2709,8 +2771,9 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
         const size_t n_state_size_max = llama_get_state_size(ctx);
         std::vector<uint8_t> state_data(n_state_size_max);
-        const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
+        const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data(), n_token_offset);
 
+        file.write_raw(&n_state_size_cur, sizeof(n_state_size_cur));
         file.write_raw(state_data.data(), n_state_size_cur);
     }
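
Taken together, llama_init_session_file and llama_append_session_file above imply the following on-disk layout for a version-2 session file. This summary is inferred from the write calls in this diff (the header fields follow the existing save path), not a separate specification:

    uint32        magic       LLAMA_SESSION_MAGIC ('ggsn')
    uint32        version     LLAMA_SESSION_VERSION (2)
    bytes         hparams     raw llama_hparams of the model that wrote the file
    then, once per llama_append_session_file call:
        uint32        n_token_count            tokens appended in this segment
        llama_token   tokens[n_token_count]
        size_t        n_state_size_cur         size of the state blob that follows
        uint8_t       state[n_state_size_cur]  rng/logits/embedding plus the KV-cache
                                               rows [n_token_offset, kv_ntok)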

llama.h

@@ -23,7 +23,7 @@
 #define LLAMA_FILE_MAGIC 'ggjt'
 #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
 #define LLAMA_SESSION_MAGIC 'ggsn'
-#define LLAMA_SESSION_VERSION 1
+#define LLAMA_SESSION_VERSION 2
 
 #ifdef __cplusplus
 extern "C" {
@@ -134,15 +134,34 @@ extern "C" {
     // Copies the state to the specified destination address.
     // Destination needs to have allocated enough memory.
     // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest, int n_token_offset);
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
     LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
 
     // Save/load session file
-    LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
-    LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+    LLAMA_API bool llama_load_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            llama_token * tokens_out,
+            size_t n_token_capacity,
+            size_t * n_token_count_out);
+
+    LLAMA_API bool llama_save_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            const llama_token * tokens,
+            size_t n_token_count);
+
+    LLAMA_API bool llama_init_session_file(struct llama_context * ctx, const char * path_session);
+
+    LLAMA_API bool llama_append_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            int n_token_offset,
+            const llama_token * tokens,
+            size_t n_token_count);
 
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
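
For completeness, here is a hedged sketch of the consumer side, mirroring the prefix-matching logic added to main.cpp above. It assumes an initialized context ctx and an already-tokenized prompt, and relies only on llama_load_session_file as declared above; the returned count tells the caller how many leading prompt tokens are already covered by the restored KV cache, so evaluation can start at that offset.

    #include <cstdio>
    #include <vector>

    #include "llama.h"

    // Sketch: restore a session and report how many leading prompt tokens it already covers
    // (assumptions: ctx is initialized, prompt_tokens holds the tokenized prompt).
    static size_t load_session_prefix(llama_context * ctx,
                                      const char * path,
                                      const std::vector<llama_token> & prompt_tokens) {
        std::vector<llama_token> session_tokens(prompt_tokens.size());
        size_t n_loaded = 0;
        if (!llama_load_session_file(ctx, path,
                                     session_tokens.data(), session_tokens.size(), &n_loaded)) {
            fprintf(stderr, "failed to load session '%s'\n", path);
            return 0;
        }
        session_tokens.resize(n_loaded);

        // count how many leading tokens of the prompt match the restored session
        size_t n_match = 0;
        while (n_match < session_tokens.size() &&
               n_match < prompt_tokens.size() &&
               session_tokens[n_match] == prompt_tokens[n_match]) {
            n_match++;
        }
        // caller starts evaluation at prompt_tokens[n_match] with n_past = n_match
        return n_match;
    }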