llama : fix session saving/loading (#3400)

* llama : fix session saving/loading

* llama : temp fix for clearing "future" tokens from the KV cache

* llama : fix handling of "future" tokens when loading sessions

* llama : fix comments for llama_kv_cache API
This commit is contained in:
Georgi Gerganov 2023-10-03 21:04:01 +03:00 committed by GitHub
parent 48be797ffb
commit ac2219fef3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 106 additions and 59 deletions

10
llama.h
View file

@ -42,7 +42,7 @@
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 1
#define LLAMA_SESSION_VERSION 2
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
@ -333,12 +333,16 @@ extern "C" {
"avoid using this, it will be removed in the future, instead - count the tokens in user code");
// Remove all tokens data of cells in [c0, c1)
// c0 < 0 : [0, c1]
// c1 < 0 : [c0, inf)
LLAMA_API void llama_kv_cache_tokens_rm(
struct llama_context * ctx,
int32_t c0,
int32_t c1);
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
LLAMA_API void llama_kv_cache_seq_rm(
struct llama_context * ctx,
llama_seq_id seq_id,
@ -347,6 +351,8 @@ extern "C" {
// Copy all tokens that belong to the specified sequence to another sequence
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
LLAMA_API void llama_kv_cache_seq_cp(
struct llama_context * ctx,
llama_seq_id seq_id_src,
@ -361,6 +367,8 @@ extern "C" {
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
// If the KV cache is RoPEd, the KV data is updated accordingly
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
LLAMA_API void llama_kv_cache_seq_shift(
struct llama_context * ctx,
llama_seq_id seq_id,