llama : save and restore kv cache for single seq id (#6341)
* llama : save and restore kv cache for single seq id
* remove trailing whitespace
* respond error in case there's no space in the kv cache
* add kv seq save restore to test case
* add --slot-save-path arg to enable save restore and restrict save location
* Returning 0 for some cases, instead of asserting.
* cleanup error cases
* rename sequence state functions
* rename state get set functions
* add previous function names back in with DEPRECATED notice
* update doc
* adjust endpoints to preferred style
* fix restoring zero cell count
* handle seq rm return value
* unused param
* keep in the size check
* fix return types
* add server test case for slot save restore
* cleanup
* add cake
* cleanup style
* add special
* removing a whole sequence never fails
* move sequence state file functionality from server to llama to match session api and add version tags
* catch exceptions on save as well
* error log messages
* check types for stricter restore
* update server doc
* readme : update API changes date
* strict filename validation
* move include, reject bom as well
* also reject empty filename
* reject whitespace and trailing dot
---------
Co-authored-by: Martin Evans <martindevans@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
87fb5b4234
commit
beea6e1b16
11 changed files with 1086 additions and 31 deletions
73
llama.h
73
llama.h
|
@ -37,10 +37,14 @@
|
|||
|
||||
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
|
||||
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
||||
#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
|
||||
|
||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
#define LLAMA_SESSION_VERSION 5
|
||||
|
||||
#define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
|
||||
#define LLAMA_STATE_SEQ_VERSION 1
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
@ -523,6 +527,7 @@ extern "C" {
|
|||
struct llama_context * ctx);
|
||||
|
||||
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
|
||||
// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails.
|
||||
// seq_id < 0 : match any sequence
|
||||
// p0 < 0 : [0, p1)
|
||||
// p1 < 0 : [p0, inf)
|
||||
|
@ -594,34 +599,92 @@ extern "C" {
|
|||
|
||||
// Returns the maximum size in bytes of the state (rng, logits, embedding
|
||||
// and kv_cache) - will often be smaller after compacting tokens
|
||||
LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
|
||||
LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
|
||||
LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
|
||||
"use llama_state_get_size instead");
|
||||
|
||||
// Copies the state to the specified destination address.
|
||||
// Destination needs to have allocated enough memory.
|
||||
// Returns the number of bytes copied
|
||||
LLAMA_API size_t llama_copy_state_data(
|
||||
LLAMA_API size_t llama_state_get_data(
|
||||
struct llama_context * ctx,
|
||||
uint8_t * dst);
|
||||
LLAMA_API DEPRECATED(size_t llama_copy_state_data(
|
||||
struct llama_context * ctx,
|
||||
uint8_t * dst),
|
||||
"use llama_state_get_data instead");
|
||||
|
||||
// Set the state reading from the specified address
|
||||
// Returns the number of bytes read
|
||||
LLAMA_API size_t llama_set_state_data(
|
||||
LLAMA_API size_t llama_state_set_data(
|
||||
struct llama_context * ctx,
|
||||
const uint8_t * src);
|
||||
LLAMA_API DEPRECATED(size_t llama_set_state_data(
|
||||
struct llama_context * ctx,
|
||||
const uint8_t * src),
|
||||
"use llama_state_set_data instead");
|
||||
|
||||
// Save/load session file
|
||||
LLAMA_API bool llama_load_session_file(
|
||||
LLAMA_API bool llama_state_load_file(
|
||||
struct llama_context * ctx,
|
||||
const char * path_session,
|
||||
llama_token * tokens_out,
|
||||
size_t n_token_capacity,
|
||||
size_t * n_token_count_out);
|
||||
LLAMA_API DEPRECATED(bool llama_load_session_file(
|
||||
struct llama_context * ctx,
|
||||
const char * path_session,
|
||||
llama_token * tokens_out,
|
||||
size_t n_token_capacity,
|
||||
size_t * n_token_count_out),
|
||||
"use llama_state_load_file instead");
|
||||
|
||||
LLAMA_API bool llama_save_session_file(
|
||||
LLAMA_API bool llama_state_save_file(
|
||||
struct llama_context * ctx,
|
||||
const char * path_session,
|
||||
const llama_token * tokens,
|
||||
size_t n_token_count);
|
||||
LLAMA_API DEPRECATED(bool llama_save_session_file(
|
||||
struct llama_context * ctx,
|
||||
const char * path_session,
|
||||
const llama_token * tokens,
|
||||
size_t n_token_count),
|
||||
"use llama_state_save_file instead");
|
||||
|
||||
// Get the exact size needed to copy the KV cache of a single sequence
|
||||
LLAMA_API size_t llama_state_seq_get_size(
|
||||
struct llama_context * ctx,
|
||||
llama_seq_id seq_id);
|
||||
|
||||
// Copy the KV cache of a single sequence into the specified buffer
|
||||
LLAMA_API size_t llama_state_seq_get_data(
|
||||
struct llama_context * ctx,
|
||||
uint8_t * dst,
|
||||
llama_seq_id seq_id);
|
||||
|
||||
// Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
|
||||
// Returns:
|
||||
// - Positive: Ok
|
||||
// - Zero: Failed to load
|
||||
LLAMA_API size_t llama_state_seq_set_data(
|
||||
struct llama_context * ctx,
|
||||
const uint8_t * src,
|
||||
llama_seq_id dest_seq_id);
|
||||
|
||||
LLAMA_API size_t llama_state_seq_save_file(
|
||||
struct llama_context * ctx,
|
||||
const char * filepath,
|
||||
llama_seq_id seq_id,
|
||||
const llama_token * tokens,
|
||||
size_t n_token_count);
|
||||
|
||||
LLAMA_API size_t llama_state_seq_load_file(
|
||||
struct llama_context * ctx,
|
||||
const char * filepath,
|
||||
llama_seq_id dest_seq_id,
|
||||
llama_token * tokens_out,
|
||||
size_t n_token_capacity,
|
||||
size_t * n_token_count_out);
|
||||
|
||||
//
|
||||
// Decoding
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue