Merge branch 'master' into compilade/refactor-kv-cache

This commit is contained in:
Francis Couture-Harpin 2024-08-31 21:06:32 -04:00
commit bc320ef66d
395 changed files with 57725 additions and 169970 deletions

View file

@ -33,17 +33,15 @@
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
#define LLAMA_MAX_RNG_STATE (64*1024)
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 6
#define LLAMA_SESSION_VERSION 8
#define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
#define LLAMA_STATE_SEQ_VERSION 1
#define LLAMA_STATE_SEQ_VERSION 2
#ifdef __cplusplus
extern "C" {
@ -88,17 +86,22 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
LLAMA_VOCAB_PRE_TYPE_VIKING = 16,
LLAMA_VOCAB_PRE_TYPE_JAIS = 17,
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
};
// note: these values should be synchronized with ggml_rope
// TODO: maybe move this enum to ggml.h (ggml_rope_type)
enum llama_rope_type {
LLAMA_ROPE_TYPE_NONE = -1,
LLAMA_ROPE_TYPE_NORM = 0,
LLAMA_ROPE_TYPE_NEOX = 2,
LLAMA_ROPE_TYPE_GLM = 4,
LLAMA_ROPE_TYPE_NORM = 0,
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
};
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@ -131,7 +134,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
// LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
@ -160,6 +163,9 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};
@ -180,6 +186,12 @@ extern "C" {
LLAMA_POOLING_TYPE_LAST = 3,
};
enum llama_attention_type {
LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
LLAMA_ATTENTION_TYPE_CAUSAL = 0,
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
};
enum llama_split_mode {
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
@ -255,9 +267,9 @@ extern "C" {
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
// main_gpu interpretation depends on split_mode:
// LLAMA_SPLIT_NONE: the GPU that is used for the entire model
// LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
// LLAMA_SPLIT_LAYER: ignored
// LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
// LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
// LLAMA_SPLIT_MODE_LAYER: ignored
int32_t main_gpu;
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
@ -292,11 +304,12 @@ extern "C" {
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
uint32_t n_ubatch; // physical maximum batch size
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
int32_t n_threads; // number of threads to use for generation
int32_t n_threads_batch; // number of threads to use for batch processing
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
enum llama_attention_type attention_type; // attention type to use for embeddings
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model
@ -332,7 +345,7 @@ extern "C" {
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // itoken embeddings tensor type
enum ggml_type token_embedding_type; // token embeddings tensor type
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
@ -399,6 +412,9 @@ extern "C" {
const char * content;
} llama_chat_message;
// lora adapter
struct llama_lora_adapter;
// Helpers for getting default parameters
LLAMA_API struct llama_model_params llama_model_default_params(void);
LLAMA_API struct llama_context_params llama_context_default_params(void);
@ -412,6 +428,13 @@ extern "C" {
//optional:
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
// Optional: an auto threadpool gets created in ggml if not passed explicitly
LLAMA_API void llama_attach_threadpool(
struct llama_context * ctx,
ggml_threadpool_t threadpool,
ggml_threadpool_t threadpool_batch);
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
// Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free(void);
@ -488,28 +511,48 @@ extern "C" {
// Returns true if the model contains an encoder that requires llama_encode() call
LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
// Returns true if the model contains a decoder that requires llama_decode() call
LLAMA_API bool llama_model_has_decoder(const struct llama_model * model);
// For encoder-decoder models, this function returns id of the token that must be provided
// to the decoder to start generating output sequence. For other models, it returns -1.
LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
// Returns true if the model is recurrent (like Mamba, RWKV, etc.)
LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
// Returns 0 on success
LLAMA_API uint32_t llama_model_quantize(
const char * fname_inp,
const char * fname_out,
const llama_model_quantize_params * params);
// Apply a LoRA adapter to a loaded model
// path_base_model is the path to a higher quality model to use as a base for
// the layers modified by the adapter. Can be NULL to use the current loaded model.
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
// will be applied on top of the previous one
// Returns 0 on success
LLAMA_API int32_t llama_model_apply_lora_from_file(
const struct llama_model * model,
const char * path_lora,
float scale,
const char * path_base_model,
int32_t n_threads);
// Load a LoRA adapter from file
// The loaded adapter will be associated to the given model, and will be free when the model is deleted
LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
struct llama_model * model,
const char * path_lora);
// Add a loaded LoRA adapter to given context
// This will not modify model's weight
LLAMA_API int32_t llama_lora_adapter_set(
struct llama_context * ctx,
struct llama_lora_adapter * adapter,
float scale);
// Remove a specific LoRA adapter from given context
// Return -1 if the adapter is not present in the context
LLAMA_API int32_t llama_lora_adapter_remove(
struct llama_context * ctx,
struct llama_lora_adapter * adapter);
// Remove all LoRA adapters from given context
LLAMA_API void llama_lora_adapter_clear(
struct llama_context * ctx);
// Manually free a LoRA adapter
// Note: loaded adapters will be free when the associated model is deleted
LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
// Apply a loaded control vector to a llama_context, or if data is NULL, clear
// the currently loaded vector.
@ -709,10 +752,11 @@ extern "C" {
// State / sessions
//
// Returns the maximum size in bytes of the state (rng, logits, embedding
// and kv_cache) - will often be smaller after compacting tokens
LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
// Returns the *actual* size in bytes of the state
// (rng, logits, embedding and kv_cache)
// Only use when saving the state, not when restoring it, otherwise the size may be too small.
LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
"use llama_state_get_size instead");
// Copies the state to the specified destination address.
@ -720,7 +764,8 @@ extern "C" {
// Returns the number of bytes copied
LLAMA_API size_t llama_state_get_data(
struct llama_context * ctx,
uint8_t * dst);
uint8_t * dst,
size_t size);
LLAMA_API DEPRECATED(size_t llama_copy_state_data(
struct llama_context * ctx,
uint8_t * dst),
@ -730,7 +775,8 @@ extern "C" {
// Returns the number of bytes read
LLAMA_API size_t llama_state_set_data(
struct llama_context * ctx,
const uint8_t * src);
const uint8_t * src,
size_t size);
LLAMA_API DEPRECATED(size_t llama_set_state_data(
struct llama_context * ctx,
const uint8_t * src),
@ -772,6 +818,7 @@ extern "C" {
LLAMA_API size_t llama_state_seq_get_data(
struct llama_context * ctx,
uint8_t * dst,
size_t size,
llama_seq_id seq_id);
// Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
@ -781,6 +828,7 @@ extern "C" {
LLAMA_API size_t llama_state_seq_set_data(
struct llama_context * ctx,
const uint8_t * src,
size_t size,
llama_seq_id dest_seq_id);
LLAMA_API size_t llama_state_seq_save_file(
@ -846,13 +894,13 @@ extern "C" {
// Set the number of threads used for decoding
// n_threads is the number of threads used for generation (single token)
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch);
// Get the number of threads used for generation of a single token.
LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
LLAMA_API int32_t llama_n_threads(struct llama_context * ctx);
// Get the number of threads used for prompt and batch processing (multiple token).
LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
// Set whether the model is in embeddings mode or not
// If true, embeddings will be returned but logits will not
@ -927,11 +975,8 @@ extern "C" {
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
// Returns -1 if unknown, 1 for true or 0 for false.
LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
// Returns -1 if unknown, 1 for true or 0 for false.
LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
// Codellama infill tokens
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
@ -947,6 +992,7 @@ extern "C" {
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
/// @return Returns the number of tokens on success, no more than n_tokens_max
/// @return Returns a negative number on failure - the number of tokens that would have been returned
/// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
/// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
/// as plaintext. Does not insert a leading space.
LLAMA_API int32_t llama_tokenize(
@ -961,15 +1007,35 @@ extern "C" {
// Token Id -> Piece.
// Uses the vocabulary in the provided context.
// Does not write null terminator to the buffer.
// User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
// User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
// @param special If true, special tokens are rendered in the output.
LLAMA_API int32_t llama_token_to_piece(
const struct llama_model * model,
llama_token token,
char * buf,
int32_t length,
int32_t lstrip,
bool special);
/// @details Convert the provided tokens into text (inverse of llama_tokenize()).
/// @param text The char pointer must be large enough to hold the resulting text.
/// @return Returns the number of chars/bytes on success, no more than text_len_max.
/// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
/// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
/// @param unparse_special If true, special tokens are rendered in the output.
LLAMA_API int32_t llama_detokenize(
const struct llama_model * model,
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special);
//
// Chat templates
//
/// Apply chat template. Inspired by hf apply_chat_template() on python.
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
/// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
@ -1008,6 +1074,23 @@ extern "C" {
LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
/// @details Apply constraints from grammar
LLAMA_API void llama_grammar_sample(
const struct llama_grammar * grammar,
const struct llama_context * ctx,
llama_token_data_array * candidates);
LLAMA_API DEPRECATED(void llama_sample_grammar(
struct llama_context * ctx,
llama_token_data_array * candidates,
const struct llama_grammar * grammar),
"use llama_grammar_sample instead");
/// @details Accepts the sampled token into the grammar
LLAMA_API void llama_grammar_accept_token(
struct llama_grammar * grammar,
struct llama_context * ctx,
llama_token token);
//
// Sampling functions
//
@ -1089,12 +1172,6 @@ extern "C" {
llama_token_data_array * candidates,
float temp);
/// @details Apply constraints from grammar
LLAMA_API void llama_sample_grammar(
struct llama_context * ctx,
llama_token_data_array * candidates,
const struct llama_grammar * grammar);
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@ -1132,12 +1209,6 @@ extern "C" {
struct llama_context * ctx,
llama_token_data_array * candidates);
/// @details Accepts the sampled token into the grammar
LLAMA_API void llama_grammar_accept_token(
struct llama_context * ctx,
struct llama_grammar * grammar,
llama_token token);
//
// Model split
//
@ -1180,38 +1251,45 @@ extern "C" {
struct ggml_tensor;
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
);
struct llama_partial_utf8 {
uint32_t value; // bit value so far (unshifted)
int n_remain; // num bytes remaining; -1 indicates invalid sequence
};
struct llama_grammar {
const std::vector<std::vector<llama_grammar_element>> rules;
std::vector<std::vector<const llama_grammar_element *>> stacks;
// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8;
};
struct llama_grammar_candidate {
size_t index;
const uint32_t * code_points;
llama_partial_utf8 partial_utf8;
};
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
);
using llama_grammar_rule = std::vector< llama_grammar_element>;
using llama_grammar_stack = std::vector<const llama_grammar_element *>;
using llama_grammar_rules = std::vector<llama_grammar_rule>;
using llama_grammar_stacks = std::vector<llama_grammar_stack>;
using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar * grammar);
llama_grammar_stacks & llama_grammar_get_stacks( struct llama_grammar * grammar);
void llama_grammar_accept(
const std::vector<std::vector<llama_grammar_element>> & rules,
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
const uint32_t chr,
std::vector<std::vector<const llama_grammar_element *>> & new_stacks);
const llama_grammar_rules & rules,
const llama_grammar_stacks & stacks,
const uint32_t chr,
llama_grammar_stacks & new_stacks);
std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
const llama_grammar_rules & rules,
const llama_grammar_stack & stack,
const llama_grammar_candidates & candidates);
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
const std::string & src,
llama_partial_utf8 partial_start);
llama_partial_utf8 partial_start);
// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.