Merge branch 'master' into compilade/refactor-kv-cache
This commit is contained in:
commit
c460ff1a1c
172 changed files with 11845 additions and 11211 deletions
61
llama.h
61
llama.h
|
@ -69,6 +69,18 @@ extern "C" {
|
|||
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
|
||||
};
|
||||
|
||||
// pre-tokenization types
|
||||
enum llama_vocab_pre_type {
|
||||
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
||||
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
||||
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
||||
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
||||
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
||||
};
|
||||
|
||||
// note: these values should be synchronized with ggml_rope
|
||||
// TODO: maybe move this enum to ggml.h (ggml_rope_type)
|
||||
enum llama_rope_type {
|
||||
|
@ -195,15 +207,19 @@ extern "C" {
|
|||
LLAMA_KV_OVERRIDE_TYPE_INT,
|
||||
LLAMA_KV_OVERRIDE_TYPE_FLOAT,
|
||||
LLAMA_KV_OVERRIDE_TYPE_BOOL,
|
||||
LLAMA_KV_OVERRIDE_TYPE_STR,
|
||||
};
|
||||
|
||||
struct llama_model_kv_override {
|
||||
char key[128];
|
||||
enum llama_model_kv_override_type tag;
|
||||
|
||||
char key[128];
|
||||
|
||||
union {
|
||||
int64_t int_value;
|
||||
double float_value;
|
||||
bool bool_value;
|
||||
int64_t val_i64;
|
||||
double val_f64;
|
||||
bool val_bool;
|
||||
char val_str[128];
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -232,9 +248,10 @@ extern "C" {
|
|||
const struct llama_model_kv_override * kv_overrides;
|
||||
|
||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||
bool vocab_only; // only load the vocabulary, no weights
|
||||
bool use_mmap; // use mmap if possible
|
||||
bool use_mlock; // force system to keep model in RAM
|
||||
bool vocab_only; // only load the vocabulary, no weights
|
||||
bool use_mmap; // use mmap if possible
|
||||
bool use_mlock; // force system to keep model in RAM
|
||||
bool check_tensors; // validate model tensor data
|
||||
};
|
||||
|
||||
struct llama_context_params {
|
||||
|
@ -288,6 +305,7 @@ extern "C" {
|
|||
bool quantize_output_tensor; // quantize output.weight
|
||||
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||
bool pure; // quantize all tensors to the default type
|
||||
bool keep_split; // quantize to the same number of shards
|
||||
void * imatrix; // pointer to importance matrix data
|
||||
void * kv_overrides; // pointer to vector containing overrides
|
||||
} llama_model_quantize_params;
|
||||
|
@ -390,8 +408,10 @@ extern "C" {
|
|||
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
|
||||
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
|
||||
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
|
||||
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
|
||||
|
||||
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
|
||||
|
@ -833,6 +853,9 @@ extern "C" {
|
|||
|
||||
LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
|
||||
|
||||
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
|
||||
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
|
||||
|
||||
// Special tokens
|
||||
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
|
||||
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
|
||||
|
@ -846,7 +869,7 @@ extern "C" {
|
|||
// Returns -1 if unknown, 1 for true or 0 for false.
|
||||
LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
|
||||
|
||||
// codellama infill tokens
|
||||
// Codellama infill tokens
|
||||
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
|
||||
LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
|
||||
LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
|
||||
|
@ -875,11 +898,13 @@ extern "C" {
|
|||
// Uses the vocabulary in the provided context.
|
||||
// Does not write null terminator to the buffer.
|
||||
// User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
|
||||
// @param special If true, special tokens are rendered in the output.
|
||||
LLAMA_API int32_t llama_token_to_piece(
|
||||
const struct llama_model * model,
|
||||
llama_token token,
|
||||
char * buf,
|
||||
int32_t length);
|
||||
int32_t length,
|
||||
bool special);
|
||||
|
||||
/// Apply chat template. Inspired by hf apply_chat_template() on python.
|
||||
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
|
||||
|
@ -1032,7 +1057,7 @@ extern "C" {
|
|||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates);
|
||||
|
||||
/// @details Randomly selects a token from the candidates based on their probabilities.
|
||||
/// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
|
||||
LLAMA_API llama_token llama_sample_token(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates);
|
||||
|
@ -1119,8 +1144,9 @@ extern "C" {
|
|||
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
|
||||
#ifdef LLAMA_API_INTERNAL
|
||||
|
||||
#include <vector>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
struct ggml_tensor;
|
||||
|
||||
|
@ -1147,15 +1173,20 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
|
|||
struct llama_context * ctx
|
||||
);
|
||||
|
||||
std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
|
||||
void llama_grammar_accept(
|
||||
const std::vector<std::vector<llama_grammar_element>> & rules,
|
||||
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
||||
const uint32_t chr);
|
||||
const uint32_t chr,
|
||||
std::vector<std::vector<const llama_grammar_element *>> & new_stacks);
|
||||
|
||||
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
||||
const std::string & src,
|
||||
llama_partial_utf8 partial_start);
|
||||
|
||||
// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
|
||||
// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
|
||||
llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
|
||||
|
||||
#endif // LLAMA_API_INTERNAL
|
||||
|
||||
#endif // LLAMA_H
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue