llama : sync gguf-llama.cpp with latest llama.cpp

This commit is contained in:
Georgi Gerganov 2023-08-14 13:44:37 +03:00
parent 62490f1380
commit f4a0e0ec5a
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
2 changed files with 469 additions and 219 deletions

File diff suppressed because it is too large Load diff

View file

@ -41,10 +41,6 @@
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif
#ifndef LLAMA_DEFAULT_RMS_EPS
#define LLAMA_DEFAULT_RMS_EPS 5e-6f
#endif
#ifdef __cplusplus
extern "C" {
#endif
@ -74,12 +70,23 @@ extern "C" {
typedef void (*llama_progress_callback)(float progress, void *ctx);
struct llama_context_params {
enum llama_log_level {
LLAMA_LOG_LEVEL_ERROR = 2,
LLAMA_LOG_LEVEL_WARN = 3,
LLAMA_LOG_LEVEL_INFO = 4
};
// Signature for logging events
// Note that text includes the new line character at the end for most events.
// If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
// if it exists.
// It might not exist for progress report where '.' is output repeatedly.
typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
struct llama_context_params {
uint32_t seed; // RNG seed, -1 for random
int32_t n_ctx; // text context
int32_t n_batch; // prompt processing batch size
int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
int32_t n_gpu_layers; // number of layers to store in VRAM
int32_t main_gpu; // the GPU that is used for scratch and small tensors
@ -96,6 +103,7 @@ extern "C" {
// Keep the booleans together to avoid misalignment during copy-by-value.
bool low_vram; // if true, reduce VRAM usage at the cost of performance
bool mul_mat_q; // if true, use experimental mul_mat_q kernels
bool f16_kv; // use fp16 for KV cache
bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool vocab_only; // only load the vocabulary, no weights
@ -182,6 +190,10 @@ extern "C" {
int32_t n_eval;
};
// Set callback for all future logging events.
// If this is not called, or NULL is supplied, everything is output on stderr.
LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
LLAMA_API int llama_max_devices();
LLAMA_API struct llama_context_params llama_context_default_params();