llama : fix C compatibility

Cebtenzzre 2023-09-20 23:29:08 -04:00
parent 56abb9a406
commit 43eaf06a2f
3 changed files with 17 additions and 19 deletions
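The fix is about keeping llama.h consumable from plain C: `enum llama_rope_scaling_type : int8_t` declares a fixed underlying type, which is C++11 syntax (C only gained it with C23), so the public enum becomes a plain enum and the value is carried in an `int8_t` field instead. Below is a minimal, self-contained sketch of that pattern; the identifiers are abbreviated stand-ins, not the library's real ones.

#include <stdint.h>
#include <stdio.h>

/* A plain enum is valid in both C and C++; the old `enum foo : int8_t { ... }`
   spelling (fixed underlying type) is what a C compiler rejects. */
enum rope_scaling {
    ROPE_SCALING_UNSPECIFIED = -1,
    ROPE_SCALING_NONE        = 0,
    ROPE_SCALING_LINEAR      = 1,
};

/* The public struct carries the value as an int8_t, as the diff does,
   and code assigns the enum constants into it. */
struct ctx_params {
    int8_t rope_scaling_type;
};

int main(void) {
    struct ctx_params p;
    p.rope_scaling_type = ROPE_SCALING_UNSPECIFIED;

    /* resolve "unspecified" to a concrete default, mirroring the llama.cpp hunk */
    if (p.rope_scaling_type == ROPE_SCALING_UNSPECIFIED) {
        p.rope_scaling_type = (int8_t) ROPE_SCALING_LINEAR;
    }

    printf("rope_scaling_type = %d\n", p.rope_scaling_type); /* prints 1 */
    return 0;
}

Spelling out `int8_t` also pins the field's width, whereas the size of a plain enum is implementation-defined, so the struct layout stays the same for C and C++ consumers.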

common/common.h

@@ -54,8 +54,7 @@ struct gpt_params {
     float yarn_attn_factor = 1.0f;  // YaRN magnitude scaling factor
     float yarn_beta_fast   = 32.0f; // YaRN low correction dim
     float yarn_beta_slow   = 1.0f;  // YaRN high correction dim
-    llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
+    int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
 
     // sampling parameters
     int32_t top_k = 40; // <= 0 to use vocab size

llama.cpp

@@ -1706,12 +1706,12 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const
         hparams.rope_freq_base = rope_freq_base;
     }
 
-    llama_rope_scaling_type rope_scaling_type = params.rope_scaling_type;
+    int8_t rope_scaling_type = params.rope_scaling_type;
     if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
         uint8_t type = LLAMA_ROPE_SCALING_LINEAR;
         GGUF_GET_KEY(ctx, type, gguf_get_val_u8, GGUF_TYPE_UINT8, false, kv(LLM_KV_ROPE_SCALING_TYPE));
-        rope_scaling_type = llama_rope_scaling_type(type);
+        rope_scaling_type = int8_t(type);
     }
 
     GGML_ASSERT(rope_scaling_type >= 0 && rope_scaling_type <= LLAMA_ROPE_SCALING_MAX_VALUE);
@@ -6234,6 +6234,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_batch =*/ 512,
         /*.n_gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
+        /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
         /*.tensor_split =*/ nullptr,
         /*.rope_freq_base =*/ 0.0f,
         /*.rope_freq_scale =*/ 0.0f,
@@ -6241,7 +6242,6 @@ struct llama_context_params llama_context_default_params() {
         /*.yarn_attn_factor =*/ 1.0f,
         /*.yarn_beta_fast =*/ 32.0f,
         /*.yarn_beta_slow =*/ 1.0f,
-        /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram =*/ false,

llama.h

@@ -108,7 +108,7 @@ extern "C" {
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
-    enum llama_rope_scaling_type: int8_t {
+    enum llama_rope_scaling_type {
        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
        LLAMA_ROPE_SCALING_NONE = 0,
        LLAMA_ROPE_SCALING_LINEAR = 1,
@@ -131,23 +131,22 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        uint32_t seed; // RNG seed, -1 for random
-        int32_t n_ctx; // text context
-        int32_t n_batch; // prompt processing batch size
-        int32_t n_gpu_layers; // number of layers to store in VRAM
-        int32_t main_gpu; // the GPU that is used for scratch and small tensors
+        uint32_t seed;              // RNG seed, -1 for random
+        int32_t  n_ctx;             // text context
+        int32_t  n_batch;           // prompt processing batch size
+        int32_t  n_gpu_layers;      // number of layers to store in VRAM
+        int32_t  main_gpu;          // the GPU that is used for scratch and small tensors
+        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float rope_freq_base; // RoPE base frequency
-        float rope_freq_scale; // RoPE frequency scaling factor
-        float yarn_ext_factor; // YaRN extrapolation mix factor
-        float yarn_attn_factor; // YaRN magnitude scaling factor
-        float yarn_beta_fast; // YaRN low correction dim
-        float yarn_beta_slow; // YaRN high correction dim
-        llama_rope_scaling_type rope_scaling_type;
+        float    rope_freq_base;   // RoPE base frequency
+        float    rope_freq_scale;  // RoPE frequency scaling factor
+        float    yarn_ext_factor;  // YaRN extrapolation mix factor
+        float    yarn_attn_factor; // YaRN magnitude scaling factor
+        float    yarn_beta_fast;   // YaRN low correction dim
+        float    yarn_beta_slow;   // YaRN high correction dim
 
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
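As a usage illustration (not part of the commit), a translation unit compiled as C can now include llama.h and work with these declarations directly. The sketch below only relies on names visible in the hunks above (`llama_context_default_params`, the `rope_scaling_type` field, and the `LLAMA_ROPE_SCALING_*` constants) and assumes the llama.cpp headers and library are available for compiling and linking.

#include <stdio.h>
#include "llama.h"

int main(void) {
    /* defaults; rope_scaling_type starts out as LLAMA_ROPE_SCALING_UNSPECIFIED */
    struct llama_context_params params = llama_context_default_params();

    /* rope_scaling_type is a plain int8_t holding a LLAMA_ROPE_SCALING_* value */
    params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR;

    printf("rope scaling type: %d\n", params.rope_scaling_type);
    return 0;
}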