llama : fix C compatibility
parent 56abb9a406
commit 43eaf06a2f

3 changed files with 17 additions and 19 deletions
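Context for the fix (an editorial note, not part of the original commit message): `enum llama_rope_scaling_type : int8_t` declares a fixed underlying type, which is C++11 syntax that C compilers (before C23) reject, so llama.h could no longer be consumed from plain C. The commit drops the `: int8_t` and stores the value in an explicitly sized `int8_t` field instead. A minimal C sketch of the pattern — the enum mirrors the diff, while `params_sketch` is a hypothetical struct for illustration:

    #include <stdint.h>

    /* Not valid C (pre-C23), C++11 only:
     *     enum llama_rope_scaling_type : int8_t { ... };
     * The portable form is a plain enum ...
     */
    enum llama_rope_scaling_type {
        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
        LLAMA_ROPE_SCALING_NONE        = 0,
        LLAMA_ROPE_SCALING_LINEAR      = 1,
    };

    /* ... plus an explicitly sized field that stores the enum's values,
     * giving the struct the same layout for C and C++ callers. */
    struct params_sketch {
        int8_t rope_scaling_type; /* holds a llama_rope_scaling_type value */
    };

    int main(void) {
        struct params_sketch p = { LLAMA_ROPE_SCALING_LINEAR };
        return p.rope_scaling_type == LLAMA_ROPE_SCALING_LINEAR ? 0 : 1;
    }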
common/common.h

@@ -54,8 +54,7 @@ struct gpt_params {
     float   yarn_attn_factor = 1.0f;  // YaRN magnitude scaling factor
     float   yarn_beta_fast   = 32.0f; // YaRN low correction dim
     float   yarn_beta_slow   = 1.0f;  // YaRN high correction dim
-    llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
+    int8_t  rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;

     // sampling parameters
     int32_t top_k = 40;    // <= 0 to use vocab size
llama.cpp

@@ -1706,12 +1706,12 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const
         hparams.rope_freq_base = rope_freq_base;
     }

-    llama_rope_scaling_type rope_scaling_type = params.rope_scaling_type;
+    int8_t rope_scaling_type = params.rope_scaling_type;

     if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
         uint8_t type = LLAMA_ROPE_SCALING_LINEAR;
         GGUF_GET_KEY(ctx, type, gguf_get_val_u8, GGUF_TYPE_UINT8, false, kv(LLM_KV_ROPE_SCALING_TYPE));
-        rope_scaling_type = llama_rope_scaling_type(type);
+        rope_scaling_type = int8_t(type);
     }

     GGML_ASSERT(rope_scaling_type >= 0 && rope_scaling_type <= LLAMA_ROPE_SCALING_MAX_VALUE);
@@ -6234,6 +6234,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_batch                     =*/ 512,
         /*.n_gpu_layers                =*/ 0,
         /*.main_gpu                    =*/ 0,
+        /*.rope_scaling_type           =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
         /*.tensor_split                =*/ nullptr,
         /*.rope_freq_base              =*/ 0.0f,
         /*.rope_freq_scale             =*/ 0.0f,

@@ -6241,7 +6242,6 @@ struct llama_context_params llama_context_default_params() {
         /*.yarn_attn_factor            =*/ 1.0f,
         /*.yarn_beta_fast              =*/ 32.0f,
         /*.yarn_beta_slow              =*/ 1.0f,
-        /*.rope_scaling_type           =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram                    =*/ false,
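Two details of the llama.cpp hunks above are worth noting (editorial, not stated in the commit): the cast `int8_t(type)` narrows the `uint8_t` read from the GGUF metadata before the range check in GGML_ASSERT, and the `/*.name =*/` comments merely label purely positional aggregate initializers, so moving `rope_scaling_type` up in the struct declaration requires moving its initializer to the matching slot. A small C sketch of that order sensitivity, using a hypothetical `example` struct rather than the real llama_context_params:

    #include <stdint.h>

    struct example {               /* hypothetical, not from llama.h */
        int32_t main_gpu;
        int8_t  rope_scaling_type; /* field moved up in the declaration */
        float   rope_freq_base;
    };

    /* Aggregate initialization is positional; the comments only label the
     * slots, so each value must sit at its field's declaration position. */
    static struct example example_default(void) {
        struct example result = {
            /*.main_gpu          =*/ 0,
            /*.rope_scaling_type =*/ -1, /* must follow main_gpu, as declared */
            /*.rope_freq_base    =*/ 0.0f,
        };
        return result;
    }

    int main(void) {
        return example_default().rope_scaling_type == -1 ? 0 : 1;
    }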
llama.h (27 changes)
@@ -108,7 +108,7 @@ extern "C" {
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };

-    enum llama_rope_scaling_type: int8_t {
+    enum llama_rope_scaling_type {
         LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
         LLAMA_ROPE_SCALING_NONE        = 0,
         LLAMA_ROPE_SCALING_LINEAR      = 1,
@@ -131,23 +131,22 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);

     struct llama_context_params {
         uint32_t seed;              // RNG seed, -1 for random
         int32_t  n_ctx;             // text context
         int32_t  n_batch;           // prompt processing batch size
         int32_t  n_gpu_layers;      // number of layers to store in VRAM
         int32_t  main_gpu;          // the GPU that is used for scratch and small tensors
+        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`

         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base;   // RoPE base frequency
         float rope_freq_scale;  // RoPE frequency scaling factor
         float yarn_ext_factor;  // YaRN extrapolation mix factor
         float yarn_attn_factor; // YaRN magnitude scaling factor
         float yarn_beta_fast;   // YaRN low correction dim
         float yarn_beta_slow;   // YaRN high correction dim

-        llama_rope_scaling_type rope_scaling_type;
-
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
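Finally, a hedged usage sketch (assumed, not shown in the commit): C callers keep assigning the enum constants exactly as before; only the storage width of the field changed.

    #include "llama.h"

    int main(void) {
        struct llama_context_params params = llama_context_default_params();
        /* the enum constant converts implicitly to the int8_t field; all
         * llama_rope_scaling_type values shown in the diff fit in one byte */
        params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR;
        return 0;
    }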