moving to using refl-cpp for llama as well

parent 6fd690fae7
commit ef4c0f572b

9 changed files with 1914 additions and 1840 deletions
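The pattern of the change: positional aggregate initializers, previously annotated only with /*.field =*/ comments, become C++20 designated initializers, and the public structs in llama.h are tagged for refl-cpp by inheriting from refl::attr::usage::type. As a rough sketch of how refl-cpp can then enumerate such a struct (a hypothetical stand-in type; the registration macros shown here are not part of this diff):

    #include <refl.hpp>  // refl-cpp, single header
    #include <iostream>

    struct token_data_like {  // hypothetical stand-in for llama_token_data
        int   id;
        float logit;
        float p;
    };

    // register the type and its fields with refl-cpp
    REFL_AUTO(type(token_data_like), field(id), field(logit), field(p))

    int main() {
        token_data_like d{.id = 42, .logit = -1.5f, .p = 0.0f};
        // iterate the reflected members and print "name = value" pairs
        refl::util::for_each(refl::reflect(d).members, [&](auto member) {
            std::cout << member.name << " = " << member(d) << "\n";
        });
    }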
@@ -122,14 +122,16 @@ int main(int argc, char ** argv) {
         const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

         llama_batch batch_view = {
-            n_tokens,
-            batch.token + i,
-            nullptr,
-            batch.pos + i,
-            batch.n_seq_id + i,
-            batch.seq_id + i,
-            batch.logits + i,
-            0, 0, 0, // unused
+            .n_tokens=n_tokens,
+            .token=batch.token + i,
+            .embd=nullptr,
+            .pos=batch.pos + i,
+            .n_seq_id=batch.n_seq_id + i,
+            .seq_id=batch.seq_id + i,
+            .logits=batch.logits + i,
+            .all_pos_0=0,
+            .all_pos_1=0,
+            .all_seq_id=0, // unused
         };

         const int ret = llama_decode(ctx, batch_view);
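Two C++20 rules are load-bearing for the designated-initializer style used throughout this commit (a general language note, not something the diff states): designators must follow declaration order, and omitted members are value-initialized (pointers to nullptr, integers to 0). A minimal illustration:

    struct pair_t { int a; int b; };
    pair_t p = {.b = 1};             // ok: a is value-initialized to 0
    // pair_t q = {.b = 1, .a = 2};  // ill-formed: designators out of declaration order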
@@ -169,10 +169,13 @@ int main(int argc, char ** argv) {
     candidates.reserve(n_vocab);

     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+        candidates.emplace_back(llama_token_data{
+            .id=token_id,
+            .logit=logits[token_id],
+            .p=0.0f });
     }

-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    llama_token_data_array candidates_p = { .data=candidates.data(), .size=candidates.size(), .sorted=false };

     const int top_k = 40;
     const float top_p = 0.9f;
@@ -75,7 +75,18 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
     if (n_eval > n_batch) {
         n_eval = n_batch;
     }
-    llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+    llama_batch batch = {
+        .n_tokens=int32_t(n_eval),
+        .token=nullptr,
+        .embd=(image_embed->embed+i*n_embd),
+        .pos=nullptr,
+        .n_seq_id=nullptr,
+        .seq_id=nullptr,
+        .logits=nullptr,
+        .all_pos_0=*n_past,
+        .all_pos_1=1,
+        .all_seq_id=0
+    };
     if (llama_decode(ctx_llama, batch)) {
         fprintf(stderr, "%s : failed to eval\n", __func__);
         return false;
@@ -67,9 +67,12 @@ int main(int argc, char ** argv) {
     std::vector<llama_token_data> candidates;
     candidates.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        candidates.emplace_back(llama_token_data{
+            .id=token_id,
+            .logit=logits[token_id],
+            .p=0.0f});
     }
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    llama_token_data_array candidates_p(candidates.data(), candidates.size(), false);
     auto next_token = llama_sample_token(ctx, &candidates_p);
     auto next_token_str = llama_token_to_piece(ctx, next_token);
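The parenthesized candidates_p(...) call above compiles because this commit adds a three-argument constructor to llama_token_data_array (see the llama.h hunk below). One caveat worth noting: a user-declared constructor makes the type a non-aggregate, so designated-initializer forms of llama_token_data_array, such as the cur_p initialization in the server hunk further down, would need the same constructor-call treatment. Both spellings below go through the new constructor:

    llama_token_data_array a(candidates.data(), candidates.size(), false);
    llama_token_data_array b{candidates.data(), candidates.size(), false};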
@@ -1667,14 +1667,16 @@ struct llama_server_context
         const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
         llama_batch batch_view =
         {
-            n_tokens,
-            batch.token + i,
-            nullptr,
-            batch.pos + i,
-            batch.n_seq_id + i,
-            batch.seq_id + i,
-            batch.logits + i,
-            0, 0, 0, // unused
+            .n_tokens=n_tokens,
+            .token=batch.token + i,
+            .embd=nullptr,
+            .pos=batch.pos + i,
+            .n_seq_id=batch.n_seq_id + i,
+            .seq_id=batch.seq_id + i,
+            .logits=batch.logits + i,
+            .all_pos_0=0,
+            .all_pos_1=0,
+            .all_seq_id=0, // unused
         };

         const int ret = llama_decode(ctx, batch_view);
@@ -1722,7 +1724,10 @@ struct llama_server_context
             slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
         }

-        llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
+        llama_token_data_array cur_p = {
+            .data=slot.ctx_sampling->cur.data(),
+            .size=slot.ctx_sampling->cur.size(),
+            .sorted=false };
         result.tok = id;

         const int32_t n_probs = slot.sparams.n_probs;
@@ -124,10 +124,15 @@ int main(int argc, char ** argv) {
     candidates.reserve(n_vocab);

     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+        candidates.emplace_back(llama_token_data{ .id=token_id,
+            .logit=logits[token_id],
+            .p=0.0f });
     }

-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    llama_token_data_array candidates_p = {
+        .data=candidates.data(),
+        .size=candidates.size(),
+        .sorted=false };

     // sample the most likely token
     const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
llama.cpp (158 changed lines)
@@ -6745,7 +6745,8 @@ struct llama_grammar * llama_grammar_init(
         for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
             vec_rules[i].push_back(*pos);
         }
-        vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
+        llama_grammar_element ge = {.type=LLAMA_GRETYPE_END, .value=0};
+        vec_rules[i].push_back(ge);
     }

     // loop over alternates of start rule to build initial stacks
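The named temporary ge is one option; since C++20 the designated list can equally be passed to push_back inline, e.g.:

    vec_rules[i].push_back(llama_grammar_element{.type=LLAMA_GRETYPE_END, .value=0});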
@@ -7368,7 +7369,15 @@ struct llama_beam {
             tokens.resize(tokens.size() - n);
         }
     }
-    llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
+    llama_beam_view view() const {
+        llama_beam_view bv = {
+            .tokens=tokens.data(),
+            .n_tokens=tokens.size(),
+            .p=p,
+            .eob=eob
+        };
+        return bv;
+    }
 };

 // A struct for calculating logit-related info.
@@ -7389,7 +7398,12 @@ struct llama_logit_info {
     { }
     llama_token_data get_token_data(const llama_token token_id) const {
         constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
-        return {token_id, logits[token_id], p};
+        llama_token_data dd {
+            .id = token_id,
+            .logit = logits[token_id],
+            .p = p
+        };
+        return dd;
     }
     // Return top k token_data by logit.
     std::vector<llama_token_data> top_k(size_t k) {
@@ -7529,7 +7543,13 @@ struct llama_beam_search_data {
             beam_views[i] = beams[i].view();
         }
         common_prefix_length = find_common_prefix_length();
-        return {beam_views.data(), beams.size(), common_prefix_length, last_call};
+        llama_beams_state a = {
+            .beam_views=beam_views.data(),
+            .n_beams=beams.size(),
+            .common_prefix_length=common_prefix_length,
+            .last_call=last_call
+        };
+        return a;
     }

     // Loop:
@@ -8356,14 +8376,14 @@ static int llama_apply_lora_from_file_internal(
 //
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
-        /*.n_gpu_layers                =*/ 0,
-        /*.main_gpu                    =*/ 0,
-        /*.tensor_split                =*/ nullptr,
-        /*.progress_callback           =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
-        /*.vocab_only                  =*/ false,
-        /*.use_mmap                    =*/ true,
-        /*.use_mlock                   =*/ false,
+        .n_gpu_layers                = 0,
+        .main_gpu                    = 0,
+        .tensor_split                = nullptr,
+        .progress_callback           = nullptr,
+        .progress_callback_user_data = nullptr,
+        .vocab_only                  = false,
+        .use_mmap                    = true,
+        .use_mlock                   = false,
     };

 #ifdef GGML_USE_METAL
@@ -8375,23 +8395,23 @@ struct llama_model_params llama_model_default_params() {

 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
-        /*.seed              =*/ LLAMA_DEFAULT_SEED,
-        /*.n_ctx             =*/ 512,
-        /*.n_batch           =*/ 512,
-        /*.n_threads         =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
-        /*.n_threads_batch   =*/ GGML_DEFAULT_N_THREADS,
-        /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
-        /*.rope_freq_base    =*/ 0.0f,
-        /*.rope_freq_scale   =*/ 0.0f,
-        /*.yarn_ext_factor   =*/ -1.0f,
-        /*.yarn_attn_factor  =*/ 1.0f,
-        /*.yarn_beta_fast    =*/ 32.0f,
-        /*.yarn_beta_slow    =*/ 1.0f,
-        /*.yarn_orig_ctx     =*/ 0,
-        /*.mul_mat_q         =*/ true,
-        /*.f16_kv            =*/ true,
-        /*.logits_all        =*/ false,
-        /*.embedding         =*/ false,
+        .seed              = LLAMA_DEFAULT_SEED,
+        .n_ctx             = 512,
+        .n_batch           = 512,
+        .n_threads         = GGML_DEFAULT_N_THREADS, // TODO: better default
+        .n_threads_batch   = GGML_DEFAULT_N_THREADS,
+        .rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED,
+        .rope_freq_base    = 0.0f,
+        .rope_freq_scale   = 0.0f,
+        .yarn_ext_factor   = -1.0f,
+        .yarn_attn_factor  = 1.0f,
+        .yarn_beta_fast    = 32.0f,
+        .yarn_beta_slow    = 1.0f,
+        .yarn_orig_ctx     = 0,
+        .mul_mat_q         = true,
+        .f16_kv            = true,
+        .logits_all        = false,
+        .embedding         = false,
     };

     return result;
@@ -8399,12 +8419,12 @@ struct llama_context_params llama_context_default_params() {

 struct llama_model_quantize_params llama_model_quantize_default_params() {
     struct llama_model_quantize_params result = {
-        /*.nthread                =*/ 0,
-        /*.ftype                  =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
-        /*.allow_requantize       =*/ false,
-        /*.quantize_output_tensor =*/ true,
-        /*.only_copy              =*/ false,
-        /*.pure                   =*/ false,
+        .nthread                = 0,
+        .ftype                  = LLAMA_FTYPE_MOSTLY_Q5_1,
+        .allow_requantize       = false,
+        .quantize_output_tensor = true,
+        .only_copy              = false,
+        .pure                   = false,
     };

     return result;
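What the three default-params conversions above buy: the old /*.field =*/ comments documented intent but were never checked, while designators are verified against the struct definition, so a renamed or reordered field in llama.h now fails to compile at these sites instead of silently mis-assigning. A contrived illustration:

    struct params_t { int n_ctx; int n_batch; };
    params_t p = { /*.n_batch =*/ 512, /*.n_ctx =*/ 256 }; // comments lie; compiles, n_ctx=512
    params_t q = { .n_ctx = 256, .n_batch = 512 };         // names checked by the compiler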
@@ -9301,7 +9321,18 @@ int llama_eval_embd(
                 int n_past) {
     llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);

-    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
+    llama_batch batch = {
+        .n_tokens=n_tokens,
+        .token=nullptr,
+        .embd=embd,
+        .pos=nullptr,
+        .n_seq_id=nullptr,
+        .seq_id=nullptr,
+        .logits=nullptr,
+        .all_pos_0=n_past,
+        .all_pos_1=1,
+        .all_seq_id=0
+    };

     const int ret = llama_decode_internal(*ctx, batch);
     if (ret < 0) {
@@ -9321,22 +9352,34 @@ struct llama_batch llama_batch_get_one(
         int32_t      n_tokens,
         llama_pos    pos_0,
         llama_seq_id seq_id) {
-    return {
-        /*n_tokens   =*/ n_tokens,
-        /*tokens     =*/ tokens,
-        /*embd       =*/ nullptr,
-        /*pos        =*/ nullptr,
-        /*n_seq_id   =*/ nullptr,
-        /*seq_id     =*/ nullptr,
-        /*logits     =*/ nullptr,
-        /*all_pos_0  =*/ pos_0,
-        /*all_pos_1  =*/ 1,
-        /*all_seq_id =*/ seq_id,
-    };
+    llama_batch b = {
+        .n_tokens   = n_tokens,
+        .token      = tokens,
+        .embd       = nullptr,
+        .pos        = nullptr,
+        .n_seq_id   = nullptr,
+        .seq_id     = nullptr,
+        .logits     = nullptr,
+        .all_pos_0  = pos_0,
+        .all_pos_1  = 1,
+        .all_seq_id = seq_id,
+    };
+    return b;
 }

 struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
-    llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
+    llama_batch batch = {
+        .n_tokens=0,
+        .token=nullptr,
+        .embd=nullptr,
+        .pos=nullptr,
+        .n_seq_id=nullptr,
+        .seq_id=nullptr,
+        .logits=nullptr,
+        .all_pos_0=0,
+        .all_pos_1=0,
+        .all_seq_id=0
+    };

     if (embd) {
         batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
@@ -9533,16 +9576,15 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch

 struct llama_timings llama_get_timings(struct llama_context * ctx) {
     struct llama_timings result = {
-        /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,
-        /*.t_end_ms    =*/ 1.00 * ggml_time_ms(),
-        /*.t_load_ms   =*/ 1e-3 * ctx->t_load_us,
-        /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
-        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
-        /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,
-
-        /*.n_sample    =*/ std::max(1, ctx->n_sample),
-        /*.n_p_eval    =*/ std::max(1, ctx->n_p_eval),
-        /*.n_eval      =*/ std::max(1, ctx->n_eval),
+        .t_start_ms  = 1e-3 * ctx->t_start_us,
+        .t_end_ms    = 1.00 * ggml_time_ms(),
+        .t_load_ms   = 1e-3 * ctx->t_load_us,
+        .t_sample_ms = 1e-3 * ctx->t_sample_us,
+        .t_p_eval_ms = 1e-3 * ctx->t_p_eval_us,
+        .t_eval_ms   = 1e-3 * ctx->t_eval_us,
+        .n_sample    = std::max(1, ctx->n_sample),
+        .n_p_eval    = std::max(1, ctx->n_p_eval),
+        .n_eval      = std::max(1, ctx->n_eval),
     };

     return result;
llama.h (26 changed lines)
@@ -114,13 +114,19 @@ extern "C" {
         LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
     };

-    typedef struct llama_token_data {
+    typedef struct llama_token_data : refl::attr::usage::type {
         llama_token id;    // token id
         float       logit; // log-odds of the token
         float       p;     // probability of the token
     } llama_token_data;

-    typedef struct llama_token_data_array {
+    typedef struct llama_token_data_array : refl::attr::usage::type {
+        llama_token_data_array(llama_token_data * data,
+                               size_t size,
+                               bool sorted) :
+            data(data),
+            size(size),
+            sorted(sorted) {}
         llama_token_data * data;
         size_t size;
         bool sorted;
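A portability note on these llama.h changes (an observation from the language rules, not something the diff states): base classes and user-declared constructors are C++-only constructs, and extern "C" changes linkage, not syntax, so the header stops being consumable from plain C; this is presumably why the pure-C smoke test is deleted at the end of this diff. The constructor also ends llama_token_data_array's aggregate status, as noted earlier:

    /* a C translation unit like this no longer compiles against the new llama.h: */
    #include "llama.h"
    int main(void) { return 0; }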
@@ -138,7 +144,7 @@ extern "C" {
     // - seq_id : the sequence to which the respective token belongs
     // - logits : if zero, the logits for the respective token will not be output
     //
-    typedef struct llama_batch {
+    typedef struct llama_batch : refl::attr::usage::type {
         int32_t n_tokens;

         llama_token * token;
@@ -158,7 +164,7 @@ extern "C" {
         llama_seq_id all_seq_id; // used if seq_id == NULL
     } llama_batch;

-    struct llama_model_params {
+    struct llama_model_params : refl::attr::usage::type {
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu;     // the GPU that is used for scratch and small tensors
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
@@ -174,7 +180,7 @@ extern "C" {
         bool use_mlock; // force system to keep model in RAM
     };

-    struct llama_context_params {
+    struct llama_context_params : refl::attr::usage::type {
         uint32_t seed;    // RNG seed, -1 for random
         uint32_t n_ctx;   // text context, 0 = from model
         uint32_t n_batch; // prompt processing maximum batch size
@@ -199,7 +205,7 @@ extern "C" {
     };

     // model quantization parameters
-    typedef struct llama_model_quantize_params {
+    typedef struct llama_model_quantize_params : refl::attr::usage::type {
         int nthread;            // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         enum llama_ftype ftype; // quantize to this llama_ftype
         bool allow_requantize;  // allow quantizing non-f32/f16 tensors
@@ -237,13 +243,13 @@ extern "C" {
         LLAMA_GRETYPE_CHAR_ALT = 6,
     };

-    typedef struct llama_grammar_element {
+    typedef struct llama_grammar_element : refl::attr::usage::type {
         enum llama_gretype type;
         uint32_t value; // Unicode code point or rule ID
     } llama_grammar_element;

     // performance timing information
-    struct llama_timings {
+    struct llama_timings : refl::attr::usage::type {
         double t_start_ms;
         double t_end_ms;
         double t_load_ms;
@@ -720,7 +726,7 @@ extern "C" {
     // Beam search
     //

-    struct llama_beam_view {
+    struct llama_beam_view : refl::attr::usage::type {
         const llama_token * tokens;

         size_t n_tokens;
@@ -732,7 +738,7 @@ extern "C" {
     // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
     // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
     // These pointers are valid only during the synchronous callback, so should not be saved.
-    struct llama_beams_state {
+    struct llama_beams_state : refl::attr::usage::type {
         struct llama_beam_view * beam_views;

         size_t n_beams; // Number of elements in beam_views[].
@@ -1,3 +0,0 @@
-#include "llama.h"
-
-int main(void) {}