moving to using refl-cpp for llama as well

mike dupont 2023-11-22 11:40:25 -05:00
parent 6fd690fae7
commit ef4c0f572b
9 changed files with 1914 additions and 1840 deletions
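This commit annotates the llama structs with refl-cpp's refl::attr::usage::type marker and rewrites the call sites below with designated initializers. As a rough sketch of what the refl-cpp direction enables, the following example iterates the fields of llama_token_data at compile time; the REFL_AUTO registration and the dump_token_data helper are illustrative assumptions and are not part of this commit:

    // Sketch only: assumes refl-cpp (refl.hpp) is on the include path and that
    // llama_token_data is registered with REFL_AUTO somewhere -- neither is in this diff.
    #include <cstdio>
    #include "refl.hpp"
    #include "llama.h"

    // Hypothetical field registration for compile-time reflection.
    REFL_AUTO(
        type(llama_token_data),
        field(id),
        field(logit),
        field(p)
    )

    // Print every reflected field name together with its value.
    static void dump_token_data(const llama_token_data & td) {
        refl::util::for_each(refl::reflect<llama_token_data>().members, [&](auto member) {
            std::printf("%s = %f\n", member.name.c_str(), (double) member(td));
        });
    }

    int main() {
        llama_token_data td;
        td.id    = 1;
        td.logit = -0.5f;
        td.p     = 0.25f;
        dump_token_data(td); // id = 1.000000, logit = -0.500000, p = 0.250000
    }

With metadata like this, generic serialization or logging of llama's parameter structs can be written once instead of per struct, which appears to be the motivation for tagging each struct with the refl::attr::usage::type base.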


@@ -122,14 +122,16 @@ int main(int argc, char ** argv) {
         const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
         llama_batch batch_view = {
-            n_tokens,
-            batch.token + i,
-            nullptr,
-            batch.pos + i,
-            batch.n_seq_id + i,
-            batch.seq_id + i,
-            batch.logits + i,
-            0, 0, 0, // unused
+            .n_tokens=n_tokens,
+            .token=batch.token + i,
+            .embd=nullptr,
+            .pos=batch.pos + i,
+            .n_seq_id=batch.n_seq_id + i,
+            .seq_id=batch.seq_id + i,
+            .logits=batch.logits + i,
+            .all_pos_0=0,
+            .all_pos_1=0,
+            .all_seq_id=0, // unused
         };
         const int ret = llama_decode(ctx, batch_view);


@@ -169,10 +169,13 @@ int main(int argc, char ** argv) {
         candidates.reserve(n_vocab);
         for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+            candidates.emplace_back(llama_token_data{
+                .id=token_id,
+                .logit=logits[token_id],
+                .p=0.0f });
         }
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+        llama_token_data_array candidates_p = { .data=candidates.data(), .size=candidates.size(), .sorted=false };
         const int top_k = 40;
         const float top_p = 0.9f;


@@ -75,7 +75,18 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+        llama_batch batch = {
+            .n_tokens=int32_t(n_eval),
+            .token=nullptr,
+            .embd=(image_embed->embed+i*n_embd),
+            .pos=nullptr,
+            .n_seq_id=nullptr,
+            .seq_id=nullptr,
+            .logits=nullptr,
+            .all_pos_0=*n_past,
+            .all_pos_1=1,
+            .all_seq_id=0
+        };
         if (llama_decode(ctx_llama, batch)) {
             fprintf(stderr, "%s : failed to eval\n", __func__);
             return false;


@@ -67,9 +67,12 @@ int main(int argc, char ** argv) {
     std::vector<llama_token_data> candidates;
     candidates.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        candidates.emplace_back(llama_token_data{
+            .id=token_id,
+            .logit=logits[token_id],
+            .p=0.0f});
     }
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    llama_token_data_array candidates_p(candidates.data(), candidates.size(), false);
     auto next_token = llama_sample_token(ctx, &candidates_p);
     auto next_token_str = llama_token_to_piece(ctx, next_token);


@@ -1667,14 +1667,16 @@ struct llama_server_context
                 const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
                 llama_batch batch_view =
                 {
-                    n_tokens,
-                    batch.token + i,
-                    nullptr,
-                    batch.pos + i,
-                    batch.n_seq_id + i,
-                    batch.seq_id + i,
-                    batch.logits + i,
-                    0, 0, 0, // unused
+                    .n_tokens=n_tokens,
+                    .token=batch.token + i,
+                    .embd=nullptr,
+                    .pos=batch.pos + i,
+                    .n_seq_id=batch.n_seq_id + i,
+                    .seq_id=batch.seq_id + i,
+                    .logits=batch.logits + i,
+                    .all_pos_0=0,
+                    .all_pos_1=0,
+                    .all_seq_id=0, // unused
                 };
                 const int ret = llama_decode(ctx, batch_view);

@@ -1722,7 +1724,10 @@ struct llama_server_context
                     slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
                 }
-                llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
+                llama_token_data_array cur_p = {
+                    .data=slot.ctx_sampling->cur.data(),
+                    .size=slot.ctx_sampling->cur.size(),
+                    .sorted=false };
                 result.tok = id;
                 const int32_t n_probs = slot.sparams.n_probs;


@@ -124,10 +124,15 @@ int main(int argc, char ** argv) {
         candidates.reserve(n_vocab);
         for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+            candidates.emplace_back(llama_token_data{ .id=token_id,
+                                                      .logit=logits[token_id],
+                                                      .p=0.0f });
         }
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+        llama_token_data_array candidates_p = {
+            .data=candidates.data(),
+            .size=candidates.size(),
+            .sorted=false };
         // sample the most likely token
         const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

llama.cpp (3648): File diff suppressed because it is too large.

llama.h (26):

@@ -114,13 +114,19 @@ extern "C" {
         LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
     };

-    typedef struct llama_token_data {
+    typedef struct llama_token_data : refl::attr::usage::type{
         llama_token id; // token id
         float logit;    // log-odds of the token
         float p;        // probability of the token
     } llama_token_data;

-    typedef struct llama_token_data_array {
+    typedef struct llama_token_data_array : refl::attr::usage::type{
+        llama_token_data_array(llama_token_data * data,
+                               size_t size,
+                               bool sorted):
+            data(data),
+            size(size),
+            sorted(sorted){}
         llama_token_data * data;
         size_t size;
         bool sorted;

@@ -138,7 +144,7 @@ extern "C" {
     // - seq_id : the sequence to which the respective token belongs
     // - logits : if zero, the logits for the respective token will not be output
     //
-    typedef struct llama_batch {
+    typedef struct llama_batch : refl::attr::usage::type{
         int32_t n_tokens;
         llama_token * token;

@@ -158,7 +164,7 @@ extern "C" {
         llama_seq_id all_seq_id; // used if seq_id == NULL
     } llama_batch;

-    struct llama_model_params {
+    struct llama_model_params : refl::attr::usage::type{
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu;     // the GPU that is used for scratch and small tensors
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

@@ -174,7 +180,7 @@ extern "C" {
         bool use_mlock; // force system to keep model in RAM
     };

-    struct llama_context_params {
+    struct llama_context_params : refl::attr::usage::type{
         uint32_t seed;    // RNG seed, -1 for random
         uint32_t n_ctx;   // text context, 0 = from model
         uint32_t n_batch; // prompt processing maximum batch size

@@ -199,7 +205,7 @@ extern "C" {
     };

     // model quantization parameters
-    typedef struct llama_model_quantize_params {
+    typedef struct llama_model_quantize_params : refl::attr::usage::type{
         int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         enum llama_ftype ftype; // quantize to this llama_ftype
         bool allow_requantize; // allow quantizing non-f32/f16 tensors

@@ -237,13 +243,13 @@ extern "C" {
         LLAMA_GRETYPE_CHAR_ALT = 6,
     };

-    typedef struct llama_grammar_element {
+    typedef struct llama_grammar_element : refl::attr::usage::type{
         enum llama_gretype type;
         uint32_t value; // Unicode code point or rule ID
     } llama_grammar_element;

     // performance timing information
-    struct llama_timings {
+    struct llama_timings : refl::attr::usage::type{
         double t_start_ms;
         double t_end_ms;
         double t_load_ms;

@@ -720,7 +726,7 @@ extern "C" {
     // Beam search
     //
-    struct llama_beam_view {
+    struct llama_beam_view : refl::attr::usage::type{
         const llama_token * tokens;
         size_t n_tokens;

@@ -732,7 +738,7 @@ extern "C" {
     // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
     // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
     // These pointers are valid only during the synchronous callback, so should not be saved.
-    struct llama_beams_state {
+    struct llama_beams_state : refl::attr::usage::type{
         struct llama_beam_view * beam_views;
         size_t n_beams; // Number of elements in beam_views[].
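Note on the two initialization styles used in this commit: designated initializers (standardized in C++20, available earlier as a compiler extension) apply only to aggregates and must follow declaration order. Because this diff gives llama_token_data_array a user-declared constructor, that struct is no longer an aggregate, so a strict C++20 compiler will only accept the constructor-call form for it, while structs that remain aggregates can keep the .field=value form. A minimal sketch, assuming C++20 and a populated std::vector<llama_token_data> candidates as in the earlier hunks:

    // Designated-initializer form, as used throughout the diff; the order
    // must match the declaration order (id, logit, p).
    llama_token_data td = { .id = 0, .logit = 0.0f, .p = 0.0f };

    // llama_token_data_array gains a constructor in this commit, so it is no
    // longer an aggregate; construct it through the new constructor instead.
    llama_token_data_array candidates_p(candidates.data(), candidates.size(), false);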


@@ -1,3 +0,0 @@
-#include "llama.h"
-int main(void) {}