compiling and running

mike dupont 2023-11-22 16:46:32 -05:00
parent 09a1f053e7
commit b598cf84fa
12 changed files with 158 additions and 103 deletions
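
The change is the same across all twelve files: C99 designated initializers, which C++ compilers reject (C++20 accepts a restricted form, in declaration order only), are replaced by constructor calls, with the old field names kept as /* .field= */ comments at each call site. A minimal sketch of the pattern, using a hypothetical point struct rather than the real llama_batch:

    // Before: designated initializer -- valid C99; rejected by C++ until
    // C++20, and accepted there only in declaration order.
    struct point_c { int x; int y; };
    struct point_c p = { .x = 1, .y = 2 };

    // After: give the struct a constructor and switch call sites to it,
    // keeping the field names as comments so argument order stays auditable.
    struct point_cpp {
        point_cpp(int x, int y) : x(x), y(y) {}
        int x;
        int y;
    };
    point_cpp p2(/* .x= */ 1, /* .y= */ 2);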

View file

@@ -734,5 +734,5 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
 tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-c.o: tests/test-c.c llama.h
-	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
+tests/test-c.o: tests/test-c.cpp llama.h
+	$(CXX) $(CFLAGS) -c $(filter-out %.h,$^) -o $@

View file

@@ -42,7 +42,7 @@ extern char const *LLAMA_BUILD_TARGET;
 //
 int32_t get_num_physical_cores();
-struct gpt_params {
+struct gpt_params : refl::attr::usage::type{
     uint32_t seed = -1; // RNG seed
     int32_t n_threads = get_num_physical_cores();

View file

@@ -219,7 +219,7 @@ namespace grammar_parser {
             // in original rule, replace previous symbol with reference to generated rule
             out_elements.resize(last_sym_start);
-            llama_grammar_element(LLAMA_GRETYPE_RULE_REF, sub_rule_id) a;
+            llama_grammar_element a(LLAMA_GRETYPE_RULE_REF, sub_rule_id);
             out_elements.push_back(a);
             pos = parse_space(pos + 1, is_nested);

View file

@@ -121,18 +121,18 @@ int main(int argc, char ** argv) {
     for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
         const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-        llama_batch batch_view = {
-            .n_tokens=n_tokens,
-            .token=batch.token + i,
-            .embd=nullptr,
-            .pos=batch.pos + i,
-            .n_seq_id=batch.n_seq_id + i,
-            .seq_id=batch.seq_id + i,
-            .logits=batch.logits + i,
-            .all_pos_0=0,
-            .all_pos_1=0,
-            .all_seq_id=0, // unused
-        };
+        llama_batch batch_view(
+            /* .n_tokens= */ n_tokens,
+            /* .token= */ batch.token + i,
+            /* .embd= */ nullptr,
+            /* .pos= */ batch.pos + i,
+            /* .n_seq_id= */ batch.n_seq_id + i,
+            /* .seq_id= */ batch.seq_id + i,
+            /* .logits= */ batch.logits + i,
+            /* .all_pos_0= */ 0,
+            /* .all_pos_1= */ 0,
+            /* .all_seq_id= */ 0 // unused
+        );
         const int ret = llama_decode(ctx, batch_view);
         if (ret != 0) {

View file

@@ -169,13 +169,13 @@ int main(int argc, char ** argv) {
     candidates.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{
-            .id=token_id,
-            .logit=logits[token_id],
-            .p=0.0f });
+        candidates.emplace_back(llama_token_data(
+            token_id,
+            logits[token_id],
+            0.0f ));
     }
-    llama_token_data_array candidates_p = { .data=candidates.data(), .size=candidates.size(), .sorted=false };
+    llama_token_data_array candidates_p(candidates.data(), candidates.size(), false);
     const int top_k = 40;
     const float top_p = 0.9f;

View file

@@ -75,18 +75,18 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
     if (n_eval > n_batch) {
         n_eval = n_batch;
     }
-    llama_batch batch = {
-        .n_tokens=int32_t(n_eval),
-        .token=nullptr,
-        .embd=(image_embed->embed+i*n_embd),
-        .pos=nullptr,
-        .n_seq_id=nullptr,
-        .seq_id=nullptr,
-        .logits=nullptr,
-        .all_pos_0=*n_past,
-        .all_pos_1=1,
-        .all_seq_id=0
-    };
+    llama_batch batch(
+        /* .n_tokens= */ int32_t(n_eval),
+        /* .token= */ nullptr,
+        /* .embd= */ (image_embed->embed+i*n_embd),
+        /* .pos= */ nullptr,
+        /* .n_seq_id= */ nullptr,
+        /* .seq_id= */ nullptr,
+        /* .logits= */ nullptr,
+        /* .all_pos_0= */ *n_past,
+        /* .all_pos_1= */ 1,
+        /* .all_seq_id= */ 0
+    );
     if (llama_decode(ctx_llama, batch)) {
         fprintf(stderr, "%s : failed to eval\n", __func__);
         return false;

View file

@@ -99,11 +99,16 @@ static void sigint_handler(int signo) {
     }
 }
 #endif
+using namespace refl;
 int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
+    using Td = type_descriptor<gpt_params>;
+    //constexpr auto tbl = descriptor::get_attribute<gpt_params>(Td{});
+    //constexpr auto tbl_name = REFL_MAKE_CONST_STRING(tbl.name);
     if (!gpt_params_parse(argc, argv, params)) {
         return 1;
     }
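
The using Td alias above works because gpt_params now derives from refl::attr::usage::type, making it a legal refl-cpp attribute target; the commented-out lines are an unfinished attribute lookup. A sketch of how the type could be enumerated with refl-cpp, assuming a REFL_AUTO registration that this commit does not include (the field list here is illustrative, not the full gpt_params):

    #include <iostream>
    #include "refl.hpp"

    // Hypothetical registration -- not part of this commit.
    REFL_AUTO(type(gpt_params), field(seed), field(n_threads))

    static void dump_gpt_params_fields() {
        // refl::reflect<T>() yields the type_descriptor<gpt_params> aliased as Td
        constexpr auto td = refl::reflect<gpt_params>();
        refl::util::for_each(td.members, [](auto member) {
            std::cout << member.name << '\n'; // prints "seed", "n_threads"
        });
    }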

View file

@@ -67,10 +67,10 @@ int main(int argc, char ** argv) {
     std::vector<llama_token_data> candidates;
     candidates.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{
-            .id=token_id,
-            .logit=logits[token_id],
-            .p=0.0f});
+        candidates.emplace_back(llama_token_data(
+            token_id,
+            logits[token_id],
+            0.0f));
     }
     llama_token_data_array candidates_p(candidates.data(), candidates.size(), false );
     auto next_token = llama_sample_token(ctx, &candidates_p);

View file

@@ -31,8 +31,16 @@
 using json = nlohmann::json;
-struct server_params
+struct server_params : refl::attr::usage::type
 {
+    // initialize from the same defaults the members carry below
+    server_params() :
+        hostname("127.0.0.1"),
+        public_path("examples/server/public"),
+        port(8080),
+        read_timeout(600),
+        write_timeout(600) {}
     std::string hostname = "127.0.0.1";
     std::string public_path = "examples/server/public";
     int32_t port = 8080;
@@ -522,6 +530,28 @@ struct llama_server_context
     std::vector<task_result> queue_results;
     std::mutex mutex_tasks;
     std::mutex mutex_results;
+    llama_server_context() :
+        model(nullptr),
+        ctx(nullptr),
+        clp_ctx(nullptr),
+        params(),
+        // llama_batch no longer has a default constructor, so zero-initialize it here
+        batch(0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0),
+        multimodal(false),
+        clean_kv_cache(true),
+        all_slots_are_idle(false),
+        add_bos_token(true),
+        //int32_t id_gen;
+        //int32_t n_ctx; // total context for all clients / slots
+        system_need_update(false) {}
+    //std::string system_prompt;
+    //std::vector<llama_token> system_tokens;
+    //std::string name_user; // this should be the antiprompt
+    //std::string name_assistant;
+    //std::vector<llama_client_slot> slots;
+    //std::vector<task_server> queue_tasks;
+    //std::vector<task_result> queue_results;
+    //std::mutex mutex_tasks;
+    //std::mutex mutex_results;
     ~llama_server_context()
     {
@@ -1303,7 +1333,7 @@ struct llama_server_context
         for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
         {
             const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-            llama_batch batch_view = {
+            llama_batch batch_view(
                 n_tokens,
                 batch.token + i,
                 nullptr,
@@ -1311,8 +1341,8 @@ struct llama_server_context
                 batch.n_seq_id + i,
                 batch.seq_id + i,
                 batch.logits + i,
-                0, 0, 0, // unused
-            };
+                0, 0, 0 // unused
+            );
             if (llama_decode(ctx, batch_view))
             {
                 LOG_TEE("%s : failed to eval\n", __func__);
@@ -1665,19 +1695,18 @@ struct llama_server_context
         for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
         {
             const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-            llama_batch batch_view =
-            {
-                .n_tokens=n_tokens,
-                .token=batch.token + i,
-                .embd=nullptr,
-                .pos=batch.pos + i,
-                .n_seq_id=batch.n_seq_id + i,
-                .seq_id=batch.seq_id + i,
-                .logits=batch.logits + i,
-                .all_pos_0=.0,
-                .all_pos_1=0,
-                .all_seq_id=0, // unused
-            };
+            llama_batch batch_view(
+                /* .n_tokens= */ n_tokens,
+                /* .token= */ batch.token + i,
+                /* .embd= */ nullptr,
+                /* .pos= */ batch.pos + i,
+                /* .n_seq_id= */ batch.n_seq_id + i,
+                /* .seq_id= */ batch.seq_id + i,
+                /* .logits= */ batch.logits + i,
+                /* .all_pos_0= */ 0,
+                /* .all_pos_1= */ 0,
+                /* .all_seq_id= */ 0 // unused
+            );
             const int ret = llama_decode(ctx, batch_view);
             if (ret != 0)
@@ -1724,10 +1753,10 @@ struct llama_server_context
             slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
         }
-        llama_token_data_array cur_p = {
-            .data=slot.ctx_sampling->cur.data(),
-            .size=slot.ctx_sampling->cur.size(),
-            .sorted=false };
+        llama_token_data_array cur_p(
+            slot.ctx_sampling->cur.data(),
+            slot.ctx_sampling->cur.size(),
+            false );
         result.tok = id;
         const int32_t n_probs = slot.sparams.n_probs;
@@ -2596,4 +2625,4 @@ int main(int argc, char **argv)
     llama_backend_free();
     return 0;
 }

View file

@@ -124,15 +124,15 @@ int main(int argc, char ** argv) {
         candidates.reserve(n_vocab);
         for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{ .id=token_id,
-                .logit=logits[token_id],
-                .p=0.0f });
+            candidates.emplace_back(llama_token_data( token_id,
+                logits[token_id],
+                0.0f ));
         }
-        llama_token_data_array candidates_p = {
-            .data=candidates.data(),
-            .size=candidates.size(),
-            .sorted=false };
+        llama_token_data_array candidates_p(
+            candidates.data(),
+            candidates.size(),
+            false );
         // sample the most likely token
         const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

View file

@@ -9321,18 +9321,18 @@ int llama_eval_embd(
                     int n_past) {
     llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
-    llama_batch batch = {
-        .n_tokens=n_tokens,
-        .token=nullptr,
-        .embd=embd,
-        .pos=nullptr,
-        .n_seq_id=nullptr,
-        .seq_id=nullptr,
-        .logits=nullptr,
-        .all_pos_0=n_past,
-        .all_pos_1=1,
-        .all_seq_id=0
-    };
+    llama_batch batch(
+        n_tokens,
+        nullptr,
+        embd,
+        nullptr,
+        nullptr,
+        nullptr,
+        nullptr,
+        n_past,
+        1,
+        0
+    );
     const int ret = llama_decode_internal(*ctx, batch);
     if (ret < 0) {
@@ -9352,34 +9352,32 @@ struct llama_batch llama_batch_get_one(
                     int32_t n_tokens,
                     llama_pos pos_0,
                     llama_seq_id seq_id) {
-    llama_batch b = {
-        .n_tokens   = n_tokens,
-        .token      = tokens,
-        .embd       = nullptr,
-        .pos        = nullptr,
-        .n_seq_id   = nullptr,
-        .seq_id     = nullptr,
-        .logits     = nullptr,
-        .all_pos_0  = pos_0,
-        .all_pos_1  = 1,
-        .all_seq_id = seq_id,
-    };
+    llama_batch b(
+        n_tokens,
+        tokens,
+        nullptr,
+        nullptr,
+        nullptr,
+        nullptr,
+        nullptr,
+        pos_0,
+        1,
+        seq_id);
     return b;
 }
 struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
-    llama_batch batch = {
-        .n_tokens = 0,
-        .embd=nullptr,
-        .pos=nullptr,
-        .n_seq_id=nullptr,
-        .seq_id=nullptr,
-        .logits=nullptr,
-        .all_pos_0=0,
-        .all_pos_1=0,
-        .all_seq_id=0
-    };
+    llama_batch batch(
+        /* .n_tokens= */ 0,
+        /* .token= */ (llama_token *) nullptr,
+        /* .embd= */ (float *) nullptr,
+        /* .pos= */ (llama_pos *) nullptr,
+        /* .n_seq_id= */ (int32_t *) nullptr,
+        /* .seq_id= */ (llama_seq_id **) nullptr,
+        /* .logits= */ (int8_t *) nullptr,
+        /* .all_pos_0= */ 0,
+        /* .all_pos_1= */ 0,
+        /* .all_seq_id= */ 0);
     if (embd) {
         batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);

llama.h (25 changed lines)
View file

@@ -152,6 +152,29 @@ extern "C" {
     // - logits : if zero, the logits for the respective token will not be output
     //
     typedef struct llama_batch : refl::attr::usage::type{
+        llama_batch(int32_t n_tokens,
+                    llama_token * token,
+                    float * embd,
+                    llama_pos * pos,
+                    int32_t * n_seq_id,
+                    llama_seq_id ** seq_id,
+                    int8_t * logits,
+                    llama_pos all_pos_0,
+                    llama_pos all_pos_1,
+                    llama_seq_id all_seq_id
+        ) :
+            n_tokens(n_tokens),
+            token(token),
+            embd(embd),
+            pos(pos),
+            n_seq_id(n_seq_id),
+            seq_id(seq_id),
+            logits(logits),
+            all_pos_0(all_pos_0),
+            all_pos_1(all_pos_1),
+            all_seq_id(all_seq_id) {}
         int32_t n_tokens;
         llama_token * token;
@@ -254,7 +277,7 @@ extern "C" {
         llama_grammar_element( enum llama_gretype type,
                                uint32_t value // Unicode code point or rule ID
         ):type(type), value(value){}
-        llama_grammar_element( ):type(0), value(0){}
+        llama_grammar_element( ):type(llama_gretype(0)), value(0){}
         enum llama_gretype type;
         uint32_t value; // Unicode code point or rule ID
     } llama_grammar_element;
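
The llama_grammar_element change just above is typical of what "compiling and running" required: C permits initializing an enum from a plain integer, but C++ has no implicit int-to-enum conversion, so the default constructor's type(0) needs an explicit cast to the enum type. A standalone illustration with a hypothetical enum:

    enum color { red = 0, green = 1 };

    int main() {
        // color c1 = 0;      // error in C++: no implicit int -> enum conversion
        color c2 = color(0);  // OK: explicit functional cast, as in llama_gretype(0)
        return c2 == red ? 0 : 1;
    }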