examples : fix build after sampling refactoring

ggml-ci
This commit is contained in:
Georgi Gerganov 2023-10-15 23:28:41 +03:00
parent 4a7f43f28c
commit 7e48e21b1f
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
8 changed files with 144 additions and 232 deletions

View file

@ -51,6 +51,12 @@ static std::vector<std::string> k_prompts = {
};
struct client {
~client() {
if (ctx_sampling) {
llama_sampling_free(ctx_sampling);
}
}
int32_t id = 0;
llama_seq_id seq_id = -1;
@ -68,9 +74,7 @@ struct client {
std::string prompt;
std::string response;
std::vector<llama_token> tokens_prev;
llama_sampling_context ctx_sampling;
struct llama_sampling_context * ctx_sampling = nullptr;
};
static void print_date_time() {
@ -147,21 +151,15 @@ int main(int argc, char ** argv) {
fprintf(stderr, "\n\n");
fflush(stderr);
const int n_ctx = llama_n_ctx(ctx);
const int n_vocab = llama_n_vocab(model);
const int n_ctx = llama_n_ctx(ctx);
std::vector<client> clients(n_clients);
for (size_t i = 0; i < clients.size(); ++i) {
auto & client = clients[i];
client.id = i;
client.tokens_prev.resize(std::max(256, params.n_predict));
std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
client.ctx_sampling = llama_sampling_context_init(params, NULL);
client.ctx_sampling = llama_sampling_init(params);
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
std::vector<llama_token> tokens_system;
tokens_system = ::llama_tokenize(ctx, k_system, true);
const int32_t n_tokens_system = tokens_system.size();
@ -253,7 +251,7 @@ int main(int argc, char ** argv) {
client.prompt = client.input + "\nAssistant:";
client.response = "";
std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
llama_sampling_reset(client.ctx_sampling);
// do not prepend BOS because we have a system prompt!
std::vector<llama_token> tokens_prompt;
@ -262,7 +260,7 @@ int main(int argc, char ** argv) {
for (size_t i = 0; i < tokens_prompt.size(); ++i) {
batch.token [batch.n_tokens] = tokens_prompt[i];
batch.pos [batch.n_tokens] = i + n_tokens_system;
batch.n_seq_id[batch.n_tokens] = client.id;
batch.n_seq_id[batch.n_tokens] = 1;
batch.seq_id [batch.n_tokens][0] = client.id;
batch.logits [batch.n_tokens] = false;
batch.n_tokens += 1;
@ -346,7 +344,9 @@ int main(int argc, char ** argv) {
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
const llama_token id = llama_sampling_sample(ctx, NULL, client.ctx_sampling, client.tokens_prev, candidates, client.i_batch - i);
const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);
llama_sampling_accept(client.ctx_sampling, ctx, id);
if (client.n_decoded == 1) {
// start measuring generation time after the first token to make sure all concurrent clients
@ -354,11 +354,8 @@ int main(int argc, char ** argv) {
client.t_start_gen = ggml_time_us();
}
// remember which tokens were sampled - used for repetition penalties during sampling
client.tokens_prev.erase(client.tokens_prev.begin());
client.tokens_prev.push_back(id);
const std::string token_str = llama_token_to_piece(ctx, id);
client.response += token_str;
client.sampled = id;