Reuse query_batch across queries to avoid allocating a new batch for every query

This commit is contained in:
gtygo 2024-08-10 01:44:31 +08:00
parent fe6dc61143
commit 88105b7f12

View file

@@ -253,6 +253,8 @@ int main(int argc, char ** argv) {
chunks[i].tokens.clear(); chunks[i].tokens.clear();
} }
struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
// start loop, receive query and return top k similar chunks based on cosine similarity // start loop, receive query and return top k similar chunks based on cosine similarity
std::string query; std::string query;
while (true) { while (true) {
@@ -260,13 +262,13 @@ int main(int argc, char ** argv) {
std::getline(std::cin, query); std::getline(std::cin, query);
std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true); std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
batch_add_seq(query_batch, query_tokens, 0); batch_add_seq(query_batch, query_tokens, 0);
std::vector<float> query_emb(n_embd, 0); std::vector<float> query_emb(n_embd, 0);
batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd); batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
llama_batch_free(query_batch);
llama_batch_clear(query_batch);
// compute cosine similarities // compute cosine similarities
{ {
@@ -293,6 +295,7 @@ int main(int argc, char ** argv) {
} }
// clean up // clean up
llama_batch_free(query_batch);
llama_print_timings(ctx); llama_print_timings(ctx);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);