From 88105b7f128a894784485133c618685637313f20 Mon Sep 17 00:00:00 2001 From: gtygo Date: Sat, 10 Aug 2024 01:44:31 +0800 Subject: [PATCH] Reuse querybatch to reduce frequent memory allocation --- examples/retrieval/retrieval.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 8628f4c28..d1eccc00a 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -253,6 +253,8 @@ int main(int argc, char ** argv) { chunks[i].tokens.clear(); } + struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1); + // start loop, receive query and return top k similar chunks based on cosine similarity std::string query; while (true) { @@ -260,13 +262,13 @@ int main(int argc, char ** argv) { std::getline(std::cin, query); std::vector query_tokens = llama_tokenize(ctx, query, true); - struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1); batch_add_seq(query_batch, query_tokens, 0); std::vector query_emb(n_embd, 0); batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd); - llama_batch_free(query_batch); + + llama_batch_clear(query_batch); // compute cosine similarities { @@ -293,6 +295,7 @@ int main(int argc, char ** argv) { } // clean up + llama_batch_free(query_batch); llama_print_timings(ctx); llama_free(ctx); llama_free_model(model);