Reuse query_batch to reduce frequent memory allocation
parent fe6dc61143
commit 88105b7f12
1 changed file with 5 additions and 2 deletions
@@ -253,6 +253,8 @@ int main(int argc, char ** argv) {
         chunks[i].tokens.clear();
     }
 
+    struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
+
     // start loop, receive query and return top k similar chunks based on cosine similarity
     std::string query;
     while (true) {
@@ -260,13 +262,13 @@ int main(int argc, char ** argv) {
         std::getline(std::cin, query);
         std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
 
-        struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
         batch_add_seq(query_batch, query_tokens, 0);
 
         std::vector<float> query_emb(n_embd, 0);
        batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
 
-        llama_batch_free(query_batch);
+        llama_batch_clear(query_batch);
 
         // compute cosine similarities
         {
@@ -293,6 +295,7 @@ int main(int argc, char ** argv) {
     }
 
     // clean up
+    llama_batch_free(query_batch);
     llama_print_timings(ctx);
     llama_free(ctx);
     llama_free_model(model);
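The change follows an allocate-once / reuse / free-once pattern: query_batch is created with llama_batch_init before the query loop, reset with llama_batch_clear after each decode instead of being freed and re-created for every query, and released with llama_batch_free only once during cleanup. Below is a minimal, self-contained C++ sketch of the same pattern; ReusableBatch and its members are illustrative stand-ins and not part of the llama.cpp API.

#include <cstdint>
#include <cstdio>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for llama_batch: a fixed-capacity token buffer
// that is allocated once and reused across loop iterations.
struct ReusableBatch {
    std::vector<int32_t> tokens;

    explicit ReusableBatch(size_t capacity) { tokens.reserve(capacity); }

    void add(int32_t tok) { tokens.push_back(tok); }

    // Analogous to llama_batch_clear: drop the contents but keep the
    // underlying allocation so the next iteration reuses the same memory.
    void clear() { tokens.clear(); }
};

int main() {
    const size_t n_batch = 2048;

    // Allocate once, before the loop (previously re-created per query).
    ReusableBatch query_batch(n_batch);

    std::string query;
    while (std::getline(std::cin, query)) {
        // Fill the batch for this query (stand-in for batch_add_seq).
        for (char c : query) {
            query_batch.add(static_cast<int32_t>(c));
        }

        // ... decode / compute embeddings here (stand-in for batch_decode) ...
        std::printf("processed %zu tokens\n", query_batch.tokens.size());

        // Reset for the next query instead of freeing and re-allocating.
        query_batch.clear();
    }

    // With the real llama_batch, this is where llama_batch_free(query_batch)
    // would run: once, during cleanup, after the loop exits.
    return 0;
}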