fix num tokens for multimodal + empty prompt in response

CJ Pais 2024-03-05 20:37:16 -08:00
parent 652ca2bded
commit 5db4c71a16


@@ -1303,6 +1303,7 @@ struct llama_server_context
     bool ingest_images(server_slot &slot, int n_batch)
     {
         int image_idx = 0;
+        std::string prompt = "";
 
         while (image_idx < (int) slot.images.size())
         {
@@ -1366,6 +1367,10 @@ struct llama_server_context
                 slot.params.input_suffix : // no more images, then process suffix prompt
                 (json)(slot.images[image_idx].prefix_prompt);
 
+            // rebuild the prompt since it was cleared earlier
+            prompt += img.prefix_prompt;
+            prompt += json_prompt;
+
             std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
             for (int i = 0; i < (int) append_tokens.size(); ++i)
             {
@@ -1374,6 +1379,13 @@ struct llama_server_context
             }
         }
 
+        // There is no prompt caching in multimodal currently
+        slot.n_prompt_tokens = slot.n_past;
+        slot.n_prompt_tokens_processed = slot.n_past;
+
+        // prompt for multimodal is set to empty to avoid processing those tokens here
+        slot.prompt = prompt;
+
         return true;
     }
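
For readers outside the diff context: the patch rebuilds the slot's text prompt while the images are being ingested and then derives the prompt token counters from n_past, since multimodal requests do not use prompt caching, so the response no longer reports zero prompt tokens or an empty prompt. Below is a minimal, self-contained sketch of that idea; the slot_state and image_segment types and the finish_multimodal_ingest helper are hypothetical names invented for this illustration, not code from the server itself.

// Standalone sketch of what the fix accomplishes (assumed types, not server code).
#include <iostream>
#include <string>
#include <vector>

struct image_segment {
    std::string prefix_prompt; // text that precedes this image in the request
};

struct slot_state {
    int n_past = 0;                    // tokens evaluated for this slot so far
    int n_prompt_tokens = 0;           // reported in the completion response
    int n_prompt_tokens_processed = 0; // reported in the timings object
    std::string prompt;                // echoed back in the response JSON
};

// Rebuild the text prompt from the per-image prefix prompts plus the trailing
// suffix, and take the prompt token counters from n_past: with no prompt
// caching for multimodal, every prompt token counts as processed.
void finish_multimodal_ingest(slot_state &slot,
                              const std::vector<image_segment> &images,
                              const std::string &suffix_prompt,
                              int tokens_evaluated)
{
    std::string prompt;
    for (const auto &img : images) {
        prompt += img.prefix_prompt;
    }
    prompt += suffix_prompt;

    slot.n_past                    = tokens_evaluated;
    slot.n_prompt_tokens           = slot.n_past;
    slot.n_prompt_tokens_processed = slot.n_past;
    slot.prompt                    = prompt; // previously left empty for multimodal
}

int main() {
    slot_state slot;
    finish_multimodal_ingest(slot, {{"Describe "}, {"then compare "}}, "these two images.", 42);
    std::cout << slot.prompt << " (" << slot.n_prompt_tokens << " prompt tokens)\n";
}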