diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8fe5e0b19..c71c9199d 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1303,6 +1303,7 @@ struct llama_server_context
     bool ingest_images(server_slot &slot, int n_batch)
     {
         int image_idx = 0;
+        std::string prompt = "";
 
         while (image_idx < (int) slot.images.size())
         {
@@ -1366,6 +1367,10 @@ struct llama_server_context
                 slot.params.input_suffix : // no more images, then process suffix prompt
                 (json)(slot.images[image_idx].prefix_prompt);
 
+            // rebuild the prompt since it was cleared earlier
+            prompt += img.prefix_prompt;
+            prompt += json_prompt;
+
             std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
             for (int i = 0; i < (int) append_tokens.size(); ++i)
             {
@@ -1374,6 +1379,13 @@ struct llama_server_context
             }
         }
 
+        // There is no prompt caching in multimodal currently
+        slot.n_prompt_tokens = slot.n_past;
+        slot.n_prompt_tokens_processed = slot.n_past;
+
+        // prompt for multimodal is set to empty to avoid processing those tokens here
+        slot.prompt = prompt;
+
         return true;
     }
 