fix num tokens for multimodal + empty prompt in response

CJ Pais 2024-03-05 20:37:16 -08:00
parent 652ca2bded
commit 5db4c71a16


@@ -1303,6 +1303,7 @@ struct llama_server_context
     bool ingest_images(server_slot &slot, int n_batch)
     {
         int image_idx = 0;
+        std::string prompt = "";
 
         while (image_idx < (int) slot.images.size())
         {
@@ -1366,6 +1367,10 @@ struct llama_server_context
                 slot.params.input_suffix : // no more images, then process suffix prompt
                 (json)(slot.images[image_idx].prefix_prompt);
 
+            // rebuild the prompt since it was cleared earlier
+            prompt += img.prefix_prompt;
+            prompt += json_prompt;
+
             std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
             for (int i = 0; i < (int) append_tokens.size(); ++i)
             {
@@ -1374,6 +1379,13 @@ struct llama_server_context
             }
         }
 
+        // There is no prompt caching in multimodal currently
+        slot.n_prompt_tokens = slot.n_past;
+        slot.n_prompt_tokens_processed = slot.n_past;
+
+        // prompt for multimodal is set to empty to avoid processing those tokens here
+        slot.prompt = prompt;
+
         return true;
     }
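
For readers outside the diff context: the patch rebuilds the slot's text prompt while the images are being ingested and then derives the prompt token counters from n_past, since multimodal requests do not use prompt caching, so the response no longer reports zero prompt tokens or an empty prompt. Below is a minimal, self-contained sketch of that idea; the slot_state and image_segment types and the finish_multimodal_ingest helper are hypothetical names invented for this illustration, not code from the server itself.

// Standalone sketch of what the fix accomplishes (assumed types, not server code).
#include <iostream>
#include <string>
#include <vector>

struct image_segment {
    std::string prefix_prompt; // text that precedes this image in the request
};

struct slot_state {
    int n_past = 0;                    // tokens evaluated for this slot so far
    int n_prompt_tokens = 0;           // reported in the completion response
    int n_prompt_tokens_processed = 0; // reported in the timings object
    std::string prompt;                // echoed back in the response JSON
};

// Rebuild the text prompt from the per-image prefix prompts plus the trailing
// suffix, and take the prompt token counters from n_past: with no prompt
// caching for multimodal, every prompt token counts as processed.
void finish_multimodal_ingest(slot_state &slot,
                              const std::vector<image_segment> &images,
                              const std::string &suffix_prompt,
                              int tokens_evaluated)
{
    std::string prompt;
    for (const auto &img : images) {
        prompt += img.prefix_prompt;
    }
    prompt += suffix_prompt;

    slot.n_past                    = tokens_evaluated;
    slot.n_prompt_tokens           = slot.n_past;
    slot.n_prompt_tokens_processed = slot.n_past;
    slot.prompt                    = prompt; // previously left empty for multimodal
}

int main() {
    slot_state slot;
    finish_multimodal_ingest(slot, {{"Describe "}, {"then compare "}}, "these two images.", 42);
    std::cout << slot.prompt << " (" << slot.n_prompt_tokens << " prompt tokens)\n";
}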