minimize diffs

Olivier Chafik 2025-01-22 01:46:51 +00:00
parent dbf841b0d2
commit ef61a4c79e
13 changed files with 30 additions and 71 deletions

View file

@@ -41,7 +41,7 @@ indent_style = tab
trim_trailing_whitespace = unset
insert_final_newline = unset
[{tests/chat/templates/*.jinja,tests/chat/goldens/*.txt}]
[tests/chat/templates/*.jinja]
indent_style = unset
indent_size = unset
end_of_line = unset

View file

@@ -49,7 +49,6 @@ BUILD_TARGETS = \
# Binaries only useful for tests
TEST_TARGETS = \
tests/test-antiprompts \
tests/test-arg-parser \
tests/test-autorelease \
tests/test-backend-ops \
@@ -1475,11 +1474,6 @@ tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-antiprompts: tests/test-antiprompts.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-tool-call: tests/test-tool-call.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)

View file

@@ -4,12 +4,9 @@
#include "llama-cpp.h"
#include <functional>
#include <queue>
#include <string>
#include <vector>
#include <sstream>
#include <unordered_map>
#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'

View file

@@ -504,13 +504,12 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd;
// single-token antiprompts
std::vector<llama_token> antiprompt_single_token;
std::vector<llama_token> antiprompt_token;
antiprompt_single_token.reserve(params.antiprompt.size());
for (const std::string & antiprompt : params.antiprompt) {
auto ids = ::common_tokenize(ctx, antiprompt, false, true);
if (ids.size() == 1) {
antiprompt_single_token.push_back(ids[0]);
antiprompt_token.push_back(ids[0]);
}
}
@@ -756,7 +755,7 @@ int main(int argc, char ** argv) {
// check for reverse prompt using special tokens
llama_token last_token = common_sampler_last(smpl);
if (std::find(antiprompt_single_token.begin(), antiprompt_single_token.end(), last_token) != antiprompt_single_token.end()) {
if (std::find(antiprompt_token.begin(), antiprompt_token.end(), last_token) != antiprompt_token.end()) {
if (params.interactive) {
is_interacting = true;
}
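
Note on the hunk above: it only renames antiprompt_single_token to antiprompt_token; the underlying pattern stays the same, namely tokenize each antiprompt, keep the ones that map to exactly one token, and compare the last sampled token against that list. A minimal self-contained sketch of that pattern, assuming llama.cpp's common_tokenize and common_sampler_last helpers; the wrapper function name is hypothetical:

```cpp
// Sketch only (not the actual examples/main/main.cpp code): collect antiprompts
// that tokenize to a single token, then match the last sampled token against them.
// Assumes llama.cpp's common/common.h and common/sampling.h helpers; the wrapper
// function name below is hypothetical.
#include <algorithm>
#include <string>
#include <vector>

#include "common.h"
#include "sampling.h"

static bool hit_single_token_antiprompt(
        llama_context * ctx,
        common_sampler * smpl,
        const std::vector<std::string> & antiprompts) {
    std::vector<llama_token> antiprompt_token;
    antiprompt_token.reserve(antiprompts.size());
    for (const std::string & antiprompt : antiprompts) {
        auto ids = common_tokenize(ctx, antiprompt, false, true);
        if (ids.size() == 1) {
            // only single-token antiprompts can be checked against one sampled token
            antiprompt_token.push_back(ids[0]);
        }
    }
    const llama_token last_token = common_sampler_last(smpl);
    return std::find(antiprompt_token.begin(), antiprompt_token.end(), last_token)
        != antiprompt_token.end();
}
```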

View file

@@ -26,7 +26,6 @@
#include <deque>
#include <memory>
#include <mutex>
#include <optional>
#include <signal.h>
#include <thread>
#include <unordered_map>
@@ -168,6 +167,7 @@ struct slot_params {
{"min_keep", sampling.min_keep},
{"grammar", sampling.grammar},
{"grammar_trigger_words", sampling.grammar_trigger_words},
{"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
{"samplers", samplers},
{"speculative.n_max", speculative.n_max},
{"speculative.n_min", speculative.n_min},
@@ -386,6 +386,14 @@ struct server_task {
return out;
};
{
params.antiprompt.clear();
const auto stop = data.find("stop");
if (stop != data.end()) {
params.antiprompt = to_string_vec(*stop);
}
}
{
const auto grammar_trigger_words = data.find("grammar_trigger_words");
if (grammar_trigger_words != data.end()) {
@@ -401,13 +409,6 @@
}
}
{
const auto stop = data.find("stop");
if (stop != data.end()) {
params.antiprompt = to_string_vec(*stop);
}
}
{
const auto samplers = data.find("samplers");
if (samplers != data.end()) {
@@ -730,7 +731,7 @@ struct server_task_result_cmpl_final : server_task_result {
std::time_t t = std::time(0);
json res {
json res = json {
{"choices", json::array({choice})},
{"created", t},
{"model", oaicompat_model},
@@ -762,13 +763,13 @@ struct server_task_result_cmpl_final : server_task_result {
finish_reason = "stop";
}
json choice {
json choice = json {
{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}
};
json ret {
json ret = json {
{"choices", json::array({choice})},
{"created", t},
{"id", oaicompat_cmpl_id},
@@ -804,12 +805,10 @@ struct server_task_result_cmpl_partial : server_task_result {
result_timings timings;
// OAI-compat fields
bool verbose = false;
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;
json oaicompat_tools;
llama_tool_call_style oaicompat_tool_call_style = llama_tool_call_style::None;
bool verbose = false;
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;
virtual int get_index() override {
return index;
@@ -2048,9 +2047,6 @@ struct server_context {
bool process_token(completion_token_output & result, server_slot & slot) {
// remember which tokens were sampled - used for repetition penalties during sampling
const std::string token_str = result.text_to_send;
// TODO:
// const std::string token_str = result.text_to_send;
// const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special || (match.pos != std::string::npos && match.is_grammar_trigger));
slot.sampled = result.tok;
slot.generated_text += token_str;
@@ -2276,8 +2272,6 @@ struct server_context {
res->oaicompat = slot.params.oaicompat;
res->oaicompat_model = slot.params.oaicompat_model;
res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
// res->oaicompat_tools = slot.params.oaicompat_tools;
// res->oaicompat_tool_call_style = slot.params.oaicompat_tool_call_style;
// populate res.probs_output
if (slot.params.sampling.n_probs > 0) {
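
For reference, the reordered block in this file parses the request's "stop" field into params.antiprompt before the grammar trigger words are handled. A minimal sketch of that parsing step with nlohmann::json; the standalone to_string_vec helper here is a hypothetical stand-in for the lambda used in server.cpp, and the example payload is illustrative:

```cpp
// Sketch only: turning a request's "stop" field into an antiprompt list, mirroring
// the data.find("stop") block in the hunk above.
#include <nlohmann/json.hpp>
#include <string>
#include <vector>

using json = nlohmann::json;

// Hypothetical stand-in for the to_string_vec lambda defined earlier in server.cpp:
// accepts either a single string or an array of strings.
static std::vector<std::string> to_string_vec(const json & j) {
    std::vector<std::string> out;
    if (j.is_string()) {
        out.push_back(j.get<std::string>());
    } else if (j.is_array()) {
        for (const auto & e : j) {
            out.push_back(e.get<std::string>());
        }
    }
    return out;
}

int main() {
    json data = json::parse(R"({"stop": ["</s>", "User:"]})");

    std::vector<std::string> antiprompt;
    const auto stop = data.find("stop");
    if (stop != data.end()) {
        antiprompt = to_string_vec(*stop);
    }
    // antiprompt now holds {"</s>", "User:"}
    return 0;
}
```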

View file

@@ -1,14 +1,14 @@
#!/bin/bash
# make sure we are in the right directory
TESTS_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $TESTS_DIR
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $SCRIPT_DIR
set -eu
if [[ "${SLOW_TESTS:-0}" == 1 ]]; then
# Slow tests for tool calls need quite a few models ahead of time to avoid timing out.
python $TESTS_DIR/../../../scripts/fetch_server_test_models.py
python $SCRIPT_DIR/../../../scripts/fetch_server_test_models.py
fi
if [ $# -lt 1 ]

View file

@@ -361,7 +361,6 @@ inline std::string format_chat(const common_chat_template & tmpl, const std::vec
std::string role = json_value(curr_msg, "role", std::string(""));
std::string content;
if (curr_msg.contains("content")) {
if (curr_msg["content"].is_string()) {
content = curr_msg["content"].get<std::string>();
@@ -611,29 +610,16 @@ static json oaicompat_completion_params_parse(
llama_params["stop"] = json_value(body, "stop", json::array());
}
// Handle "response_format" field (https://platform.openai.com/docs/api-reference/chat/create#chat-create-response_format)
// Handle "response_format" field
auto tool_choice = json_value(body, "tool_choice", std::string("auto"));
if (body.contains("response_format")) {
json response_format = json_value(body, "response_format", json::object());
std::string response_type = json_value(response_format, "type", std::string());
if (response_type == "json_object") {
// Legacy llama.cpp, llama-cpp-python and Together.ai format.
llama_params["json_schema"] = json_value(response_format, "schema", json::object());
} else if (response_type == "json_schema") {
// OpenAI JSON schema format.
auto json_schema = json_value(response_format, "json_schema", json::object());
json schema = json_value(json_schema, "schema", json::object());
std::string description = json_value(json_schema, "description", std::string());
if (!description.empty()) {
if (schema.contains("description")) {
throw std::runtime_error("Cannot have both a description in the json_schema object and inside its schema.");
}
schema["description"] = description;
}
bool strict = json_value(json_schema, "strict", false);
if (strict) {
llama_params["json_schema"] = schema;
}
llama_params["json_schema"] = json_value(json_schema, "schema", json::object());
} else if (!response_type.empty() && response_type != "text") {
throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
}
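
The simplified branch above maps both response_format variants to llama_params["json_schema"]: the legacy "json_object" form reads response_format.schema, while the OpenAI-style "json_schema" form reads response_format.json_schema.schema, with the description/strict handling dropped. A minimal sketch of that mapping with nlohmann::json; the example payload shapes and the helper name are illustrative, not the actual utils.hpp code:

```cpp
// Sketch only: extracting a JSON schema from the two response_format shapes handled
// above. Assumed example payloads:
//   {"response_format": {"type": "json_object", "schema": {...}}}                   // legacy
//   {"response_format": {"type": "json_schema", "json_schema": {"schema": {...}}}}  // OpenAI-style
#include <nlohmann/json.hpp>
#include <stdexcept>
#include <string>

using json = nlohmann::json;

static json extract_json_schema(const json & body) {
    json llama_params = json::object();
    const json response_format = body.value("response_format", json::object());
    const std::string response_type = response_format.value("type", std::string());
    if (response_type == "json_object") {
        llama_params["json_schema"] = response_format.value("schema", json::object());
    } else if (response_type == "json_schema") {
        const json json_schema = response_format.value("json_schema", json::object());
        llama_params["json_schema"] = json_schema.value("schema", json::object());
    } else if (!response_type.empty() && response_type != "text") {
        throw std::runtime_error("unsupported response_format type: " + response_type);
    }
    return llama_params;
}
```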

View file

@@ -1266,8 +1266,6 @@ extern "C" {
// Returns the sampled token
LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
LLAMA_API bool llama_sampler_is_grammar_empty(struct llama_sampler * smpl);
// TODO: extend in the future
//LLAMA_API void llama_decode_with_sampler(struct llama_context * ctx, struct llama_sampler * smpl, struct llama_batch batch, ...);

View file

@@ -1,4 +1,3 @@
-r ../examples/agent/requirements.txt
-r ../examples/llava/requirements.txt
-r ../examples/server/bench/requirements.txt
-r ../examples/server/tests/requirements.txt

View file

@@ -1067,7 +1067,6 @@ struct llama_grammar * llama_grammar_init_impl(
// then the pointers would be invalidated when the local vec_rules goes out of scope.
return new llama_grammar {
vocab,
std::move(vec_rules),
std::move(stacks),
/* .partial_utf8 = */ {},

View file

@@ -3,7 +3,6 @@
#include "llama.h"
#include <map>
#include <set>
#include <string>
#include <vector>
@@ -116,6 +115,7 @@ struct llama_grammar {
// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8;
// lazy grammars wait for trigger words or tokens before constraining the sampling.
bool awaiting_trigger;
std::string trigger_buffer;
std::vector<llama_token> trigger_tokens;
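
The comment added in this hunk describes the lazy-grammar behaviour: the grammar waits for a trigger word or trigger token before it starts constraining sampling. The following is a conceptual sketch of that idea only, not the actual llama-grammar.cpp implementation; the trigger_words field and the accept_piece function are assumptions for illustration:

```cpp
// Conceptual sketch only: while a lazy grammar is awaiting its trigger, sampled text
// is buffered and the grammar is not applied; once a trigger token or trigger word is
// seen, the grammar starts constraining sampling.
#include <algorithm>
#include <string>
#include <vector>

struct lazy_trigger_state {
    bool awaiting_trigger = true;              // mirrors llama_grammar::awaiting_trigger
    std::string trigger_buffer;                // text accumulated while waiting
    std::vector<int> trigger_tokens;           // tokens that activate the grammar
    std::vector<std::string> trigger_words;    // words that activate the grammar (assumed field)
};

// Returns true once the grammar should start constraining sampling.
static bool accept_piece(lazy_trigger_state & st, int token, const std::string & piece) {
    if (!st.awaiting_trigger) {
        return true;  // already triggered; keep applying the grammar
    }
    if (std::find(st.trigger_tokens.begin(), st.trigger_tokens.end(), token) != st.trigger_tokens.end()) {
        st.awaiting_trigger = false;
        return true;
    }
    st.trigger_buffer += piece;
    for (const auto & word : st.trigger_words) {
        if (st.trigger_buffer.find(word) != std::string::npos) {
            st.awaiting_trigger = false;
            return true;
        }
    }
    return false;  // still waiting; do not constrain sampling yet
}
```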

View file

@@ -1536,10 +1536,10 @@ struct llama_sampler * llama_sampler_init_grammar(
if (grammar_str != nullptr && grammar_str[0] != '\0') {
*ctx = {
/* .vocab = */ vocab,
/* .grammar_str = */ grammar_str,
/* .grammar_root = */ grammar_root,
/* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens),
/* .vocab = */ vocab,
/* .grammar_str = */ grammar_str,
/* .grammar_root = */ grammar_root,
/* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens),
};
} else {
*ctx = {
@@ -2423,11 +2423,6 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
return LLAMA_DEFAULT_SEED;
}
bool llama_sampler_is_grammar_empty(struct llama_sampler * smpl) {
struct llama_sampler_grammar * ctx = (struct llama_sampler_grammar *) smpl->ctx;
return ctx->grammar == nullptr;
}
// perf
struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) {

tests/.gitignore (vendored, 2 lines changed)
View file

@@ -1,6 +1,4 @@
*
!chat/
!chat/**
!*.*
*.o
ggml-common.h