diff --git a/.editorconfig b/.editorconfig
index fa84cb064..e092729bd 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -41,7 +41,7 @@ indent_style = tab
 trim_trailing_whitespace = unset
 insert_final_newline = unset
 
-[{tests/chat/templates/*.jinja,tests/chat/goldens/*.txt}]
+[tests/chat/templates/*.jinja]
 indent_style = unset
 indent_size = unset
 end_of_line = unset
diff --git a/Makefile b/Makefile
index 50dc14fa6..e9a093cbb 100644
--- a/Makefile
+++ b/Makefile
@@ -49,7 +49,6 @@ BUILD_TARGETS = \
 
 # Binaries only useful for tests
 TEST_TARGETS = \
-	tests/test-antiprompts \
 	tests/test-arg-parser \
 	tests/test-autorelease \
 	tests/test-backend-ops \
@@ -1475,11 +1474,6 @@ tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
 	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-antiprompts: tests/test-antiprompts.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 tests/test-tool-call: tests/test-tool-call.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
diff --git a/common/common.h b/common/common.h
index 75a189de6..964ea0351 100644
--- a/common/common.h
+++ b/common/common.h
@@ -4,12 +4,9 @@
 
 #include "llama-cpp.h"
 
-#include 
-#include 
 #include 
 #include 
 #include 
-#include 
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 821eb0b03..b112bfd6f 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -504,13 +504,12 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> embd;
 
     // single-token antiprompts
-    std::vector<llama_token> antiprompt_single_token;
+    std::vector<llama_token> antiprompt_token;
 
-    antiprompt_single_token.reserve(params.antiprompt.size());
     for (const std::string & antiprompt : params.antiprompt) {
         auto ids = ::common_tokenize(ctx, antiprompt, false, true);
         if (ids.size() == 1) {
-            antiprompt_single_token.push_back(ids[0]);
+            antiprompt_token.push_back(ids[0]);
         }
     }
 
@@ -756,7 +755,7 @@ int main(int argc, char ** argv) {
 
             // check for reverse prompt using special tokens
             llama_token last_token = common_sampler_last(smpl);
-            if (std::find(antiprompt_single_token.begin(), antiprompt_single_token.end(), last_token) != antiprompt_single_token.end()) {
+            if (std::find(antiprompt_token.begin(), antiprompt_token.end(), last_token) != antiprompt_token.end()) {
                 if (params.interactive) {
                     is_interacting = true;
                 }
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 10e8a1bdb..97430941e 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -26,7 +26,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -168,6 +167,7 @@ struct slot_params {
             {"min_keep", sampling.min_keep},
             {"grammar", sampling.grammar},
             {"grammar_trigger_words", sampling.grammar_trigger_words},
+            {"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
             {"samplers", samplers},
             {"speculative.n_max", speculative.n_max},
             {"speculative.n_min", speculative.n_min},
@@ -386,6 +386,14 @@ struct server_task {
             return out;
         };
 
+        {
+            params.antiprompt.clear();
+            const auto stop = data.find("stop");
+            if (stop != data.end()) {
+                params.antiprompt = to_string_vec(*stop);
+            }
+        }
+
         {
             const auto grammar_trigger_words = data.find("grammar_trigger_words");
             if (grammar_trigger_words != data.end()) {
@@ -401,13 +409,6 @@
             }
         }
 
-        {
-            const auto stop = data.find("stop");
-            if (stop != data.end()) {
-                params.antiprompt = to_string_vec(*stop);
-            }
-        }
-
         {
             const auto samplers = data.find("samplers");
             if (samplers != data.end()) {
@@ -730,7 +731,7 @@ struct server_task_result_cmpl_final : server_task_result {
 
         std::time_t t = std::time(0);
 
-        json res {
+        json res = json {
             {"choices", json::array({choice})},
             {"created", t},
             {"model", oaicompat_model},
@@ -762,13 +763,13 @@
            finish_reason = "stop";
        }
 
-        json choice {
+        json choice = json {
            {"finish_reason", finish_reason},
            {"index", 0},
            {"delta", json::object()}
        };
 
-        json ret {
+        json ret = json {
            {"choices", json::array({choice})},
            {"created", t},
            {"id", oaicompat_cmpl_id},
@@ -804,12 +805,10 @@ struct server_task_result_cmpl_partial : server_task_result {
     result_timings timings;
 
     // OAI-compat fields
-    bool verbose = false;
-    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
-    std::string oaicompat_model;
-    std::string oaicompat_cmpl_id;
-    json oaicompat_tools;
-    llama_tool_call_style oaicompat_tool_call_style = llama_tool_call_style::None;
+    bool verbose = false;
+    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
+    std::string oaicompat_model;
+    std::string oaicompat_cmpl_id;
 
     virtual int get_index() override {
         return index;
@@ -2048,9 +2047,6 @@ struct server_context {
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
         const std::string token_str = result.text_to_send;
-        // TODO:
-        //  const std::string token_str = result.text_to_send;
-        //  const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special || (match.pos != std::string::npos && match.is_grammar_trigger));
         slot.sampled = result.tok;
 
         slot.generated_text += token_str;
@@ -2276,8 +2272,6 @@ struct server_context {
         res->oaicompat = slot.params.oaicompat;
         res->oaicompat_model = slot.params.oaicompat_model;
         res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
-        // res->oaicompat_tools = slot.params.oaicompat_tools;
-        // res->oaicompat_tool_call_style = slot.params.oaicompat_tool_call_style;
 
         // populate res.probs_output
         if (slot.params.sampling.n_probs > 0) {
diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh
index e61d01b16..33fa8cc64 100755
--- a/examples/server/tests/tests.sh
+++ b/examples/server/tests/tests.sh
@@ -1,14 +1,14 @@
 #!/bin/bash
 
 # make sure we are in the right directory
-TESTS_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-cd $TESTS_DIR
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cd $SCRIPT_DIR
 
 set -eu
 
 if [[ "${SLOW_TESTS:-0}" == 1 ]]; then
     # Slow tests for tool calls need quite a few models ahead of time to avoid timing out.
-    python $TESTS_DIR/../../../scripts/fetch_server_test_models.py
+    python $SCRIPT_DIR/../../../scripts/fetch_server_test_models.py
 fi
 
 if [ $# -lt 1 ]
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 3c109109a..7641d3410 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -361,7 +361,6 @@ inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
         std::string role = json_value(curr_msg, "role", std::string(""));
 
         std::string content;
-
         if (curr_msg.contains("content")) {
             if (curr_msg["content"].is_string()) {
                 content = curr_msg["content"].get<std::string>();
@@ -611,29 +610,16 @@ static json oaicompat_completion_params_parse(
         llama_params["stop"] = json_value(body, "stop", json::array());
     }
 
-    // Handle "response_format" field (https://platform.openai.com/docs/api-reference/chat/create#chat-create-response_format)
+    // Handle "response_format" field
     auto tool_choice = json_value(body, "tool_choice", std::string("auto"));
     if (body.contains("response_format")) {
         json response_format = json_value(body, "response_format", json::object());
         std::string response_type = json_value(response_format, "type", std::string());
         if (response_type == "json_object") {
-            // Legacy llama.cpp, llama-cpp-python and Together.ai format.
             llama_params["json_schema"] = json_value(response_format, "schema", json::object());
         } else if (response_type == "json_schema") {
-            // OpenAI JSON schema format.
             auto json_schema = json_value(response_format, "json_schema", json::object());
-            json schema = json_value(json_schema, "schema", json::object());
-            std::string description = json_value(json_schema, "description", std::string());
-            if (!description.empty()) {
-                if (schema.contains("description")) {
-                    throw std::runtime_error("Cannot have both a description in the json_schema object and inside its schema.");
-                }
-                schema["description"] = description;
-            }
-            bool strict = json_value(json_schema, "strict", false);
-            if (strict) {
-                llama_params["json_schema"] = schema;
-            }
+            llama_params["json_schema"] = json_value(json_schema, "schema", json::object());
         } else if (!response_type.empty() && response_type != "text") {
             throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
         }
diff --git a/include/llama.h b/include/llama.h
index f6217d98c..b58e33e3c 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1266,8 +1266,6 @@
     // Returns the sampled token
     LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
 
-    LLAMA_API bool llama_sampler_is_grammar_empty(struct llama_sampler * smpl);
-
     // TODO: extend in the future
     //LLAMA_API void llama_decode_with_sampler(struct llama_context * ctx, struct llama_sampler * smpl, struct llama_batch batch, ...);
 
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index 025e477f6..94de59d7e 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -1,4 +1,3 @@
--r ../examples/agent/requirements.txt
 -r ../examples/llava/requirements.txt
 -r ../examples/server/bench/requirements.txt
 -r ../examples/server/tests/requirements.txt
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index b02c4e3cc..3dc593a48 100644
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -1067,7 +1067,6 @@
     // then the pointers would be invalidated when the local vec_rules goes out of scope.
     return new llama_grammar {
         vocab,
-        std::move(vec_rules),
         std::move(stacks),
         /* .partial_utf8 = */ {},
diff --git a/src/llama-grammar.h b/src/llama-grammar.h
index d96a685e2..38e7aff96 100644
--- a/src/llama-grammar.h
+++ b/src/llama-grammar.h
@@ -3,7 +3,6 @@
 #include "llama.h"
 
 #include 
-#include 
 #include 
 #include 
 
@@ -116,6 +115,7 @@
     // buffer for partially generated UTF-8 sequence from accepted tokens
     llama_partial_utf8 partial_utf8;
 
+    // lazy grammars wait for trigger words or tokens before constraining the sampling.
     bool awaiting_trigger;
     std::string trigger_buffer;
     std::vector<llama_token> trigger_tokens;
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 387ec6567..129888915 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1536,10 +1536,10 @@ struct llama_sampler * llama_sampler_init_grammar(
     if (grammar_str != nullptr && grammar_str[0] != '\0') {
         *ctx = {
-            /* .vocab = */ vocab,
-            /* .grammar_str = */ grammar_str,
-            /* .grammar_root = */ grammar_root,
-            /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens),
+            /* .vocab        = */ vocab,
+            /* .grammar_str  = */ grammar_str,
+            /* .grammar_root = */ grammar_root,
+            /* .grammar      = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens),
         };
     } else {
         *ctx = {
@@ -2423,11 +2423,6 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
     return LLAMA_DEFAULT_SEED;
 }
 
-bool llama_sampler_is_grammar_empty(struct llama_sampler * smpl) {
-    struct llama_sampler_grammar * ctx = (struct llama_sampler_grammar *) smpl->ctx;
-    return ctx->grammar == nullptr;
-}
-
 // perf
 
 struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) {
diff --git a/tests/.gitignore b/tests/.gitignore
index 6f6723930..620a48ee4 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -1,6 +1,4 @@
 *
-!chat/
-!chat/**
 !*.*
 *.o
 ggml-common.h