minimize diffs
parent dbf841b0d2
commit ef61a4c79e
13 changed files with 30 additions and 71 deletions
@@ -41,7 +41,7 @@ indent_style = tab
 trim_trailing_whitespace = unset
 insert_final_newline = unset
 
-[{tests/chat/templates/*.jinja,tests/chat/goldens/*.txt}]
+[tests/chat/templates/*.jinja]
 indent_style = unset
 indent_size = unset
 end_of_line = unset
Makefile (6)
@@ -49,7 +49,6 @@ BUILD_TARGETS = \
 
 # Binaries only useful for tests
 TEST_TARGETS = \
-tests/test-antiprompts \
 tests/test-arg-parser \
 tests/test-autorelease \
 tests/test-backend-ops \
@@ -1475,11 +1474,6 @@ tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
 $(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-antiprompts: tests/test-antiprompts.cpp \
-$(OBJ_ALL)
-$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 tests/test-tool-call: tests/test-tool-call.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
@@ -4,12 +4,9 @@
 
 #include "llama-cpp.h"
 
-#include <functional>
-#include <queue>
 #include <string>
 #include <vector>
 #include <sstream>
-#include <unordered_map>
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -504,13 +504,12 @@ int main(int argc, char ** argv) {
 std::vector<llama_token> embd;
 
 // single-token antiprompts
-std::vector<llama_token> antiprompt_single_token;
+std::vector<llama_token> antiprompt_token;
 
-antiprompt_single_token.reserve(params.antiprompt.size());
 for (const std::string & antiprompt : params.antiprompt) {
 auto ids = ::common_tokenize(ctx, antiprompt, false, true);
 if (ids.size() == 1) {
-antiprompt_single_token.push_back(ids[0]);
+antiprompt_token.push_back(ids[0]);
 }
 }
 
@@ -756,7 +755,7 @@ int main(int argc, char ** argv) {
 
 // check for reverse prompt using special tokens
 llama_token last_token = common_sampler_last(smpl);
-if (std::find(antiprompt_single_token.begin(), antiprompt_single_token.end(), last_token) != antiprompt_single_token.end()) {
+if (std::find(antiprompt_token.begin(), antiprompt_token.end(), last_token) != antiprompt_token.end()) {
 if (params.interactive) {
 is_interacting = true;
 }
@@ -26,7 +26,6 @@
 #include <deque>
 #include <memory>
 #include <mutex>
-#include <optional>
 #include <signal.h>
 #include <thread>
 #include <unordered_map>
@@ -168,6 +167,7 @@ struct slot_params {
 {"min_keep", sampling.min_keep},
 {"grammar", sampling.grammar},
 {"grammar_trigger_words", sampling.grammar_trigger_words},
+{"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
 {"samplers", samplers},
 {"speculative.n_max", speculative.n_max},
 {"speculative.n_min", speculative.n_min},
@@ -386,6 +386,14 @@ struct server_task {
 return out;
 };
 
+{
+params.antiprompt.clear();
+const auto stop = data.find("stop");
+if (stop != data.end()) {
+params.antiprompt = to_string_vec(*stop);
+}
+}
+
 {
 const auto grammar_trigger_words = data.find("grammar_trigger_words");
 if (grammar_trigger_words != data.end()) {
@@ -401,13 +409,6 @@ struct server_task {
 }
 }
 
-{
-const auto stop = data.find("stop");
-if (stop != data.end()) {
-params.antiprompt = to_string_vec(*stop);
-}
-}
-
 {
 const auto samplers = data.find("samplers");
 if (samplers != data.end()) {
@@ -730,7 +731,7 @@ struct server_task_result_cmpl_final : server_task_result {
 
 std::time_t t = std::time(0);
 
-json res {
+json res = json {
 {"choices", json::array({choice})},
 {"created", t},
 {"model", oaicompat_model},
@@ -762,13 +763,13 @@ struct server_task_result_cmpl_final : server_task_result {
 finish_reason = "stop";
 }
 
-json choice {
+json choice = json {
 {"finish_reason", finish_reason},
 {"index", 0},
 {"delta", json::object()}
 };
 
-json ret {
+json ret = json {
 {"choices", json::array({choice})},
 {"created", t},
 {"id", oaicompat_cmpl_id},
@@ -808,8 +809,6 @@ struct server_task_result_cmpl_partial : server_task_result {
 oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
 std::string oaicompat_model;
 std::string oaicompat_cmpl_id;
-json oaicompat_tools;
-llama_tool_call_style oaicompat_tool_call_style = llama_tool_call_style::None;
 
 virtual int get_index() override {
 return index;
@@ -2048,9 +2047,6 @@ struct server_context {
 bool process_token(completion_token_output & result, server_slot & slot) {
 // remember which tokens were sampled - used for repetition penalties during sampling
 const std::string token_str = result.text_to_send;
-// TODO:
-// const std::string token_str = result.text_to_send;
-// const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special || (match.pos != std::string::npos && match.is_grammar_trigger));
 slot.sampled = result.tok;
 
 slot.generated_text += token_str;
@@ -2276,8 +2272,6 @@ struct server_context {
 res->oaicompat = slot.params.oaicompat;
 res->oaicompat_model = slot.params.oaicompat_model;
 res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
-// res->oaicompat_tools = slot.params.oaicompat_tools;
-// res->oaicompat_tool_call_style = slot.params.oaicompat_tool_call_style;
 
 // populate res.probs_output
 if (slot.params.sampling.n_probs > 0) {
@@ -1,14 +1,14 @@
 #!/bin/bash
 
 # make sure we are in the right directory
-TESTS_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-cd $TESTS_DIR
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cd $SCRIPT_DIR
 
 set -eu
 
 if [[ "${SLOW_TESTS:-0}" == 1 ]]; then
 # Slow tests for tool calls need quite a few models ahead of time to avoid timing out.
-python $TESTS_DIR/../../../scripts/fetch_server_test_models.py
+python $SCRIPT_DIR/../../../scripts/fetch_server_test_models.py
 fi
 
 if [ $# -lt 1 ]
@@ -361,7 +361,6 @@ inline std::string format_chat(const common_chat_template & tmpl, const std::vec
 std::string role = json_value(curr_msg, "role", std::string(""));
-
 std::string content;
 
 if (curr_msg.contains("content")) {
 if (curr_msg["content"].is_string()) {
 content = curr_msg["content"].get<std::string>();
@@ -611,29 +610,16 @@ static json oaicompat_completion_params_parse(
 llama_params["stop"] = json_value(body, "stop", json::array());
 }
 
-// Handle "response_format" field (https://platform.openai.com/docs/api-reference/chat/create#chat-create-response_format)
+// Handle "response_format" field
 auto tool_choice = json_value(body, "tool_choice", std::string("auto"));
 if (body.contains("response_format")) {
 json response_format = json_value(body, "response_format", json::object());
 std::string response_type = json_value(response_format, "type", std::string());
 if (response_type == "json_object") {
-// Legacy llama.cpp, llama-cpp-python and Together.ai format.
 llama_params["json_schema"] = json_value(response_format, "schema", json::object());
 } else if (response_type == "json_schema") {
-// OpenAI JSON schema format.
 auto json_schema = json_value(response_format, "json_schema", json::object());
-json schema = json_value(json_schema, "schema", json::object());
-std::string description = json_value(json_schema, "description", std::string());
-if (!description.empty()) {
-if (schema.contains("description")) {
-throw std::runtime_error("Cannot have both a description in the json_schema object and inside its schema.");
-}
-schema["description"] = description;
-}
-bool strict = json_value(json_schema, "strict", false);
-if (strict) {
-llama_params["json_schema"] = schema;
-}
+llama_params["json_schema"] = json_value(json_schema, "schema", json::object());
 } else if (!response_type.empty() && response_type != "text") {
 throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
 }
@@ -1266,8 +1266,6 @@ extern "C" {
 // Returns the sampled token
 LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
 
-LLAMA_API bool llama_sampler_is_grammar_empty(struct llama_sampler * smpl);
-
 // TODO: extend in the future
 //LLAMA_API void llama_decode_with_sampler(struct llama_context * ctx, struct llama_sampler * smpl, struct llama_batch batch, ...);
 
@@ -1,4 +1,3 @@
--r ../examples/agent/requirements.txt
 -r ../examples/llava/requirements.txt
 -r ../examples/server/bench/requirements.txt
 -r ../examples/server/tests/requirements.txt
@@ -1067,7 +1067,6 @@ struct llama_grammar * llama_grammar_init_impl(
 // then the pointers would be invalidated when the local vec_rules goes out of scope.
 return new llama_grammar {
 vocab,
-
 std::move(vec_rules),
 std::move(stacks),
 /* .partial_utf8 = */ {},
@@ -3,7 +3,6 @@
 #include "llama.h"
 
 #include <map>
-#include <set>
 #include <string>
 #include <vector>
 
@@ -116,6 +115,7 @@ struct llama_grammar {
 // buffer for partially generated UTF-8 sequence from accepted tokens
 llama_partial_utf8 partial_utf8;
 
+// lazy grammars wait for trigger words or tokens before constraining the sampling.
 bool awaiting_trigger;
 std::string trigger_buffer;
 std::vector<llama_token> trigger_tokens;
@@ -2423,11 +2423,6 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
 return LLAMA_DEFAULT_SEED;
 }
 
-bool llama_sampler_is_grammar_empty(struct llama_sampler * smpl) {
-struct llama_sampler_grammar * ctx = (struct llama_sampler_grammar *) smpl->ctx;
-return ctx->grammar == nullptr;
-}
-
 // perf
 
 struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) {
tests/.gitignore (vendored) (2)
@@ -1,6 +1,4 @@
 *
-!chat/
-!chat/**
 !*.*
 *.o
 ggml-common.h