Merge remote-tracking branch 'origin/master' into server_branch

This commit is contained in:
pudepiedj 2024-03-08 12:31:31 +00:00
commit f44aa1f23a
17 changed files with 2416 additions and 2610 deletions

View file

@ -58,7 +58,8 @@ jobs:
cmake \ cmake \
python3-pip \ python3-pip \
wget \ wget \
psmisc psmisc \
language-pack-en
- name: Build - name: Build
id: cmake_build id: cmake_build

View file

@ -724,10 +724,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)

View file

@ -309,7 +309,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#ifndef _MSC_VER #ifndef _MSC_VER
#define LOG(...) LOG_IMPL(__VA_ARGS__, "") #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
#else #else
#define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "") #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
#endif #endif
// Main TEE macro. // Main TEE macro.
@ -323,7 +323,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#ifndef _MSC_VER #ifndef _MSC_VER
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "") #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
#else #else
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "") #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
#endif #endif
// LOG macro variants with auto endline. // LOG macro variants with auto endline.
@ -331,8 +331,8 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n") #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n") #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
#else #else
#define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n") #define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
#define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n") #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
#endif #endif
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE

View file

@ -173,6 +173,7 @@ struct cmd_params {
std::vector<bool> no_kv_offload; std::vector<bool> no_kv_offload;
std::vector<std::vector<float>> tensor_split; std::vector<std::vector<float>> tensor_split;
std::vector<bool> use_mmap; std::vector<bool> use_mmap;
std::vector<bool> embeddings;
int reps; int reps;
bool verbose; bool verbose;
output_formats output_format; output_formats output_format;
@ -192,6 +193,7 @@ static const cmd_params cmd_params_defaults = {
/* no_kv_offload */ {false}, /* no_kv_offload */ {false},
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)}, /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* use_mmap */ {true}, /* use_mmap */ {true},
/* embeddings */ {false},
/* reps */ 5, /* reps */ 5,
/* verbose */ false, /* verbose */ false,
/* output_format */ MARKDOWN /* output_format */ MARKDOWN
@ -214,6 +216,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n"); printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps); printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
@ -382,6 +385,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
} }
auto p = split<bool>(argv[i], split_delim); auto p = split<bool>(argv[i], split_delim);
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end()); params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
} else if (arg == "-embd" || arg == "--embeddings") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<bool>(argv[i], split_delim);
params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
} else if (arg == "-ts" || arg == "--tensor-split") { } else if (arg == "-ts" || arg == "--tensor-split") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -453,6 +463,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
return params; return params;
@ -472,6 +483,7 @@ struct cmd_params_instance {
bool no_kv_offload; bool no_kv_offload;
std::vector<float> tensor_split; std::vector<float> tensor_split;
bool use_mmap; bool use_mmap;
bool embeddings;
llama_model_params to_llama_mparams() const { llama_model_params to_llama_mparams() const {
llama_model_params mparams = llama_model_default_params(); llama_model_params mparams = llama_model_default_params();
@ -502,6 +514,7 @@ struct cmd_params_instance {
cparams.type_k = type_k; cparams.type_k = type_k;
cparams.type_v = type_v; cparams.type_v = type_v;
cparams.offload_kqv = !no_kv_offload; cparams.offload_kqv = !no_kv_offload;
cparams.embeddings = embeddings;
return cparams; return cparams;
} }
@ -517,6 +530,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & mg : params.main_gpu) for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split) for (const auto & ts : params.tensor_split)
for (const auto & mmp : params.use_mmap) for (const auto & mmp : params.use_mmap)
for (const auto & embd : params.embeddings)
for (const auto & nb : params.n_batch) for (const auto & nb : params.n_batch)
for (const auto & tk : params.type_k) for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v) for (const auto & tv : params.type_v)
@ -540,6 +554,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .no_kv_offload= */ nkvo, /* .no_kv_offload= */ nkvo,
/* .tensor_split = */ ts, /* .tensor_split = */ ts,
/* .use_mmap = */ mmp, /* .use_mmap = */ mmp,
/* .embeddings = */ embd,
}; };
instances.push_back(instance); instances.push_back(instance);
} }
@ -562,6 +577,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .no_kv_offload= */ nkvo, /* .no_kv_offload= */ nkvo,
/* .tensor_split = */ ts, /* .tensor_split = */ ts,
/* .use_mmap = */ mmp, /* .use_mmap = */ mmp,
/* .embeddings = */ embd,
}; };
instances.push_back(instance); instances.push_back(instance);
} }
@ -597,6 +613,7 @@ struct test {
bool no_kv_offload; bool no_kv_offload;
std::vector<float> tensor_split; std::vector<float> tensor_split;
bool use_mmap; bool use_mmap;
bool embeddings;
int n_prompt; int n_prompt;
int n_gen; int n_gen;
std::string test_time; std::string test_time;
@ -619,6 +636,7 @@ struct test {
no_kv_offload = inst.no_kv_offload; no_kv_offload = inst.no_kv_offload;
tensor_split = inst.tensor_split; tensor_split = inst.tensor_split;
use_mmap = inst.use_mmap; use_mmap = inst.use_mmap;
embeddings = inst.embeddings;
n_prompt = inst.n_prompt; n_prompt = inst.n_prompt;
n_gen = inst.n_gen; n_gen = inst.n_gen;
// RFC 3339 date-time format // RFC 3339 date-time format
@ -690,7 +708,7 @@ struct test {
"n_batch", "n_threads", "type_k", "type_v", "n_batch", "n_threads", "type_k", "type_v",
"n_gpu_layers", "split_mode", "n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload", "main_gpu", "no_kv_offload",
"tensor_split", "use_mmap", "tensor_split", "use_mmap", "embeddings",
"n_prompt", "n_gen", "test_time", "n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns", "avg_ns", "stddev_ns",
"avg_ts", "stddev_ts" "avg_ts", "stddev_ts"
@ -710,7 +728,7 @@ struct test {
} }
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" || if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" || field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "use_mmap") { field == "use_mmap" || field == "embeddings") {
return BOOL; return BOOL;
} }
if (field == "avg_ts" || field == "stddev_ts") { if (field == "avg_ts" || field == "stddev_ts") {
@ -744,7 +762,7 @@ struct test {
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode), std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(main_gpu), std::to_string(no_kv_offload),
tensor_split_str, std::to_string(use_mmap), tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()), std::to_string(avg_ns()), std::to_string(stdev_ns()),
std::to_string(avg_ts()), std::to_string(stdev_ts()) std::to_string(avg_ts()), std::to_string(stdev_ts())
@ -914,6 +932,9 @@ struct markdown_printer : public printer {
if (field == "use_mmap") { if (field == "use_mmap") {
return "mmap"; return "mmap";
} }
if (field == "embeddings") {
return "embd";
}
if (field == "tensor_split") { if (field == "tensor_split") {
return "ts"; return "ts";
} }
@ -957,6 +978,9 @@ struct markdown_printer : public printer {
if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) { if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
fields.emplace_back("use_mmap"); fields.emplace_back("use_mmap");
} }
if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
fields.emplace_back("embeddings");
}
fields.emplace_back("test"); fields.emplace_back("test");
fields.emplace_back("t/s"); fields.emplace_back("t/s");

View file

@ -13,7 +13,7 @@ async def main():
model_url = "http://127.0.0.1:6900" model_url = "http://127.0.0.1:6900"
responses: list[requests.Response] = await asyncio.gather(*[requests_post_async( responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
url= f"{model_url}/embedding", url= f"{model_url}/embedding",
json= {"content": str(i)*1024} json= {"content": str(0)*1024}
) for i in range(n)]) ) for i in range(n)])
for response in responses: for response in responses:

View file

@ -1,13 +1,12 @@
set(TARGET server) set(TARGET server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}) include_directories(${CMAKE_CURRENT_SOURCE_DIR})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address") add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
add_executable(${TARGET} server.cpp oai.hpp utils.hpp json.hpp httplib.h)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}> SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
) )
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
if (WIN32) if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif() endif()

View file

@ -452,7 +452,7 @@ Notice that each `probs` is an array of length `n_probs`.
"next_token": { "next_token": {
"has_next_token": true, "has_next_token": true,
"n_remain": -1, "n_remain": -1,
"num_tokens_predicted": 0, "n_decoded": 0,
"stopped_eos": false, "stopped_eos": false,
"stopped_limit": false, "stopped_limit": false,
"stopped_word": false, "stopped_word": false,

View file

@ -1,225 +0,0 @@
#pragma once
#include <string>
#include <vector>
#include <set>
#include <mutex>
#include <condition_variable>
#include <unordered_map>
#include "json.hpp"
#include "utils.hpp"
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
using json = nlohmann::json;
inline static json oaicompat_completion_params_parse(
const struct llama_model * model,
const json &body, /* openai api json semantics */
const std::string &chat_template)
{
json llama_params;
llama_params["__oaicompat"] = true;
// Map OpenAI parameters to llama.cpp parameters
//
// For parameters that are defined by the OpenAI documentation (e.g.
// temperature), we explicitly specify OpenAI's intended default; we
// need to do that because sometimes OpenAI disagrees with llama.cpp
//
// https://platform.openai.com/docs/api-reference/chat/create
llama_sampling_params default_sparams;
llama_params["model"] = json_value(body, "model", std::string("unknown"));
llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
llama_params["temperature"] = json_value(body, "temperature", 0.0);
llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
llama_params["top_p"] = json_value(body, "top_p", 1.0);
llama_params["n_predict"] = json_value(body, "max_tokens", -1);
llama_params["logit_bias"] = json_value(body, "logit_bias",json::object());
llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
llama_params["stream"] = json_value(body, "stream", false);
llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat);
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl);
llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p);
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z);
if (body.count("grammar") != 0) {
llama_params["grammar"] = json_value(body, "grammar", json::object());
}
// Handle 'stop' field
if (body.contains("stop") && body["stop"].is_string()) {
llama_params["stop"] = json::array({body["stop"].get<std::string>()});
} else {
llama_params["stop"] = json_value(body, "stop", json::array());
}
// Ensure there is ChatML-specific end sequence among stop words
llama_params["stop"].push_back("<|im_end|>");
return llama_params;
}
inline static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
{
json result = response.result_json;
bool stopped_word = result.count("stopped_word") != 0;
bool stopped_eos = json_value(result, "stopped_eos", false);
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
std::string content = json_value(result, "content", std::string(""));
std::string finish_reason = "length";
if (stopped_word || stopped_eos) {
finish_reason = "stop";
}
json choices =
streaming ? json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}}})
: json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"message", json{{"content", content},
{"role", "assistant"}}}}});
std::time_t t = std::time(0);
json res =
json{{"choices", choices},
{"created", t},
{"model",
json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"object", streaming ? "chat.completion.chunk" : "chat.completion"},
{"usage",
json{{"completion_tokens", num_tokens_predicted},
{"prompt_tokens", num_prompt_tokens},
{"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
{"id", gen_chatcmplid()}};
if (server_verbose) {
res["__verbose"] = result;
}
if (result.contains("completion_probabilities")) {
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
}
return res;
}
// return value is vector as there is one case where we might need to generate two responses
inline static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
json result = response.result_json;
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
return std::vector<json>({response.result_json});
}
bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
bool stopped_word = json_value(result, "stopped_word", false);
bool stopped_eos = json_value(result, "stopped_eos", false);
bool stopped_limit = json_value(result, "stopped_limit", false);
std::string content = json_value(result, "content", std::string(""));
std::string finish_reason;
if (stopped_word || stopped_eos) {
finish_reason = "stop";
}
if (stopped_limit) {
finish_reason = "length";
}
std::time_t t = std::time(0);
json choices;
if (!finish_reason.empty()) {
choices = json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}}});
} else {
if (first) {
if (content.empty()) {
choices = json::array({json{{"finish_reason", nullptr},
{"index", 0},
{"delta", json{{"role", "assistant"}}}}});
} else {
// We have to send this as two updates to conform to openai behavior
json initial_ret = json{{"choices", json::array({json{
{"finish_reason", nullptr},
{"index", 0},
{"delta", json{
{"role", "assistant"}
}}}})},
{"created", t},
{"id", gen_chatcmplid()},
{"model", modelname},
{"object", "chat.completion.chunk"}};
json second_ret = json{
{"choices", json::array({json{{"finish_reason", nullptr},
{"index", 0},
{"delta", json{
{"content", content}}}
}})},
{"created", t},
{"id", gen_chatcmplid()},
{"model", modelname},
{"object", "chat.completion.chunk"}};
return std::vector<json>({initial_ret, second_ret});
}
} else {
// Some idiosyncrasy in task processing logic makes several trailing calls
// with empty content, we ignore these at the calee site.
if (content.empty()) {
return std::vector<json>({json::object()});
}
choices = json::array({json{
{"finish_reason", nullptr},
{"index", 0},
{"delta",
json{
{"content", content},
}},
}});
}
}
json ret = json{{"choices", choices},
{"created", t},
{"id", gen_chatcmplid()},
{"model", modelname},
{"object", "chat.completion.chunk"}};
return std::vector<json>({ret});
}
inline static json format_embeddings_response_oaicompat(const json &request, const json &embeddings)
{
json res =
json{
{"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"object", "list"},
{"usage",
json{{"prompt_tokens", 0},
{"total_tokens", 0}}},
{"data", embeddings}
};
return res;
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,94 @@
@llama.cpp
@embeddings
Feature: llama.cpp server
Background: Server startup
Given a server listening on localhost:8080
And a model file bert-bge-small/ggml-model-f16.gguf from HF repo ggml-org/models
And a model alias bert-bge-small
And 42 as server seed
And 2 slots
And 1024 as batch size
And 2048 KV cache size
And embeddings extraction
Then the server is starting
Then the server is healthy
Scenario: Embedding
When embeddings are computed for:
"""
What is the capital of Bulgaria ?
"""
Then embeddings are generated
Scenario: OAI Embeddings compatibility
Given a model bert-bge-small
When an OAI compatible embeddings computation request for:
"""
What is the capital of Spain ?
"""
Then embeddings are generated
Scenario: OAI Embeddings compatibility with multiple inputs
Given a model bert-bge-small
Given a prompt:
"""
In which country Paris is located ?
"""
And a prompt:
"""
Is Madrid the capital of Spain ?
"""
When an OAI compatible embeddings computation request for multiple inputs
Then embeddings are generated
Scenario: Multi users embeddings
Given a prompt:
"""
Write a very long story about AI.
"""
And a prompt:
"""
Write another very long music lyrics.
"""
And a prompt:
"""
Write a very long poem.
"""
And a prompt:
"""
Write a very long joke.
"""
Given concurrent embedding requests
Then the server is busy
Then the server is idle
Then all embeddings are generated
Scenario: Multi users OAI compatibility embeddings
Given a prompt:
"""
In which country Paris is located ?
"""
And a prompt:
"""
Is Madrid the capital of Spain ?
"""
And a prompt:
"""
What is the biggest US city ?
"""
And a prompt:
"""
What is the capital of Bulgaria ?
"""
And a model bert-bge-small
Given concurrent OAI embedding requests
Then the server is busy
Then the server is idle
Then all embeddings are generated
Scenario: All embeddings should be the same
Given 10 fixed prompts
And a model bert-bge-small
Given concurrent OAI embedding requests
Then all embeddings are the same

View file

@ -9,7 +9,6 @@ Feature: Parallel
And 512 as batch size And 512 as batch size
And 64 KV cache size And 64 KV cache size
And 2 slots And 2 slots
And embeddings extraction
And continuous batching And continuous batching
Then the server is starting Then the server is starting
Then the server is healthy Then the server is healthy
@ -99,48 +98,3 @@ Feature: Parallel
Then the server is busy Then the server is busy
Then the server is idle Then the server is idle
Then all prompts are predicted Then all prompts are predicted
Scenario: Multi users embeddings
Given a prompt:
"""
Write a very long story about AI.
"""
And a prompt:
"""
Write another very long music lyrics.
"""
And a prompt:
"""
Write a very long poem.
"""
And a prompt:
"""
Write a very long joke.
"""
Given concurrent embedding requests
Then the server is busy
Then the server is idle
Then all embeddings are generated
Scenario: Multi users OAI compatibility embeddings
Given a prompt:
"""
In which country Paris is located ?
"""
And a prompt:
"""
Is Madrid the capital of Spain ?
"""
And a prompt:
"""
What is the biggest US city ?
"""
And a prompt:
"""
What is the capital of Bulgaria ?
"""
And a model tinyllama-2
Given concurrent OAI embedding requests
Then the server is busy
Then the server is idle
Then all embeddings are generated

View file

@ -29,6 +29,7 @@ Feature: llama.cpp server
And a completion request with no api error And a completion request with no api error
Then <n_predicted> tokens are predicted matching <re_content> Then <n_predicted> tokens are predicted matching <re_content>
And prometheus metrics are exposed And prometheus metrics are exposed
And metric llamacpp:tokens_predicted is <n_predicted>
Examples: Prompts Examples: Prompts
| prompt | n_predict | re_content | n_predicted | | prompt | n_predict | re_content | n_predicted |
@ -49,34 +50,6 @@ Feature: llama.cpp server
| llama-2 | Book | What is the best book | 8 | (Mom\|what)+ | 8 | disabled | | llama-2 | Book | What is the best book | 8 | (Mom\|what)+ | 8 | disabled |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks\|happy\|bird)+ | 32 | enabled | | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks\|happy\|bird)+ | 32 | enabled |
Scenario: Embedding
When embeddings are computed for:
"""
What is the capital of Bulgaria ?
"""
Then embeddings are generated
Scenario: OAI Embeddings compatibility
Given a model tinyllama-2
When an OAI compatible embeddings computation request for:
"""
What is the capital of Spain ?
"""
Then embeddings are generated
Scenario: OAI Embeddings compatibility with multiple inputs
Given a model tinyllama-2
Given a prompt:
"""
In which country Paris is located ?
"""
And a prompt:
"""
Is Madrid the capital of Spain ?
"""
When an OAI compatible embeddings computation request for multiple inputs
Then embeddings are generated
Scenario: Tokenize / Detokenize Scenario: Tokenize / Detokenize
When tokenizing: When tokenizing:
""" """

View file

@ -10,6 +10,7 @@ from contextlib import closing
from re import RegexFlag from re import RegexFlag
import aiohttp import aiohttp
import numpy as np
import openai import openai
from behave import step from behave import step
from behave.api.async_step import async_run_until_complete from behave.api.async_step import async_run_until_complete
@ -24,6 +25,9 @@ def step_server_config(context, server_fqdn, server_port):
if 'PORT' in os.environ: if 'PORT' in os.environ:
context.server_port = int(os.environ['PORT']) context.server_port = int(os.environ['PORT'])
print(f"$PORT set, overriding server port with to {context.server_port}") print(f"$PORT set, overriding server port with to {context.server_port}")
if 'FQDN' in os.environ:
context.server_fqdn = os.environ['FQDN']
print(f"$FQDN set, overriding server fqdn with to {context.server_fqdn}")
context.base_url = f'http://{context.server_fqdn}:{context.server_port}' context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
@ -34,6 +38,7 @@ def step_server_config(context, server_fqdn, server_port):
context.n_ga_w = None context.n_ga_w = None
context.n_gpu_layer = None context.n_gpu_layer = None
context.n_predict = None context.n_predict = None
context.n_prompts = 0
context.n_server_predict = None context.n_server_predict = None
context.n_slots = None context.n_slots = None
context.prompt_prefix = None context.prompt_prefix = None
@ -202,6 +207,7 @@ def step_n_tokens_predicted(context, predicted_n):
@step(u'a user prompt {user_prompt}') @step(u'a user prompt {user_prompt}')
def step_user_prompt(context, user_prompt): def step_user_prompt(context, user_prompt):
context.prompts.append(user_prompt) context.prompts.append(user_prompt)
context.n_prompts = len(context.prompts)
@step(u'a system prompt {system_prompt}') @step(u'a system prompt {system_prompt}')
@ -290,6 +296,12 @@ def step_prompt_passkey(context):
context.prompt_passkey = context.text context.prompt_passkey = context.text
@step(u'{n_prompts:d} fixed prompts')
def step_fixed_prompts(context, n_prompts):
context.prompts.extend([str(0)*(context.n_batch if context.n_batch is not None else 512) for i in range(n_prompts)])
context.n_prompts = n_prompts
@step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk') @step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
def step_prompt_passkey(context, passkey, i_pos): def step_prompt_passkey(context, passkey, i_pos):
prompt = "" prompt = ""
@ -301,6 +313,7 @@ def step_prompt_passkey(context, passkey, i_pos):
passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m" passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n") print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n")
context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix) context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)
context.n_prompts = len(context.prompts)
@step(u'an OAI compatible chat completions request with {api_error} api error') @step(u'an OAI compatible chat completions request with {api_error} api error')
@ -341,11 +354,13 @@ async def step_oai_chat_completions(context, api_error):
@step(u'a prompt') @step(u'a prompt')
def step_a_prompt(context): def step_a_prompt(context):
context.prompts.append(context.text) context.prompts.append(context.text)
context.n_prompts = len(context.prompts)
@step(u'a prompt {prompt}') @step(u'a prompt {prompt}')
def step_a_prompt_prompt(context, prompt): def step_a_prompt_prompt(context, prompt):
context.prompts.append(prompt) context.prompts.append(prompt)
context.n_prompts = len(context.prompts)
@step(u'concurrent completion requests') @step(u'concurrent completion requests')
@ -430,25 +445,47 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None):
@step(u'embeddings are computed for') @step(u'embeddings are computed for')
@async_run_until_complete @async_run_until_complete
async def step_compute_embedding(context): async def step_compute_embedding(context):
context.n_prompts = 1
context.embeddings = await request_embedding(context.text, base_url=context.base_url) context.embeddings = await request_embedding(context.text, base_url=context.base_url)
@step(u'all embeddings are the same')
@async_run_until_complete
async def step_all_embeddings_are_the_same(context):
n_embedding_requests = await gather_tasks_results(context)
assert n_embedding_requests > 0
embeddings = []
for i in range(n_embedding_requests):
embedding = context.tasks_result.pop().pop()
embeddings.append(embedding)
assert_embeddings(embedding)
n = len(embeddings)
for i in range(n-1):
for j in range(i+1, n):
embedding1 = np.array(embeddings[i])
embedding2 = np.array(embeddings[j])
if context.debug:
print(f"embedding1: {embedding1[-8:]}\n")
print(f"embedding2: {embedding2[-8:]}\n")
similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
msg = f"Similarity between {i} and {j}: {similarity:.10f}"
if context.debug:
print(f"{msg}\n")
assert np.isclose(similarity, 1.0, rtol=1e-05, atol=1e-08, equal_nan=False), msg
@step(u'embeddings are generated') @step(u'embeddings are generated')
def step_assert_embeddings(context): def step_assert_embeddings(context):
if len(context.prompts) == 0: assert context.n_prompts == len(context.embeddings), (f"unexpected response:\n"
assert_embeddings(context.embeddings) f"context.n_prompts={context.n_prompts}\n"
else: f"context.embeddings={context.embeddings}")
assert len(context.embeddings) == len(context.prompts), (f"unexpected response:\n" for embedding in context.embeddings:
f"context.prompts={context.prompts}\n" assert_embeddings(embedding)
f"context.embeddings={context.embeddings}")
for embedding in context.embeddings:
context.prompts.pop()
assert_embeddings(embedding)
@step(u'an OAI compatible embeddings computation request for') @step(u'an OAI compatible embeddings computation request for')
@async_run_until_complete @async_run_until_complete
async def step_oai_compute_embeddings(context): async def step_oai_compute_embeddings(context):
context.n_prompts = 1
context.embeddings = await request_oai_embeddings(context.text, context.embeddings = await request_oai_embeddings(context.text,
base_url=context.base_url, base_url=context.base_url,
user_api_key=context.user_api_key, user_api_key=context.user_api_key,
@ -462,6 +499,7 @@ async def step_oai_compute_embeddings_multiple_inputs(context):
base_url=context.base_url, base_url=context.base_url,
user_api_key=context.user_api_key, user_api_key=context.user_api_key,
model=context.model) model=context.model)
context.prompts.clear()
@step(u'concurrent embedding requests') @step(u'concurrent embedding requests')
@ -488,9 +526,9 @@ async def step_concurrent_oai_embedding_requests(context):
@async_run_until_complete() @async_run_until_complete()
async def all_embeddings_are_generated(context): async def all_embeddings_are_generated(context):
n_embedding_requests = await gather_tasks_results(context) n_embedding_requests = await gather_tasks_results(context)
assert n_embedding_requests > 0 assert n_embedding_requests == context.n_prompts
for i in range(n_embedding_requests): for i in range(n_embedding_requests):
assert_embeddings(context.tasks_result.pop()) assert_embeddings(context.tasks_result.pop().pop())
@step(u'tokenizing') @step(u'tokenizing')
@ -548,14 +586,24 @@ async def step_prometheus_metrics_exported(context):
metric_exported = False metric_exported = False
if context.debug: if context.debug:
print(f"/metrics answer:\n{metrics_raw}\n") print(f"/metrics answer:\n{metrics_raw}\n")
context.metrics = {}
for metric in parser.text_string_to_metric_families(metrics_raw): for metric in parser.text_string_to_metric_families(metrics_raw):
match metric.name: match metric.name:
case "llamacpp:kv_cache_usage_ratio": case "llamacpp:kv_cache_usage_ratio":
assert len(metric.samples) > 0 assert len(metric.samples) > 0
metric_exported = True metric_exported = True
context.metrics[metric.name] = metric
assert int(metrics_response.headers["Process-Start-Time-Unix"]) > 0, "no header process start time"
assert metric_exported, "No metrics exported" assert metric_exported, "No metrics exported"
@step(u'metric {metric_name} is {metric_value:d}')
def step_assert_metric_value(context, metric_name, metric_value):
if metric_name not in context.metrics:
assert False, f"no metric {metric_name} in {context.metrics.keys()}"
assert context.metrics[metric_name].samples[0].value == metric_value, f"metric: {context.metrics[metric_name]}"
@step(u'available models') @step(u'available models')
def step_available_models(context): def step_available_models(context):
# openai client always expects an api_key # openai client always expects an api_key
@ -588,11 +636,11 @@ def step_supported_models(context, i_model, param, preposition, param_value):
async def concurrent_requests(context, f_completion, *args, **kwargs): async def concurrent_requests(context, f_completion, *args, **kwargs):
n_prompts = len(context.prompts) context.n_prompts = len(context.prompts)
if context.debug: if context.debug:
print(f"starting {n_prompts} concurrent completion requests...") print(f"starting {context.n_prompts} concurrent completion requests...")
assert n_prompts > 0 assert context.n_prompts > 0
for prompt_no in range(n_prompts): for prompt_no in range(context.n_prompts):
shifted_args = [context.prompts.pop(), *args] shifted_args = [context.prompts.pop(), *args]
context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs))) context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
await asyncio.sleep(0.1) await asyncio.sleep(0.1)
@ -765,7 +813,7 @@ async def request_embedding(content, base_url=None):
}) as response: }) as response:
assert response.status == 200 assert response.status == 200
response_json = await response.json() response_json = await response.json()
return response_json['embedding'] return [response_json['embedding']]
async def request_oai_embeddings(input, async def request_oai_embeddings(input,
@ -775,6 +823,7 @@ async def request_oai_embeddings(input,
user_api_key = user_api_key if user_api_key is not None else 'nope' user_api_key = user_api_key if user_api_key is not None else 'nope'
if async_client: if async_client:
origin = 'llama.cpp' origin = 'llama.cpp'
headers=[]
if user_api_key is not None: if user_api_key is not None:
headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin} headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession() as session:
@ -783,14 +832,21 @@ async def request_oai_embeddings(input,
"input": input, "input": input,
"model": model, "model": model,
}, },
headers=headers) as response: headers=headers,
timeout=3600) as response:
assert response.status == 200, f"received status code not expected: {response.status}" assert response.status == 200, f"received status code not expected: {response.status}"
assert response.headers['Access-Control-Allow-Origin'] == origin assert response.headers['Access-Control-Allow-Origin'] == origin
assert response.headers['Content-Type'] == "application/json; charset=utf-8" assert response.headers['Content-Type'] == "application/json; charset=utf-8"
response_json = await response.json() response_json = await response.json()
assert response_json['model'] == model, f"invalid model received: {response_json['model']}" assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
assert response_json['object'] == 'list' assert response_json['object'] == 'list'
return response_json['data'] if isinstance(input, collections.abc.Sequence):
embeddings = []
for an_oai_embeddings in response_json['data']:
embeddings.append(an_oai_embeddings['embedding'])
else:
embeddings = [response_json['data']['embedding']]
return embeddings
else: else:
openai.api_key = user_api_key openai.api_key = user_api_key
openai.api_base = f'{base_url}/v1' openai.api_base = f'{base_url}/v1'
@ -804,7 +860,7 @@ async def request_oai_embeddings(input,
for an_oai_embeddings in oai_embeddings.data: for an_oai_embeddings in oai_embeddings.data:
embeddings.append(an_oai_embeddings.embedding) embeddings.append(an_oai_embeddings.embedding)
else: else:
embeddings = oai_embeddings.data.embedding embeddings = [oai_embeddings.data.embedding]
return embeddings return embeddings
@ -833,7 +889,6 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
f' {n_predicted} <> {expected_predicted_n}') f' {n_predicted} <> {expected_predicted_n}')
async def gather_tasks_results(context): async def gather_tasks_results(context):
n_tasks = len(context.concurrent_tasks) n_tasks = len(context.concurrent_tasks)
if context.debug: if context.debug:
@ -899,6 +954,8 @@ def assert_embeddings(embeddings):
assert len(embeddings) > 0 assert len(embeddings) > 0
embeddings_computed = False embeddings_computed = False
for emb in embeddings: for emb in embeddings:
if not isinstance(emb, float):
assert False, f"Bad embeddings: {embeddings}"
if emb != 0: if emb != 0:
embeddings_computed = True embeddings_computed = True
assert embeddings_computed, f"Embeddings: {embeddings}" assert embeddings_computed, f"Embeddings: {embeddings}"

View file

@ -1,5 +1,6 @@
aiohttp~=3.9.3 aiohttp~=3.9.3
behave~=1.2.6 behave~=1.2.6
huggingface_hub~=0.20.3 huggingface_hub~=0.20.3
numpy~=1.24.4
openai~=0.25.0 openai~=0.25.0
prometheus-client~=0.20.0 prometheus-client~=0.20.0

View file

@ -1,15 +1,16 @@
#pragma once #pragma once
#include <string> #include "llama.h"
#include <vector> #include "common.h"
#include <set>
#include <mutex>
#include <condition_variable>
#include <unordered_map>
#include "json.hpp" #include "json.hpp"
#include "../llava/clip.h" #include <string>
#include <vector>
#include <sstream>
#include <random>
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
using json = nlohmann::json; using json = nlohmann::json;
@ -101,6 +102,7 @@ LogRedirection log_settings; // TODO: avoid global declaration
// parallel // parallel
// //
<<<<<<< HEAD
enum server_state { enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
SERVER_STATE_READY, // Server is ready and model is loaded SERVER_STATE_READY, // Server is ready and model is loaded
@ -156,6 +158,15 @@ struct token_translator {
std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); } std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); } std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
}; };
=======
template <typename T>
static T json_value(const json &body, const std::string &key, const T &default_value) {
// Fallback null to default value
return body.contains(key) && !body.at(key).is_null()
? body.value(key, default_value)
: default_value;
}
>>>>>>> origin/master
static inline void server_log( static inline void server_log(
const char *level, const char *level,
@ -174,7 +185,7 @@ static inline void server_log(
std::stringstream ss_tid; std::stringstream ss_tid;
ss_tid << std::this_thread::get_id(); ss_tid << std::this_thread::get_id();
json log = nlohmann::ordered_json{ json log = nlohmann::ordered_json{
{"tid", ss_tid.str()}, {"tid", ss_tid.str()},
{"timestamp", time(nullptr)}, {"timestamp", time(nullptr)},
}; };
@ -201,19 +212,24 @@ static inline void server_log(
//freopen(stderr_target.c_str(), "a", stderr); // we assign stderr to dev/null effectively 'blackholing' the output because log.dump below is redirected too //freopen(stderr_target.c_str(), "a", stderr); // we assign stderr to dev/null effectively 'blackholing' the output because log.dump below is redirected too
if (server_log_json) { if (server_log_json) {
log.merge_patch( log.merge_patch( {
{ {"level", level},
{"level", level}, {"function", function},
{"function", function}, {"line", line},
{"line", line}, {"msg", message},
{"msg", message}, });
});
if (!extra.empty()) { if (!extra.empty()) {
log.merge_patch(extra); log.merge_patch(extra);
} }
std::cerr << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush; // was originally std:cout std::cerr << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush; // was originally std:cout
<<<<<<< HEAD
} else { // store the logs in (because not json) text format } else { // store the logs in (because not json) text format
=======
printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str());
} else {
>>>>>>> origin/master
char buf[1024]; char buf[1024];
snprintf(buf, 1024, "\033[85;0H%4s [%24s] %s", level, function, message); snprintf(buf, 1024, "\033[85;0H%4s [%24s] %s", level, function, message);
@ -251,22 +267,13 @@ static inline void server_log(
} }
// //
// server utils // chat template utils
// //
template <typename T>
static T json_value(const json &body, const std::string &key, const T &default_value) {
// Fallback null to default value
return body.contains(key) && !body.at(key).is_null()
? body.value(key, default_value)
: default_value;
}
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
inline bool verify_custom_template(const std::string & tmpl) { inline bool verify_custom_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}}; llama_chat_message chat[] = {{"user", "test"}};
std::vector<char> buf(1); int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size());
return res >= 0; return res >= 0;
} }
@ -278,7 +285,7 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
std::vector<llama_chat_message> chat(messages.size()); std::vector<llama_chat_message> chat(messages.size());
for (size_t i = 0; i < messages.size(); ++i) { for (size_t i = 0; i < messages.size(); ++i) {
auto &curr_msg = messages[i]; const auto & curr_msg = messages[i];
str[i*2 + 0] = json_value(curr_msg, "role", std::string("")); str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
str[i*2 + 1] = json_value(curr_msg, "content", std::string("")); str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
alloc_size += str[i*2 + 1].length(); alloc_size += str[i*2 + 1].length();
@ -298,13 +305,20 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size()); res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
} }
<<<<<<< HEAD
std::string formatted_chat(buf.data(), res); std::string formatted_chat(buf.data(), res);
LOG_VERBOSE("formatted_chat", {"text", formatted_chat.c_str()}); LOG_VERBOSE("formatted_chat", {"text", formatted_chat.c_str()});
=======
const std::string formatted_chat(buf.data(), res);
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
>>>>>>> origin/master
return formatted_chat; return formatted_chat;
} }
// //
<<<<<<< HEAD
// work queue utils // work queue utils
// //
@ -562,6 +576,8 @@ struct llama_server_response {
}; };
// //
=======
>>>>>>> origin/master
// base64 utils (TODO: move to common in the future) // base64 utils (TODO: move to common in the future)
// //
@ -570,13 +586,11 @@ static const std::string base64_chars =
"abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz"
"0123456789+/"; "0123456789+/";
static inline bool is_base64(uint8_t c) static inline bool is_base64(uint8_t c) {
{
return (isalnum(c) || (c == '+') || (c == '/')); return (isalnum(c) || (c == '+') || (c == '/'));
} }
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string) static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string) {
{
int i = 0; int i = 0;
int j = 0; int j = 0;
int in_ = 0; int in_ = 0;
@ -588,13 +602,10 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
std::vector<uint8_t> ret; std::vector<uint8_t> ret;
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
{
char_array_4[i++] = encoded_string[in_]; in_++; char_array_4[i++] = encoded_string[in_]; in_++;
if (i == 4) if (i == 4) {
{ for (i = 0; i < 4; i++) {
for (i = 0; i <4; i++)
{
char_array_4[i] = base64_chars.find(char_array_4[i]); char_array_4[i] = base64_chars.find(char_array_4[i]);
} }
@ -602,23 +613,20 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (i = 0; (i < 3); i++) for (i = 0; (i < 3); i++) {
{
ret.push_back(char_array_3[i]); ret.push_back(char_array_3[i]);
} }
i = 0; i = 0;
} }
} }
if (i) if (i) {
{ for (j = i; j < 4; j++) {
for (j = i; j <4; j++)
{
char_array_4[j] = 0; char_array_4[j] = 0;
} }
for (j = 0; j <4; j++) for (j = 0; j < 4; j++) {
{
char_array_4[j] = base64_chars.find(char_array_4[j]); char_array_4[j] = base64_chars.find(char_array_4[j]);
} }
@ -626,8 +634,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (j = 0; (j < i - 1); j++) for (j = 0; j < i - 1; j++) {
{
ret.push_back(char_array_3[j]); ret.push_back(char_array_3[j]);
} }
} }
@ -639,8 +646,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
// random string / id // random string / id
// //
static std::string random_string() static std::string random_string() {
{
static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
std::random_device rd; std::random_device rd;
@ -655,10 +661,10 @@ static std::string random_string()
return result; return result;
} }
static std::string gen_chatcmplid() static std::string gen_chatcmplid() {
{
std::stringstream chatcmplid; std::stringstream chatcmplid;
chatcmplid << "chatcmpl-" << random_string(); chatcmplid << "chatcmpl-" << random_string();
return chatcmplid.str(); return chatcmplid.str();
} }
@ -666,91 +672,316 @@ static std::string gen_chatcmplid()
// other common utils // other common utils
// //
static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b) static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
{
size_t i; size_t i;
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
{
}
return i; return i;
} }
static bool ends_with(const std::string &str, const std::string &suffix) static bool ends_with(const std::string & str, const std::string & suffix) {
{ return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
return str.size() >= suffix.size() &&
0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
} }
static size_t find_partial_stop_string(const std::string &stop, static size_t find_partial_stop_string(const std::string &stop, const std::string &text) {
const std::string &text) if (!text.empty() && !stop.empty()) {
{
if (!text.empty() && !stop.empty())
{
const char text_last_char = text.back(); const char text_last_char = text.back();
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
{ if (stop[char_index] == text_last_char) {
if (stop[char_index] == text_last_char)
{
const std::string current_partial = stop.substr(0, char_index + 1); const std::string current_partial = stop.substr(0, char_index + 1);
if (ends_with(text, current_partial)) if (ends_with(text, current_partial)) {
{
return text.size() - char_index - 1; return text.size() - char_index - 1;
} }
} }
} }
} }
return std::string::npos; return std::string::npos;
} }
// TODO: reuse llama_detokenize // TODO: reuse llama_detokenize
template <class Iter> template <class Iter>
static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
{
std::string ret; std::string ret;
for (; begin != end; ++begin) for (; begin != end; ++begin) {
{
ret += llama_token_to_piece(ctx, *begin); ret += llama_token_to_piece(ctx, *begin);
} }
return ret; return ret;
} }
// format incomplete utf-8 multibyte character for output // format incomplete utf-8 multibyte character for output
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
{
std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
// if the size is 1 and first bit is 1, meaning it's a partial character // if the size is 1 and first bit is 1, meaning it's a partial character
// (size > 1 meaning it's already a known token) // (size > 1 meaning it's already a known token)
if (out.size() == 1 && (out[0] & 0x80) == 0x80) if (out.size() == 1 && (out[0] & 0x80) == 0x80) {
{
std::stringstream ss; std::stringstream ss;
ss << std::hex << (out[0] & 0xff); ss << std::hex << (out[0] & 0xff);
std::string res(ss.str()); std::string res(ss.str());
out = "byte: \\x" + res; out = "byte: \\x" + res;
} }
return out; return out;
} }
struct completion_token_output {
llama_token tok;
std::string text_to_send;
struct token_prob {
llama_token tok;
float prob;
};
std::vector<token_prob> probs;
};
// convert a vector of completion_token_output to json // convert a vector of completion_token_output to json
static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs) static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
{
json out = json::array(); json out = json::array();
for (const auto &prob : probs)
{ for (const auto & prob : probs) {
json probs_for_token = json::array(); json probs_for_token = json::array();
for (const auto &p : prob.probs)
{ for (const auto & p : prob.probs) {
std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
probs_for_token.push_back(json probs_for_token.push_back(json {
{
{"tok_str", tok_str}, {"tok_str", tok_str},
{"prob", p.prob}, {"prob", p.prob},
}); });
} }
std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
out.push_back(json{ const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
out.push_back(json {
{"content", tok_str}, {"content", tok_str},
{"probs", probs_for_token}, {"probs", probs_for_token},
}); });
} }
return out; return out;
} }
//
// OAI utils
//
static json oaicompat_completion_params_parse(
const struct llama_model * model,
const json & body, /* openai api json semantics */
const std::string & chat_template) {
json llama_params;
llama_params["__oaicompat"] = true;
// Map OpenAI parameters to llama.cpp parameters
//
// For parameters that are defined by the OpenAI documentation (e.g.
// temperature), we explicitly specify OpenAI's intended default; we
// need to do that because sometimes OpenAI disagrees with llama.cpp
//
// https://platform.openai.com/docs/api-reference/chat/create
llama_sampling_params default_sparams;
llama_params["model"] = json_value(body, "model", std::string("unknown"));
llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
llama_params["temperature"] = json_value(body, "temperature", 0.0);
llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
llama_params["top_p"] = json_value(body, "top_p", 1.0);
llama_params["n_predict"] = json_value(body, "max_tokens", -1);
llama_params["logit_bias"] = json_value(body, "logit_bias", json::object());
llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
llama_params["stream"] = json_value(body, "stream", false);
llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat);
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl);
llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p);
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z);
if (body.count("grammar") != 0) {
llama_params["grammar"] = json_value(body, "grammar", json::object());
}
// Handle 'stop' field
if (body.contains("stop") && body["stop"].is_string()) {
llama_params["stop"] = json::array({body["stop"].get<std::string>()});
} else {
llama_params["stop"] = json_value(body, "stop", json::array());
}
// Ensure there is ChatML-specific end sequence among stop words
llama_params["stop"].push_back("<|im_end|>");
return llama_params;
}
static json format_final_response_oaicompat(const json & request, json result, bool streaming = false) {
bool stopped_word = result.count("stopped_word") != 0;
bool stopped_eos = json_value(result, "stopped_eos", false);
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
std::string content = json_value(result, "content", std::string(""));
std::string finish_reason = "length";
if (stopped_word || stopped_eos) {
finish_reason = "stop";
}
json choices =
streaming ? json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}}})
: json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"message", json{{"content", content},
{"role", "assistant"}}}}});
std::time_t t = std::time(0);
json res = json {
{"choices", choices},
{"created", t},
{"model",
json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"object", streaming ? "chat.completion.chunk" : "chat.completion"},
{"usage", json {
{"completion_tokens", num_tokens_predicted},
{"prompt_tokens", num_prompt_tokens},
{"total_tokens", num_tokens_predicted + num_prompt_tokens}
}},
{"id", gen_chatcmplid()}
};
if (server_verbose) {
res["__verbose"] = result;
}
if (result.contains("completion_probabilities")) {
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
}
return res;
}
// return value is vector as there is one case where we might need to generate two responses
static std::vector<json> format_partial_response_oaicompat(json result) {
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
return std::vector<json>({result});
}
bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
bool stopped_word = json_value(result, "stopped_word", false);
bool stopped_eos = json_value(result, "stopped_eos", false);
bool stopped_limit = json_value(result, "stopped_limit", false);
std::string content = json_value(result, "content", std::string(""));
std::string finish_reason;
if (stopped_word || stopped_eos) {
finish_reason = "stop";
}
if (stopped_limit) {
finish_reason = "length";
}
std::time_t t = std::time(0);
json choices;
if (!finish_reason.empty()) {
choices = json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}}});
} else {
if (first) {
if (content.empty()) {
choices = json::array({json{{"finish_reason", nullptr},
{"index", 0},
{"delta", json{{"role", "assistant"}}}}});
} else {
// We have to send this as two updates to conform to openai behavior
json initial_ret = json{{"choices", json::array({json{
{"finish_reason", nullptr},
{"index", 0},
{"delta", json{
{"role", "assistant"}
}}}})},
{"created", t},
{"id", gen_chatcmplid()},
{"model", modelname},
{"object", "chat.completion.chunk"}};
json second_ret = json{
{"choices", json::array({json{{"finish_reason", nullptr},
{"index", 0},
{"delta", json{
{"content", content}}}
}})},
{"created", t},
{"id", gen_chatcmplid()},
{"model", modelname},
{"object", "chat.completion.chunk"}};
return std::vector<json>({initial_ret, second_ret});
}
} else {
// Some idiosyncrasy in task processing logic makes several trailing calls
// with empty content, we ignore these at the calee site.
if (content.empty()) {
return std::vector<json>({json::object()});
}
choices = json::array({json{
{"finish_reason", nullptr},
{"index", 0},
{"delta",
json{
{"content", content},
}},
}});
}
}
json ret = json {
{"choices", choices},
{"created", t},
{"id", gen_chatcmplid()},
{"model", modelname},
{"object", "chat.completion.chunk"}
};
return std::vector<json>({ret});
}
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
json res = json {
{"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"object", "list"},
{"usage", json {
{"prompt_tokens", 0},
{"total_tokens", 0}
}},
{"data", embeddings}
};
return res;
}
static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
return json {
{"tokens", tokens}
};
}
static json format_detokenized_response(const std::string & content) {
return json {
{"content", content}
};
}

5
ggml.c
View file

@ -2154,7 +2154,10 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node); getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
#else #else
// old glibc doesn't have a wrapper for this call. Fall back on direct syscall // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
getcpu_ret = syscall(SYS_getcpu,&current_cpu,&g_state.numa.current_node); # if !defined(SYS_getcpu) && defined(SYS_get_cpu)
# define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
# endif
getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state.numa.current_node);
#endif #endif
if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) { if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {

View file

@ -3888,7 +3888,13 @@ static bool llm_load_tensors(
{ {
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
if (model.arch != LLM_ARCH_MINICPM){ if (model.arch != LLM_ARCH_MINICPM){
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
// if output is NULL, init from the input tok embed
if (model.output == NULL) {
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
ml.n_created--; // artificial tensor
ml.size_data += ggml_nbytes(model.output);
}
} }
} }
@ -13533,18 +13539,22 @@ LLAMA_API int32_t llama_chat_apply_template(
curr_tmpl = std::string(model_template.data(), model_template.size()); curr_tmpl = std::string(model_template.data(), model_template.size());
} }
} }
// format the chat to string // format the chat to string
std::vector<const llama_chat_message *> chat_vec; std::vector<const llama_chat_message *> chat_vec;
chat_vec.resize(n_msg); chat_vec.resize(n_msg);
for (size_t i = 0; i < n_msg; i++) { for (size_t i = 0; i < n_msg; i++) {
chat_vec[i] = &chat[i]; chat_vec[i] = &chat[i];
} }
std::string formatted_chat; std::string formatted_chat;
int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass); int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
if (res < 0) { if (res < 0) {
return res; return res;
} }
strncpy(buf, formatted_chat.c_str(), length); if (buf && length > 0) {
strncpy(buf, formatted_chat.c_str(), length);
}
return res; return res;
} }