Merge branch 'master' into support-mamba-ssm
This commit is contained in:
commit
916b586386
21 changed files with 2420 additions and 2811 deletions
3
.github/workflows/server.yml
vendored
3
.github/workflows/server.yml
vendored
|
@ -58,7 +58,8 @@ jobs:
|
||||||
cmake \
|
cmake \
|
||||||
python3-pip \
|
python3-pip \
|
||||||
wget \
|
wget \
|
||||||
psmisc
|
psmisc \
|
||||||
|
language-pack-en
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
|
|
5
Makefile
5
Makefile
|
@ -724,10 +724,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
|
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
||||||
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
|
||||||
|
|
||||||
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
|
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
|
|
@ -45,7 +45,8 @@ fi
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
||||||
if [ -z ${ONEAPI_ROOT} ]; then
|
if [ -z ${ONEAPI_ROOT} ]; then
|
||||||
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
|
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
|
||||||
|
echo "source /opt/intel/oneapi/setvars.sh"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
13
convert.py
13
convert.py
|
@ -1377,7 +1377,6 @@ def main(args_in: list[str] | None = None) -> None:
|
||||||
# We currently only support Q8_0 output on little endian systems.
|
# We currently only support Q8_0 output on little endian systems.
|
||||||
output_choices.append("q8_0")
|
output_choices.append("q8_0")
|
||||||
parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
|
parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
|
||||||
parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
|
|
||||||
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
|
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
|
||||||
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
|
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
|
||||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||||
|
@ -1393,18 +1392,6 @@ def main(args_in: list[str] | None = None) -> None:
|
||||||
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
|
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
|
||||||
|
|
||||||
args = parser.parse_args(args_in)
|
args = parser.parse_args(args_in)
|
||||||
if args.awq_path:
|
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
|
|
||||||
from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
|
|
||||||
tmp_model_path = args.model / "weighted_model"
|
|
||||||
if tmp_model_path.is_dir():
|
|
||||||
print(f"{tmp_model_path} exists as a weighted model.")
|
|
||||||
else:
|
|
||||||
tmp_model_path.mkdir(parents=True, exist_ok=True)
|
|
||||||
print("Saving new weighted model ...")
|
|
||||||
add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
|
|
||||||
print(f"Saved weighted model at {tmp_model_path}.")
|
|
||||||
args.model = tmp_model_path
|
|
||||||
|
|
||||||
if args.dump_single:
|
if args.dump_single:
|
||||||
model_plus = lazy_load_file(args.model)
|
model_plus = lazy_load_file(args.model)
|
||||||
|
|
|
@ -173,6 +173,7 @@ struct cmd_params {
|
||||||
std::vector<bool> no_kv_offload;
|
std::vector<bool> no_kv_offload;
|
||||||
std::vector<std::vector<float>> tensor_split;
|
std::vector<std::vector<float>> tensor_split;
|
||||||
std::vector<bool> use_mmap;
|
std::vector<bool> use_mmap;
|
||||||
|
std::vector<bool> embeddings;
|
||||||
int reps;
|
int reps;
|
||||||
bool verbose;
|
bool verbose;
|
||||||
output_formats output_format;
|
output_formats output_format;
|
||||||
|
@ -192,6 +193,7 @@ static const cmd_params cmd_params_defaults = {
|
||||||
/* no_kv_offload */ {false},
|
/* no_kv_offload */ {false},
|
||||||
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
|
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
|
||||||
/* use_mmap */ {true},
|
/* use_mmap */ {true},
|
||||||
|
/* embeddings */ {false},
|
||||||
/* reps */ 5,
|
/* reps */ 5,
|
||||||
/* verbose */ false,
|
/* verbose */ false,
|
||||||
/* output_format */ MARKDOWN
|
/* output_format */ MARKDOWN
|
||||||
|
@ -214,6 +216,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||||
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||||
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
||||||
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
|
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
|
||||||
|
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
|
||||||
printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
|
printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
|
||||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||||
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
|
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
|
||||||
|
@ -382,6 +385,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
auto p = split<bool>(argv[i], split_delim);
|
auto p = split<bool>(argv[i], split_delim);
|
||||||
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
|
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
|
||||||
|
} else if (arg == "-embd" || arg == "--embeddings") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
auto p = split<bool>(argv[i], split_delim);
|
||||||
|
params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
|
||||||
} else if (arg == "-ts" || arg == "--tensor-split") {
|
} else if (arg == "-ts" || arg == "--tensor-split") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -453,6 +463,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
|
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
|
||||||
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
|
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
|
||||||
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
|
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
|
||||||
|
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
|
||||||
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
||||||
|
|
||||||
return params;
|
return params;
|
||||||
|
@ -472,6 +483,7 @@ struct cmd_params_instance {
|
||||||
bool no_kv_offload;
|
bool no_kv_offload;
|
||||||
std::vector<float> tensor_split;
|
std::vector<float> tensor_split;
|
||||||
bool use_mmap;
|
bool use_mmap;
|
||||||
|
bool embeddings;
|
||||||
|
|
||||||
llama_model_params to_llama_mparams() const {
|
llama_model_params to_llama_mparams() const {
|
||||||
llama_model_params mparams = llama_model_default_params();
|
llama_model_params mparams = llama_model_default_params();
|
||||||
|
@ -502,6 +514,7 @@ struct cmd_params_instance {
|
||||||
cparams.type_k = type_k;
|
cparams.type_k = type_k;
|
||||||
cparams.type_v = type_v;
|
cparams.type_v = type_v;
|
||||||
cparams.offload_kqv = !no_kv_offload;
|
cparams.offload_kqv = !no_kv_offload;
|
||||||
|
cparams.embeddings = embeddings;
|
||||||
|
|
||||||
return cparams;
|
return cparams;
|
||||||
}
|
}
|
||||||
|
@ -517,6 +530,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||||
for (const auto & mg : params.main_gpu)
|
for (const auto & mg : params.main_gpu)
|
||||||
for (const auto & ts : params.tensor_split)
|
for (const auto & ts : params.tensor_split)
|
||||||
for (const auto & mmp : params.use_mmap)
|
for (const auto & mmp : params.use_mmap)
|
||||||
|
for (const auto & embd : params.embeddings)
|
||||||
for (const auto & nb : params.n_batch)
|
for (const auto & nb : params.n_batch)
|
||||||
for (const auto & tk : params.type_k)
|
for (const auto & tk : params.type_k)
|
||||||
for (const auto & tv : params.type_v)
|
for (const auto & tv : params.type_v)
|
||||||
|
@ -540,6 +554,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||||
/* .no_kv_offload= */ nkvo,
|
/* .no_kv_offload= */ nkvo,
|
||||||
/* .tensor_split = */ ts,
|
/* .tensor_split = */ ts,
|
||||||
/* .use_mmap = */ mmp,
|
/* .use_mmap = */ mmp,
|
||||||
|
/* .embeddings = */ embd,
|
||||||
};
|
};
|
||||||
instances.push_back(instance);
|
instances.push_back(instance);
|
||||||
}
|
}
|
||||||
|
@ -562,6 +577,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||||
/* .no_kv_offload= */ nkvo,
|
/* .no_kv_offload= */ nkvo,
|
||||||
/* .tensor_split = */ ts,
|
/* .tensor_split = */ ts,
|
||||||
/* .use_mmap = */ mmp,
|
/* .use_mmap = */ mmp,
|
||||||
|
/* .embeddings = */ embd,
|
||||||
};
|
};
|
||||||
instances.push_back(instance);
|
instances.push_back(instance);
|
||||||
}
|
}
|
||||||
|
@ -597,6 +613,7 @@ struct test {
|
||||||
bool no_kv_offload;
|
bool no_kv_offload;
|
||||||
std::vector<float> tensor_split;
|
std::vector<float> tensor_split;
|
||||||
bool use_mmap;
|
bool use_mmap;
|
||||||
|
bool embeddings;
|
||||||
int n_prompt;
|
int n_prompt;
|
||||||
int n_gen;
|
int n_gen;
|
||||||
std::string test_time;
|
std::string test_time;
|
||||||
|
@ -619,6 +636,7 @@ struct test {
|
||||||
no_kv_offload = inst.no_kv_offload;
|
no_kv_offload = inst.no_kv_offload;
|
||||||
tensor_split = inst.tensor_split;
|
tensor_split = inst.tensor_split;
|
||||||
use_mmap = inst.use_mmap;
|
use_mmap = inst.use_mmap;
|
||||||
|
embeddings = inst.embeddings;
|
||||||
n_prompt = inst.n_prompt;
|
n_prompt = inst.n_prompt;
|
||||||
n_gen = inst.n_gen;
|
n_gen = inst.n_gen;
|
||||||
// RFC 3339 date-time format
|
// RFC 3339 date-time format
|
||||||
|
@ -690,7 +708,7 @@ struct test {
|
||||||
"n_batch", "n_threads", "type_k", "type_v",
|
"n_batch", "n_threads", "type_k", "type_v",
|
||||||
"n_gpu_layers", "split_mode",
|
"n_gpu_layers", "split_mode",
|
||||||
"main_gpu", "no_kv_offload",
|
"main_gpu", "no_kv_offload",
|
||||||
"tensor_split", "use_mmap",
|
"tensor_split", "use_mmap", "embeddings",
|
||||||
"n_prompt", "n_gen", "test_time",
|
"n_prompt", "n_gen", "test_time",
|
||||||
"avg_ns", "stddev_ns",
|
"avg_ns", "stddev_ns",
|
||||||
"avg_ts", "stddev_ts"
|
"avg_ts", "stddev_ts"
|
||||||
|
@ -710,7 +728,7 @@ struct test {
|
||||||
}
|
}
|
||||||
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
||||||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
|
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
|
||||||
field == "use_mmap") {
|
field == "use_mmap" || field == "embeddings") {
|
||||||
return BOOL;
|
return BOOL;
|
||||||
}
|
}
|
||||||
if (field == "avg_ts" || field == "stddev_ts") {
|
if (field == "avg_ts" || field == "stddev_ts") {
|
||||||
|
@ -744,7 +762,7 @@ struct test {
|
||||||
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
||||||
std::to_string(n_gpu_layers), split_mode_str(split_mode),
|
std::to_string(n_gpu_layers), split_mode_str(split_mode),
|
||||||
std::to_string(main_gpu), std::to_string(no_kv_offload),
|
std::to_string(main_gpu), std::to_string(no_kv_offload),
|
||||||
tensor_split_str, std::to_string(use_mmap),
|
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
|
||||||
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
||||||
std::to_string(avg_ns()), std::to_string(stdev_ns()),
|
std::to_string(avg_ns()), std::to_string(stdev_ns()),
|
||||||
std::to_string(avg_ts()), std::to_string(stdev_ts())
|
std::to_string(avg_ts()), std::to_string(stdev_ts())
|
||||||
|
@ -914,6 +932,9 @@ struct markdown_printer : public printer {
|
||||||
if (field == "use_mmap") {
|
if (field == "use_mmap") {
|
||||||
return "mmap";
|
return "mmap";
|
||||||
}
|
}
|
||||||
|
if (field == "embeddings") {
|
||||||
|
return "embd";
|
||||||
|
}
|
||||||
if (field == "tensor_split") {
|
if (field == "tensor_split") {
|
||||||
return "ts";
|
return "ts";
|
||||||
}
|
}
|
||||||
|
@ -957,6 +978,9 @@ struct markdown_printer : public printer {
|
||||||
if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
|
if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
|
||||||
fields.emplace_back("use_mmap");
|
fields.emplace_back("use_mmap");
|
||||||
}
|
}
|
||||||
|
if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
|
||||||
|
fields.emplace_back("embeddings");
|
||||||
|
}
|
||||||
fields.emplace_back("test");
|
fields.emplace_back("test");
|
||||||
fields.emplace_back("t/s");
|
fields.emplace_back("t/s");
|
||||||
|
|
||||||
|
|
|
@ -13,7 +13,7 @@ async def main():
|
||||||
model_url = "http://127.0.0.1:6900"
|
model_url = "http://127.0.0.1:6900"
|
||||||
responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
|
responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
|
||||||
url= f"{model_url}/embedding",
|
url= f"{model_url}/embedding",
|
||||||
json= {"content": str(i)*1024}
|
json= {"content": str(0)*1024}
|
||||||
) for i in range(n)])
|
) for i in range(n)])
|
||||||
|
|
||||||
for response in responses:
|
for response in responses:
|
||||||
|
|
|
@ -1,12 +1,12 @@
|
||||||
set(TARGET server)
|
set(TARGET server)
|
||||||
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
||||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||||
add_executable(${TARGET} server.cpp oai.hpp utils.hpp json.hpp httplib.h)
|
add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_compile_definitions(${TARGET} PRIVATE
|
target_compile_definitions(${TARGET} PRIVATE
|
||||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||||
)
|
)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
|
||||||
if (WIN32)
|
if (WIN32)
|
||||||
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
||||||
endif()
|
endif()
|
||||||
|
|
|
@ -436,7 +436,7 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||||
"next_token": {
|
"next_token": {
|
||||||
"has_next_token": true,
|
"has_next_token": true,
|
||||||
"n_remain": -1,
|
"n_remain": -1,
|
||||||
"num_tokens_predicted": 0,
|
"n_decoded": 0,
|
||||||
"stopped_eos": false,
|
"stopped_eos": false,
|
||||||
"stopped_limit": false,
|
"stopped_limit": false,
|
||||||
"stopped_word": false,
|
"stopped_word": false,
|
||||||
|
|
|
@ -1,225 +0,0 @@
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <set>
|
|
||||||
#include <mutex>
|
|
||||||
#include <condition_variable>
|
|
||||||
#include <unordered_map>
|
|
||||||
|
|
||||||
#include "json.hpp"
|
|
||||||
#include "utils.hpp"
|
|
||||||
|
|
||||||
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
|
|
||||||
|
|
||||||
using json = nlohmann::json;
|
|
||||||
|
|
||||||
inline static json oaicompat_completion_params_parse(
|
|
||||||
const struct llama_model * model,
|
|
||||||
const json &body, /* openai api json semantics */
|
|
||||||
const std::string &chat_template)
|
|
||||||
{
|
|
||||||
json llama_params;
|
|
||||||
|
|
||||||
llama_params["__oaicompat"] = true;
|
|
||||||
|
|
||||||
// Map OpenAI parameters to llama.cpp parameters
|
|
||||||
//
|
|
||||||
// For parameters that are defined by the OpenAI documentation (e.g.
|
|
||||||
// temperature), we explicitly specify OpenAI's intended default; we
|
|
||||||
// need to do that because sometimes OpenAI disagrees with llama.cpp
|
|
||||||
//
|
|
||||||
// https://platform.openai.com/docs/api-reference/chat/create
|
|
||||||
llama_sampling_params default_sparams;
|
|
||||||
llama_params["model"] = json_value(body, "model", std::string("unknown"));
|
|
||||||
llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
|
|
||||||
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
|
|
||||||
llama_params["temperature"] = json_value(body, "temperature", 0.0);
|
|
||||||
llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
|
|
||||||
llama_params["top_p"] = json_value(body, "top_p", 1.0);
|
|
||||||
llama_params["n_predict"] = json_value(body, "max_tokens", -1);
|
|
||||||
llama_params["logit_bias"] = json_value(body, "logit_bias",json::object());
|
|
||||||
llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
|
|
||||||
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
|
|
||||||
llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
|
|
||||||
llama_params["stream"] = json_value(body, "stream", false);
|
|
||||||
llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat);
|
|
||||||
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
|
|
||||||
llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
|
|
||||||
llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl);
|
|
||||||
llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p);
|
|
||||||
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
|
|
||||||
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
|
|
||||||
llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z);
|
|
||||||
|
|
||||||
if (body.count("grammar") != 0) {
|
|
||||||
llama_params["grammar"] = json_value(body, "grammar", json::object());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Handle 'stop' field
|
|
||||||
if (body.contains("stop") && body["stop"].is_string()) {
|
|
||||||
llama_params["stop"] = json::array({body["stop"].get<std::string>()});
|
|
||||||
} else {
|
|
||||||
llama_params["stop"] = json_value(body, "stop", json::array());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ensure there is ChatML-specific end sequence among stop words
|
|
||||||
llama_params["stop"].push_back("<|im_end|>");
|
|
||||||
|
|
||||||
return llama_params;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
|
|
||||||
{
|
|
||||||
json result = response.result_json;
|
|
||||||
|
|
||||||
bool stopped_word = result.count("stopped_word") != 0;
|
|
||||||
bool stopped_eos = json_value(result, "stopped_eos", false);
|
|
||||||
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
|
|
||||||
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
|
|
||||||
std::string content = json_value(result, "content", std::string(""));
|
|
||||||
|
|
||||||
std::string finish_reason = "length";
|
|
||||||
if (stopped_word || stopped_eos) {
|
|
||||||
finish_reason = "stop";
|
|
||||||
}
|
|
||||||
|
|
||||||
json choices =
|
|
||||||
streaming ? json::array({json{{"finish_reason", finish_reason},
|
|
||||||
{"index", 0},
|
|
||||||
{"delta", json::object()}}})
|
|
||||||
: json::array({json{{"finish_reason", finish_reason},
|
|
||||||
{"index", 0},
|
|
||||||
{"message", json{{"content", content},
|
|
||||||
{"role", "assistant"}}}}});
|
|
||||||
|
|
||||||
std::time_t t = std::time(0);
|
|
||||||
|
|
||||||
json res =
|
|
||||||
json{{"choices", choices},
|
|
||||||
{"created", t},
|
|
||||||
{"model",
|
|
||||||
json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
|
|
||||||
{"object", streaming ? "chat.completion.chunk" : "chat.completion"},
|
|
||||||
{"usage",
|
|
||||||
json{{"completion_tokens", num_tokens_predicted},
|
|
||||||
{"prompt_tokens", num_prompt_tokens},
|
|
||||||
{"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
|
|
||||||
{"id", gen_chatcmplid()}};
|
|
||||||
|
|
||||||
if (server_verbose) {
|
|
||||||
res["__verbose"] = result;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (result.contains("completion_probabilities")) {
|
|
||||||
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
|
|
||||||
}
|
|
||||||
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
// return value is vector as there is one case where we might need to generate two responses
|
|
||||||
inline static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
|
|
||||||
json result = response.result_json;
|
|
||||||
|
|
||||||
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
|
|
||||||
return std::vector<json>({response.result_json});
|
|
||||||
}
|
|
||||||
|
|
||||||
bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
|
|
||||||
std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
|
|
||||||
|
|
||||||
bool stopped_word = json_value(result, "stopped_word", false);
|
|
||||||
bool stopped_eos = json_value(result, "stopped_eos", false);
|
|
||||||
bool stopped_limit = json_value(result, "stopped_limit", false);
|
|
||||||
std::string content = json_value(result, "content", std::string(""));
|
|
||||||
|
|
||||||
std::string finish_reason;
|
|
||||||
if (stopped_word || stopped_eos) {
|
|
||||||
finish_reason = "stop";
|
|
||||||
}
|
|
||||||
if (stopped_limit) {
|
|
||||||
finish_reason = "length";
|
|
||||||
}
|
|
||||||
|
|
||||||
std::time_t t = std::time(0);
|
|
||||||
|
|
||||||
json choices;
|
|
||||||
|
|
||||||
if (!finish_reason.empty()) {
|
|
||||||
choices = json::array({json{{"finish_reason", finish_reason},
|
|
||||||
{"index", 0},
|
|
||||||
{"delta", json::object()}}});
|
|
||||||
} else {
|
|
||||||
if (first) {
|
|
||||||
if (content.empty()) {
|
|
||||||
choices = json::array({json{{"finish_reason", nullptr},
|
|
||||||
{"index", 0},
|
|
||||||
{"delta", json{{"role", "assistant"}}}}});
|
|
||||||
} else {
|
|
||||||
// We have to send this as two updates to conform to openai behavior
|
|
||||||
json initial_ret = json{{"choices", json::array({json{
|
|
||||||
{"finish_reason", nullptr},
|
|
||||||
{"index", 0},
|
|
||||||
{"delta", json{
|
|
||||||
{"role", "assistant"}
|
|
||||||
}}}})},
|
|
||||||
{"created", t},
|
|
||||||
{"id", gen_chatcmplid()},
|
|
||||||
{"model", modelname},
|
|
||||||
{"object", "chat.completion.chunk"}};
|
|
||||||
|
|
||||||
json second_ret = json{
|
|
||||||
{"choices", json::array({json{{"finish_reason", nullptr},
|
|
||||||
{"index", 0},
|
|
||||||
{"delta", json{
|
|
||||||
{"content", content}}}
|
|
||||||
}})},
|
|
||||||
{"created", t},
|
|
||||||
{"id", gen_chatcmplid()},
|
|
||||||
{"model", modelname},
|
|
||||||
{"object", "chat.completion.chunk"}};
|
|
||||||
|
|
||||||
return std::vector<json>({initial_ret, second_ret});
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Some idiosyncrasy in task processing logic makes several trailing calls
|
|
||||||
// with empty content, we ignore these at the calee site.
|
|
||||||
if (content.empty()) {
|
|
||||||
return std::vector<json>({json::object()});
|
|
||||||
}
|
|
||||||
|
|
||||||
choices = json::array({json{
|
|
||||||
{"finish_reason", nullptr},
|
|
||||||
{"index", 0},
|
|
||||||
{"delta",
|
|
||||||
json{
|
|
||||||
{"content", content},
|
|
||||||
}},
|
|
||||||
}});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
json ret = json{{"choices", choices},
|
|
||||||
{"created", t},
|
|
||||||
{"id", gen_chatcmplid()},
|
|
||||||
{"model", modelname},
|
|
||||||
{"object", "chat.completion.chunk"}};
|
|
||||||
|
|
||||||
return std::vector<json>({ret});
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static json format_embeddings_response_oaicompat(const json &request, const json &embeddings)
|
|
||||||
{
|
|
||||||
json res =
|
|
||||||
json{
|
|
||||||
{"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
|
|
||||||
{"object", "list"},
|
|
||||||
{"usage",
|
|
||||||
json{{"prompt_tokens", 0},
|
|
||||||
{"total_tokens", 0}}},
|
|
||||||
{"data", embeddings}
|
|
||||||
};
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
File diff suppressed because it is too large
Load diff
94
examples/server/tests/features/embeddings.feature
Normal file
94
examples/server/tests/features/embeddings.feature
Normal file
|
@ -0,0 +1,94 @@
|
||||||
|
@llama.cpp
|
||||||
|
@embeddings
|
||||||
|
Feature: llama.cpp server
|
||||||
|
|
||||||
|
Background: Server startup
|
||||||
|
Given a server listening on localhost:8080
|
||||||
|
And a model file bert-bge-small/ggml-model-f16.gguf from HF repo ggml-org/models
|
||||||
|
And a model alias bert-bge-small
|
||||||
|
And 42 as server seed
|
||||||
|
And 2 slots
|
||||||
|
And 1024 as batch size
|
||||||
|
And 2048 KV cache size
|
||||||
|
And embeddings extraction
|
||||||
|
Then the server is starting
|
||||||
|
Then the server is healthy
|
||||||
|
|
||||||
|
Scenario: Embedding
|
||||||
|
When embeddings are computed for:
|
||||||
|
"""
|
||||||
|
What is the capital of Bulgaria ?
|
||||||
|
"""
|
||||||
|
Then embeddings are generated
|
||||||
|
|
||||||
|
Scenario: OAI Embeddings compatibility
|
||||||
|
Given a model bert-bge-small
|
||||||
|
When an OAI compatible embeddings computation request for:
|
||||||
|
"""
|
||||||
|
What is the capital of Spain ?
|
||||||
|
"""
|
||||||
|
Then embeddings are generated
|
||||||
|
|
||||||
|
Scenario: OAI Embeddings compatibility with multiple inputs
|
||||||
|
Given a model bert-bge-small
|
||||||
|
Given a prompt:
|
||||||
|
"""
|
||||||
|
In which country Paris is located ?
|
||||||
|
"""
|
||||||
|
And a prompt:
|
||||||
|
"""
|
||||||
|
Is Madrid the capital of Spain ?
|
||||||
|
"""
|
||||||
|
When an OAI compatible embeddings computation request for multiple inputs
|
||||||
|
Then embeddings are generated
|
||||||
|
|
||||||
|
Scenario: Multi users embeddings
|
||||||
|
Given a prompt:
|
||||||
|
"""
|
||||||
|
Write a very long story about AI.
|
||||||
|
"""
|
||||||
|
And a prompt:
|
||||||
|
"""
|
||||||
|
Write another very long music lyrics.
|
||||||
|
"""
|
||||||
|
And a prompt:
|
||||||
|
"""
|
||||||
|
Write a very long poem.
|
||||||
|
"""
|
||||||
|
And a prompt:
|
||||||
|
"""
|
||||||
|
Write a very long joke.
|
||||||
|
"""
|
||||||
|
Given concurrent embedding requests
|
||||||
|
Then the server is busy
|
||||||
|
Then the server is idle
|
||||||
|
Then all embeddings are generated
|
||||||
|
|
||||||
|
Scenario: Multi users OAI compatibility embeddings
|
||||||
|
Given a prompt:
|
||||||
|
"""
|
||||||
|
In which country Paris is located ?
|
||||||
|
"""
|
||||||
|
And a prompt:
|
||||||
|
"""
|
||||||
|
Is Madrid the capital of Spain ?
|
||||||
|
"""
|
||||||
|
And a prompt:
|
||||||
|
"""
|
||||||
|
What is the biggest US city ?
|
||||||
|
"""
|
||||||
|
And a prompt:
|
||||||
|
"""
|
||||||
|
What is the capital of Bulgaria ?
|
||||||
|
"""
|
||||||
|
And a model bert-bge-small
|
||||||
|
Given concurrent OAI embedding requests
|
||||||
|
Then the server is busy
|
||||||
|
Then the server is idle
|
||||||
|
Then all embeddings are generated
|
||||||
|
|
||||||
|
Scenario: All embeddings should be the same
|
||||||
|
Given 10 fixed prompts
|
||||||
|
And a model bert-bge-small
|
||||||
|
Given concurrent OAI embedding requests
|
||||||
|
Then all embeddings are the same
|
|
@ -9,7 +9,6 @@ Feature: Parallel
|
||||||
And 512 as batch size
|
And 512 as batch size
|
||||||
And 64 KV cache size
|
And 64 KV cache size
|
||||||
And 2 slots
|
And 2 slots
|
||||||
And embeddings extraction
|
|
||||||
And continuous batching
|
And continuous batching
|
||||||
Then the server is starting
|
Then the server is starting
|
||||||
Then the server is healthy
|
Then the server is healthy
|
||||||
|
@ -99,48 +98,3 @@ Feature: Parallel
|
||||||
Then the server is busy
|
Then the server is busy
|
||||||
Then the server is idle
|
Then the server is idle
|
||||||
Then all prompts are predicted
|
Then all prompts are predicted
|
||||||
|
|
||||||
Scenario: Multi users embeddings
|
|
||||||
Given a prompt:
|
|
||||||
"""
|
|
||||||
Write a very long story about AI.
|
|
||||||
"""
|
|
||||||
And a prompt:
|
|
||||||
"""
|
|
||||||
Write another very long music lyrics.
|
|
||||||
"""
|
|
||||||
And a prompt:
|
|
||||||
"""
|
|
||||||
Write a very long poem.
|
|
||||||
"""
|
|
||||||
And a prompt:
|
|
||||||
"""
|
|
||||||
Write a very long joke.
|
|
||||||
"""
|
|
||||||
Given concurrent embedding requests
|
|
||||||
Then the server is busy
|
|
||||||
Then the server is idle
|
|
||||||
Then all embeddings are generated
|
|
||||||
|
|
||||||
Scenario: Multi users OAI compatibility embeddings
|
|
||||||
Given a prompt:
|
|
||||||
"""
|
|
||||||
In which country Paris is located ?
|
|
||||||
"""
|
|
||||||
And a prompt:
|
|
||||||
"""
|
|
||||||
Is Madrid the capital of Spain ?
|
|
||||||
"""
|
|
||||||
And a prompt:
|
|
||||||
"""
|
|
||||||
What is the biggest US city ?
|
|
||||||
"""
|
|
||||||
And a prompt:
|
|
||||||
"""
|
|
||||||
What is the capital of Bulgaria ?
|
|
||||||
"""
|
|
||||||
And a model tinyllama-2
|
|
||||||
Given concurrent OAI embedding requests
|
|
||||||
Then the server is busy
|
|
||||||
Then the server is idle
|
|
||||||
Then all embeddings are generated
|
|
||||||
|
|
|
@ -49,34 +49,6 @@ Feature: llama.cpp server
|
||||||
| llama-2 | Book | What is the best book | 8 | (Mom\|what)+ | 8 | disabled |
|
| llama-2 | Book | What is the best book | 8 | (Mom\|what)+ | 8 | disabled |
|
||||||
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks\|happy\|bird)+ | 32 | enabled |
|
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks\|happy\|bird)+ | 32 | enabled |
|
||||||
|
|
||||||
Scenario: Embedding
|
|
||||||
When embeddings are computed for:
|
|
||||||
"""
|
|
||||||
What is the capital of Bulgaria ?
|
|
||||||
"""
|
|
||||||
Then embeddings are generated
|
|
||||||
|
|
||||||
Scenario: OAI Embeddings compatibility
|
|
||||||
Given a model tinyllama-2
|
|
||||||
When an OAI compatible embeddings computation request for:
|
|
||||||
"""
|
|
||||||
What is the capital of Spain ?
|
|
||||||
"""
|
|
||||||
Then embeddings are generated
|
|
||||||
|
|
||||||
Scenario: OAI Embeddings compatibility with multiple inputs
|
|
||||||
Given a model tinyllama-2
|
|
||||||
Given a prompt:
|
|
||||||
"""
|
|
||||||
In which country Paris is located ?
|
|
||||||
"""
|
|
||||||
And a prompt:
|
|
||||||
"""
|
|
||||||
Is Madrid the capital of Spain ?
|
|
||||||
"""
|
|
||||||
When an OAI compatible embeddings computation request for multiple inputs
|
|
||||||
Then embeddings are generated
|
|
||||||
|
|
||||||
Scenario: Tokenize / Detokenize
|
Scenario: Tokenize / Detokenize
|
||||||
When tokenizing:
|
When tokenizing:
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -10,6 +10,7 @@ from contextlib import closing
|
||||||
from re import RegexFlag
|
from re import RegexFlag
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
|
import numpy as np
|
||||||
import openai
|
import openai
|
||||||
from behave import step
|
from behave import step
|
||||||
from behave.api.async_step import async_run_until_complete
|
from behave.api.async_step import async_run_until_complete
|
||||||
|
@ -24,6 +25,9 @@ def step_server_config(context, server_fqdn, server_port):
|
||||||
if 'PORT' in os.environ:
|
if 'PORT' in os.environ:
|
||||||
context.server_port = int(os.environ['PORT'])
|
context.server_port = int(os.environ['PORT'])
|
||||||
print(f"$PORT set, overriding server port with to {context.server_port}")
|
print(f"$PORT set, overriding server port with to {context.server_port}")
|
||||||
|
if 'FQDN' in os.environ:
|
||||||
|
context.server_fqdn = os.environ['FQDN']
|
||||||
|
print(f"$FQDN set, overriding server fqdn with to {context.server_fqdn}")
|
||||||
|
|
||||||
context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
|
context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
|
||||||
|
|
||||||
|
@ -34,6 +38,7 @@ def step_server_config(context, server_fqdn, server_port):
|
||||||
context.n_ga_w = None
|
context.n_ga_w = None
|
||||||
context.n_gpu_layer = None
|
context.n_gpu_layer = None
|
||||||
context.n_predict = None
|
context.n_predict = None
|
||||||
|
context.n_prompts = 0
|
||||||
context.n_server_predict = None
|
context.n_server_predict = None
|
||||||
context.n_slots = None
|
context.n_slots = None
|
||||||
context.prompt_prefix = None
|
context.prompt_prefix = None
|
||||||
|
@ -202,6 +207,7 @@ def step_n_tokens_predicted(context, predicted_n):
|
||||||
@step(u'a user prompt {user_prompt}')
|
@step(u'a user prompt {user_prompt}')
|
||||||
def step_user_prompt(context, user_prompt):
|
def step_user_prompt(context, user_prompt):
|
||||||
context.prompts.append(user_prompt)
|
context.prompts.append(user_prompt)
|
||||||
|
context.n_prompts = len(context.prompts)
|
||||||
|
|
||||||
|
|
||||||
@step(u'a system prompt {system_prompt}')
|
@step(u'a system prompt {system_prompt}')
|
||||||
|
@ -290,6 +296,12 @@ def step_prompt_passkey(context):
|
||||||
context.prompt_passkey = context.text
|
context.prompt_passkey = context.text
|
||||||
|
|
||||||
|
|
||||||
|
@step(u'{n_prompts:d} fixed prompts')
|
||||||
|
def step_fixed_prompts(context, n_prompts):
|
||||||
|
context.prompts.extend([str(0)*(context.n_batch if context.n_batch is not None else 512) for i in range(n_prompts)])
|
||||||
|
context.n_prompts = n_prompts
|
||||||
|
|
||||||
|
|
||||||
@step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
|
@step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
|
||||||
def step_prompt_passkey(context, passkey, i_pos):
|
def step_prompt_passkey(context, passkey, i_pos):
|
||||||
prompt = ""
|
prompt = ""
|
||||||
|
@ -301,6 +313,7 @@ def step_prompt_passkey(context, passkey, i_pos):
|
||||||
passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
|
passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
|
||||||
print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n")
|
print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n")
|
||||||
context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)
|
context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)
|
||||||
|
context.n_prompts = len(context.prompts)
|
||||||
|
|
||||||
|
|
||||||
@step(u'an OAI compatible chat completions request with {api_error} api error')
|
@step(u'an OAI compatible chat completions request with {api_error} api error')
|
||||||
|
@ -341,11 +354,13 @@ async def step_oai_chat_completions(context, api_error):
|
||||||
@step(u'a prompt')
|
@step(u'a prompt')
|
||||||
def step_a_prompt(context):
|
def step_a_prompt(context):
|
||||||
context.prompts.append(context.text)
|
context.prompts.append(context.text)
|
||||||
|
context.n_prompts = len(context.prompts)
|
||||||
|
|
||||||
|
|
||||||
@step(u'a prompt {prompt}')
|
@step(u'a prompt {prompt}')
|
||||||
def step_a_prompt_prompt(context, prompt):
|
def step_a_prompt_prompt(context, prompt):
|
||||||
context.prompts.append(prompt)
|
context.prompts.append(prompt)
|
||||||
|
context.n_prompts = len(context.prompts)
|
||||||
|
|
||||||
|
|
||||||
@step(u'concurrent completion requests')
|
@step(u'concurrent completion requests')
|
||||||
|
@ -430,25 +445,47 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None):
|
||||||
@step(u'embeddings are computed for')
|
@step(u'embeddings are computed for')
|
||||||
@async_run_until_complete
|
@async_run_until_complete
|
||||||
async def step_compute_embedding(context):
|
async def step_compute_embedding(context):
|
||||||
|
context.n_prompts = 1
|
||||||
context.embeddings = await request_embedding(context.text, base_url=context.base_url)
|
context.embeddings = await request_embedding(context.text, base_url=context.base_url)
|
||||||
|
|
||||||
|
|
||||||
|
@step(u'all embeddings are the same')
|
||||||
|
@async_run_until_complete
|
||||||
|
async def step_all_embeddings_are_the_same(context):
|
||||||
|
n_embedding_requests = await gather_tasks_results(context)
|
||||||
|
assert n_embedding_requests > 0
|
||||||
|
embeddings = []
|
||||||
|
for i in range(n_embedding_requests):
|
||||||
|
embedding = context.tasks_result.pop().pop()
|
||||||
|
embeddings.append(embedding)
|
||||||
|
assert_embeddings(embedding)
|
||||||
|
n = len(embeddings)
|
||||||
|
for i in range(n-1):
|
||||||
|
for j in range(i+1, n):
|
||||||
|
embedding1 = np.array(embeddings[i])
|
||||||
|
embedding2 = np.array(embeddings[j])
|
||||||
|
if context.debug:
|
||||||
|
print(f"embedding1: {embedding1[-8:]}\n")
|
||||||
|
print(f"embedding2: {embedding2[-8:]}\n")
|
||||||
|
similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
|
||||||
|
msg = f"Similarity between {i} and {j}: {similarity:.10f}"
|
||||||
|
if context.debug:
|
||||||
|
print(f"{msg}\n")
|
||||||
|
assert np.isclose(similarity, 1.0, rtol=1e-05, atol=1e-08, equal_nan=False), msg
|
||||||
|
|
||||||
@step(u'embeddings are generated')
|
@step(u'embeddings are generated')
|
||||||
def step_assert_embeddings(context):
|
def step_assert_embeddings(context):
|
||||||
if len(context.prompts) == 0:
|
assert context.n_prompts == len(context.embeddings), (f"unexpected response:\n"
|
||||||
assert_embeddings(context.embeddings)
|
f"context.n_prompts={context.n_prompts}\n"
|
||||||
else:
|
|
||||||
assert len(context.embeddings) == len(context.prompts), (f"unexpected response:\n"
|
|
||||||
f"context.prompts={context.prompts}\n"
|
|
||||||
f"context.embeddings={context.embeddings}")
|
f"context.embeddings={context.embeddings}")
|
||||||
for embedding in context.embeddings:
|
for embedding in context.embeddings:
|
||||||
context.prompts.pop()
|
|
||||||
assert_embeddings(embedding)
|
assert_embeddings(embedding)
|
||||||
|
|
||||||
|
|
||||||
@step(u'an OAI compatible embeddings computation request for')
|
@step(u'an OAI compatible embeddings computation request for')
|
||||||
@async_run_until_complete
|
@async_run_until_complete
|
||||||
async def step_oai_compute_embeddings(context):
|
async def step_oai_compute_embeddings(context):
|
||||||
|
context.n_prompts = 1
|
||||||
context.embeddings = await request_oai_embeddings(context.text,
|
context.embeddings = await request_oai_embeddings(context.text,
|
||||||
base_url=context.base_url,
|
base_url=context.base_url,
|
||||||
user_api_key=context.user_api_key,
|
user_api_key=context.user_api_key,
|
||||||
|
@ -462,6 +499,7 @@ async def step_oai_compute_embeddings_multiple_inputs(context):
|
||||||
base_url=context.base_url,
|
base_url=context.base_url,
|
||||||
user_api_key=context.user_api_key,
|
user_api_key=context.user_api_key,
|
||||||
model=context.model)
|
model=context.model)
|
||||||
|
context.prompts.clear()
|
||||||
|
|
||||||
|
|
||||||
@step(u'concurrent embedding requests')
|
@step(u'concurrent embedding requests')
|
||||||
|
@ -488,9 +526,9 @@ async def step_concurrent_oai_embedding_requests(context):
|
||||||
@async_run_until_complete()
|
@async_run_until_complete()
|
||||||
async def all_embeddings_are_generated(context):
|
async def all_embeddings_are_generated(context):
|
||||||
n_embedding_requests = await gather_tasks_results(context)
|
n_embedding_requests = await gather_tasks_results(context)
|
||||||
assert n_embedding_requests > 0
|
assert n_embedding_requests == context.n_prompts
|
||||||
for i in range(n_embedding_requests):
|
for i in range(n_embedding_requests):
|
||||||
assert_embeddings(context.tasks_result.pop())
|
assert_embeddings(context.tasks_result.pop().pop())
|
||||||
|
|
||||||
|
|
||||||
@step(u'tokenizing')
|
@step(u'tokenizing')
|
||||||
|
@ -588,11 +626,11 @@ def step_supported_models(context, i_model, param, preposition, param_value):
|
||||||
|
|
||||||
|
|
||||||
async def concurrent_requests(context, f_completion, *args, **kwargs):
|
async def concurrent_requests(context, f_completion, *args, **kwargs):
|
||||||
n_prompts = len(context.prompts)
|
context.n_prompts = len(context.prompts)
|
||||||
if context.debug:
|
if context.debug:
|
||||||
print(f"starting {n_prompts} concurrent completion requests...")
|
print(f"starting {context.n_prompts} concurrent completion requests...")
|
||||||
assert n_prompts > 0
|
assert context.n_prompts > 0
|
||||||
for prompt_no in range(n_prompts):
|
for prompt_no in range(context.n_prompts):
|
||||||
shifted_args = [context.prompts.pop(), *args]
|
shifted_args = [context.prompts.pop(), *args]
|
||||||
context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
|
context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
|
||||||
await asyncio.sleep(0.1)
|
await asyncio.sleep(0.1)
|
||||||
|
@ -765,7 +803,7 @@ async def request_embedding(content, base_url=None):
|
||||||
}) as response:
|
}) as response:
|
||||||
assert response.status == 200
|
assert response.status == 200
|
||||||
response_json = await response.json()
|
response_json = await response.json()
|
||||||
return response_json['embedding']
|
return [response_json['embedding']]
|
||||||
|
|
||||||
|
|
||||||
async def request_oai_embeddings(input,
|
async def request_oai_embeddings(input,
|
||||||
|
@ -775,6 +813,7 @@ async def request_oai_embeddings(input,
|
||||||
user_api_key = user_api_key if user_api_key is not None else 'nope'
|
user_api_key = user_api_key if user_api_key is not None else 'nope'
|
||||||
if async_client:
|
if async_client:
|
||||||
origin = 'llama.cpp'
|
origin = 'llama.cpp'
|
||||||
|
headers=[]
|
||||||
if user_api_key is not None:
|
if user_api_key is not None:
|
||||||
headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
|
headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession() as session:
|
||||||
|
@ -783,14 +822,21 @@ async def request_oai_embeddings(input,
|
||||||
"input": input,
|
"input": input,
|
||||||
"model": model,
|
"model": model,
|
||||||
},
|
},
|
||||||
headers=headers) as response:
|
headers=headers,
|
||||||
|
timeout=3600) as response:
|
||||||
assert response.status == 200, f"received status code not expected: {response.status}"
|
assert response.status == 200, f"received status code not expected: {response.status}"
|
||||||
assert response.headers['Access-Control-Allow-Origin'] == origin
|
assert response.headers['Access-Control-Allow-Origin'] == origin
|
||||||
assert response.headers['Content-Type'] == "application/json; charset=utf-8"
|
assert response.headers['Content-Type'] == "application/json; charset=utf-8"
|
||||||
response_json = await response.json()
|
response_json = await response.json()
|
||||||
assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
|
assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
|
||||||
assert response_json['object'] == 'list'
|
assert response_json['object'] == 'list'
|
||||||
return response_json['data']
|
if isinstance(input, collections.abc.Sequence):
|
||||||
|
embeddings = []
|
||||||
|
for an_oai_embeddings in response_json['data']:
|
||||||
|
embeddings.append(an_oai_embeddings['embedding'])
|
||||||
|
else:
|
||||||
|
embeddings = [response_json['data']['embedding']]
|
||||||
|
return embeddings
|
||||||
else:
|
else:
|
||||||
openai.api_key = user_api_key
|
openai.api_key = user_api_key
|
||||||
openai.api_base = f'{base_url}/v1'
|
openai.api_base = f'{base_url}/v1'
|
||||||
|
@ -804,7 +850,7 @@ async def request_oai_embeddings(input,
|
||||||
for an_oai_embeddings in oai_embeddings.data:
|
for an_oai_embeddings in oai_embeddings.data:
|
||||||
embeddings.append(an_oai_embeddings.embedding)
|
embeddings.append(an_oai_embeddings.embedding)
|
||||||
else:
|
else:
|
||||||
embeddings = oai_embeddings.data.embedding
|
embeddings = [oai_embeddings.data.embedding]
|
||||||
return embeddings
|
return embeddings
|
||||||
|
|
||||||
|
|
||||||
|
@ -899,6 +945,8 @@ def assert_embeddings(embeddings):
|
||||||
assert len(embeddings) > 0
|
assert len(embeddings) > 0
|
||||||
embeddings_computed = False
|
embeddings_computed = False
|
||||||
for emb in embeddings:
|
for emb in embeddings:
|
||||||
|
if not isinstance(emb, float):
|
||||||
|
assert False, f"Bad embeddings: {embeddings}"
|
||||||
if emb != 0:
|
if emb != 0:
|
||||||
embeddings_computed = True
|
embeddings_computed = True
|
||||||
assert embeddings_computed, f"Embeddings: {embeddings}"
|
assert embeddings_computed, f"Embeddings: {embeddings}"
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
aiohttp~=3.9.3
|
aiohttp~=3.9.3
|
||||||
behave~=1.2.6
|
behave~=1.2.6
|
||||||
huggingface_hub~=0.20.3
|
huggingface_hub~=0.20.3
|
||||||
|
numpy~=1.24.4
|
||||||
openai~=0.25.0
|
openai~=0.25.0
|
||||||
prometheus-client~=0.20.0
|
prometheus-client~=0.20.0
|
||||||
|
|
|
@ -1,15 +1,16 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <string>
|
#include "llama.h"
|
||||||
#include <vector>
|
#include "common.h"
|
||||||
#include <set>
|
|
||||||
#include <mutex>
|
|
||||||
#include <condition_variable>
|
|
||||||
#include <unordered_map>
|
|
||||||
|
|
||||||
#include "json.hpp"
|
#include "json.hpp"
|
||||||
|
|
||||||
#include "../llava/clip.h"
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <sstream>
|
||||||
|
#include <random>
|
||||||
|
|
||||||
|
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
|
||||||
|
|
||||||
using json = nlohmann::json;
|
using json = nlohmann::json;
|
||||||
|
|
||||||
|
@ -37,61 +38,13 @@ extern bool server_log_json;
|
||||||
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
|
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||||
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||||
|
|
||||||
enum server_state {
|
template <typename T>
|
||||||
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
|
static T json_value(const json &body, const std::string &key, const T &default_value) {
|
||||||
SERVER_STATE_READY, // Server is ready and model is loaded
|
// Fallback null to default value
|
||||||
SERVER_STATE_ERROR // An error occurred, load_model failed
|
return body.contains(key) && !body.at(key).is_null()
|
||||||
};
|
? body.value(key, default_value)
|
||||||
|
: default_value;
|
||||||
enum task_type {
|
}
|
||||||
TASK_TYPE_COMPLETION,
|
|
||||||
TASK_TYPE_CANCEL,
|
|
||||||
TASK_TYPE_NEXT_RESPONSE,
|
|
||||||
TASK_TYPE_METRICS
|
|
||||||
};
|
|
||||||
|
|
||||||
struct task_server {
|
|
||||||
int id = -1; // to be filled by llama_server_queue
|
|
||||||
int target_id;
|
|
||||||
task_type type;
|
|
||||||
json data;
|
|
||||||
bool infill_mode = false;
|
|
||||||
bool embedding_mode = false;
|
|
||||||
int multitask_id = -1;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct task_result {
|
|
||||||
int id;
|
|
||||||
int multitask_id = -1;
|
|
||||||
bool stop;
|
|
||||||
bool error;
|
|
||||||
json result_json;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct task_multi {
|
|
||||||
int id;
|
|
||||||
std::set<int> subtasks_remaining{};
|
|
||||||
std::vector<task_result> results{};
|
|
||||||
};
|
|
||||||
|
|
||||||
// completion token output with probabilities
|
|
||||||
struct completion_token_output {
|
|
||||||
struct token_prob
|
|
||||||
{
|
|
||||||
llama_token tok;
|
|
||||||
float prob;
|
|
||||||
};
|
|
||||||
|
|
||||||
std::vector<token_prob> probs;
|
|
||||||
llama_token tok;
|
|
||||||
std::string text_to_send;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct token_translator {
|
|
||||||
llama_context * ctx;
|
|
||||||
std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
|
|
||||||
std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
|
|
||||||
};
|
|
||||||
|
|
||||||
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
|
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
|
||||||
std::stringstream ss_tid;
|
std::stringstream ss_tid;
|
||||||
|
@ -102,18 +55,18 @@ static inline void server_log(const char *level, const char *function, int line,
|
||||||
};
|
};
|
||||||
|
|
||||||
if (server_log_json) {
|
if (server_log_json) {
|
||||||
log.merge_patch(
|
log.merge_patch( {
|
||||||
{
|
|
||||||
{"level", level},
|
{"level", level},
|
||||||
{"function", function},
|
{"function", function},
|
||||||
{"line", line},
|
{"line", line},
|
||||||
{"msg", message},
|
{"msg", message},
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!extra.empty()) {
|
if (!extra.empty()) {
|
||||||
log.merge_patch(extra);
|
log.merge_patch(extra);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
|
printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str());
|
||||||
} else {
|
} else {
|
||||||
char buf[1024];
|
char buf[1024];
|
||||||
snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
|
snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
|
||||||
|
@ -136,22 +89,13 @@ static inline void server_log(const char *level, const char *function, int line,
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// server utils
|
// chat template utils
|
||||||
//
|
//
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
static T json_value(const json &body, const std::string &key, const T &default_value) {
|
|
||||||
// Fallback null to default value
|
|
||||||
return body.contains(key) && !body.at(key).is_null()
|
|
||||||
? body.value(key, default_value)
|
|
||||||
: default_value;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
||||||
inline bool verify_custom_template(const std::string & tmpl) {
|
inline bool verify_custom_template(const std::string & tmpl) {
|
||||||
llama_chat_message chat[] = {{"user", "test"}};
|
llama_chat_message chat[] = {{"user", "test"}};
|
||||||
std::vector<char> buf(1);
|
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
|
||||||
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size());
|
|
||||||
return res >= 0;
|
return res >= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -163,7 +107,7 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
|
||||||
std::vector<llama_chat_message> chat(messages.size());
|
std::vector<llama_chat_message> chat(messages.size());
|
||||||
|
|
||||||
for (size_t i = 0; i < messages.size(); ++i) {
|
for (size_t i = 0; i < messages.size(); ++i) {
|
||||||
auto &curr_msg = messages[i];
|
const auto & curr_msg = messages[i];
|
||||||
str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
|
str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
|
||||||
str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
|
str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
|
||||||
alloc_size += str[i*2 + 1].length();
|
alloc_size += str[i*2 + 1].length();
|
||||||
|
@ -183,261 +127,13 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
|
||||||
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
|
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string formatted_chat(buf.data(), res);
|
const std::string formatted_chat(buf.data(), res);
|
||||||
|
|
||||||
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
|
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
|
||||||
|
|
||||||
return formatted_chat;
|
return formatted_chat;
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
// work queue utils
|
|
||||||
//
|
|
||||||
|
|
||||||
struct llama_server_queue {
|
|
||||||
int id = 0;
|
|
||||||
std::mutex mutex_tasks;
|
|
||||||
bool running;
|
|
||||||
// queues
|
|
||||||
std::vector<task_server> queue_tasks;
|
|
||||||
std::vector<task_server> queue_tasks_deferred;
|
|
||||||
std::vector<task_multi> queue_multitasks;
|
|
||||||
std::condition_variable condition_tasks;
|
|
||||||
// callback functions
|
|
||||||
std::function<void(task_server&)> callback_new_task;
|
|
||||||
std::function<void(task_multi&)> callback_finish_multitask;
|
|
||||||
std::function<void(void)> callback_run_slots;
|
|
||||||
|
|
||||||
// Add a new task to the end of the queue
|
|
||||||
int post(task_server task) {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
if (task.id == -1) {
|
|
||||||
task.id = id++;
|
|
||||||
LOG_VERBOSE("new task id", {{"new_id", task.id}});
|
|
||||||
}
|
|
||||||
queue_tasks.push_back(std::move(task));
|
|
||||||
condition_tasks.notify_one();
|
|
||||||
return task.id;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add a new task, but defer until one slot is available
|
|
||||||
void defer(task_server task) {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
queue_tasks_deferred.push_back(std::move(task));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get the next id for creating anew task
|
|
||||||
int get_new_id() {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
int new_id = id++;
|
|
||||||
LOG_VERBOSE("new task id", {{"new_id", new_id}});
|
|
||||||
return new_id;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Register function to process a new task
|
|
||||||
void on_new_task(std::function<void(task_server&)> callback) {
|
|
||||||
callback_new_task = callback;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Register function to process a multitask when it is finished
|
|
||||||
void on_finish_multitask(std::function<void(task_multi&)> callback) {
|
|
||||||
callback_finish_multitask = callback;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Register the function to be called when all slots data is ready to be processed
|
|
||||||
void on_run_slots(std::function<void(void)> callback) {
|
|
||||||
callback_run_slots = callback;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Call when the state of one slot is changed
|
|
||||||
void notify_slot_changed() {
|
|
||||||
// move deferred tasks back to main loop
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
for (auto & task : queue_tasks_deferred) {
|
|
||||||
queue_tasks.push_back(std::move(task));
|
|
||||||
}
|
|
||||||
queue_tasks_deferred.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
// end the start_loop routine
|
|
||||||
void terminate() {
|
|
||||||
{
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
running = false;
|
|
||||||
}
|
|
||||||
condition_tasks.notify_all();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Main loop consists of these steps:
|
|
||||||
* - Wait until a new task arrives
|
|
||||||
* - Process the task (i.e. maybe copy data into slot)
|
|
||||||
* - Check if multitask is finished
|
|
||||||
* - Run all slots
|
|
||||||
*/
|
|
||||||
void start_loop() {
|
|
||||||
running = true;
|
|
||||||
while (true) {
|
|
||||||
LOG_VERBOSE("new task may arrive", {});
|
|
||||||
{
|
|
||||||
while (true)
|
|
||||||
{
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
if (queue_tasks.empty()) {
|
|
||||||
lock.unlock();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
task_server task = queue_tasks.front();
|
|
||||||
queue_tasks.erase(queue_tasks.begin());
|
|
||||||
lock.unlock();
|
|
||||||
LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
|
|
||||||
callback_new_task(task);
|
|
||||||
}
|
|
||||||
LOG_VERBOSE("update_multitasks", {});
|
|
||||||
// check if we have any finished multitasks
|
|
||||||
auto queue_iterator = queue_multitasks.begin();
|
|
||||||
while (queue_iterator != queue_multitasks.end())
|
|
||||||
{
|
|
||||||
if (queue_iterator->subtasks_remaining.empty())
|
|
||||||
{
|
|
||||||
// all subtasks done == multitask is done
|
|
||||||
task_multi current_multitask = *queue_iterator;
|
|
||||||
callback_finish_multitask(current_multitask);
|
|
||||||
// remove this multitask
|
|
||||||
queue_iterator = queue_multitasks.erase(queue_iterator);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
++queue_iterator;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// all tasks in the current loop is processed, slots data is now ready
|
|
||||||
LOG_VERBOSE("callback_run_slots", {});
|
|
||||||
callback_run_slots();
|
|
||||||
}
|
|
||||||
LOG_VERBOSE("wait for new task", {});
|
|
||||||
// wait for new task
|
|
||||||
{
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
if (queue_tasks.empty()) {
|
|
||||||
if (!running) {
|
|
||||||
LOG_VERBOSE("ending start_loop", {});
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
condition_tasks.wait(lock, [&]{
|
|
||||||
return (!queue_tasks.empty() || !running);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// functions to manage multitasks
|
|
||||||
//
|
|
||||||
|
|
||||||
// add a multitask by specifying the id of all subtask (subtask is a task_server)
|
|
||||||
void add_multitask(int multitask_id, std::vector<int>& sub_ids)
|
|
||||||
{
|
|
||||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
|
||||||
task_multi multi;
|
|
||||||
multi.id = multitask_id;
|
|
||||||
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
|
|
||||||
queue_multitasks.push_back(multi);
|
|
||||||
}
|
|
||||||
|
|
||||||
// updatethe remaining subtasks, while appending results to multitask
|
|
||||||
void update_multitask(int multitask_id, int subtask_id, task_result& result)
|
|
||||||
{
|
|
||||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
|
||||||
for (auto& multitask : queue_multitasks)
|
|
||||||
{
|
|
||||||
if (multitask.id == multitask_id)
|
|
||||||
{
|
|
||||||
multitask.subtasks_remaining.erase(subtask_id);
|
|
||||||
multitask.results.push_back(result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct llama_server_response {
|
|
||||||
typedef std::function<void(int, int, task_result&)> callback_multitask_t;
|
|
||||||
callback_multitask_t callback_update_multitask;
|
|
||||||
// for keeping track of all tasks waiting for the result
|
|
||||||
std::set<int> waiting_task_ids;
|
|
||||||
// the main result queue
|
|
||||||
std::vector<task_result> queue_results;
|
|
||||||
std::mutex mutex_results;
|
|
||||||
std::condition_variable condition_results;
|
|
||||||
|
|
||||||
// add the task_id to the list of tasks waiting for response
|
|
||||||
void add_waiting_task_id(int task_id) {
|
|
||||||
LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
|
||||||
waiting_task_ids.insert(task_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
// when the request is finished, we can remove task associated with it
|
|
||||||
void remove_waiting_task_id(int task_id) {
|
|
||||||
LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
|
||||||
waiting_task_ids.erase(task_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
// This function blocks the thread until there is a response for this task_id
|
|
||||||
task_result recv(int task_id) {
|
|
||||||
while (true)
|
|
||||||
{
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
|
||||||
condition_results.wait(lock, [&]{
|
|
||||||
return !queue_results.empty();
|
|
||||||
});
|
|
||||||
|
|
||||||
for (int i = 0; i < (int) queue_results.size(); i++)
|
|
||||||
{
|
|
||||||
if (queue_results[i].id == task_id)
|
|
||||||
{
|
|
||||||
assert(queue_results[i].multitask_id == -1);
|
|
||||||
task_result res = queue_results[i];
|
|
||||||
queue_results.erase(queue_results.begin() + i);
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// should never reach here
|
|
||||||
}
|
|
||||||
|
|
||||||
// Register the function to update multitask
|
|
||||||
void on_multitask_update(callback_multitask_t callback) {
|
|
||||||
callback_update_multitask = callback;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Send a new result to a waiting task_id
|
|
||||||
void send(task_result result) {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
|
||||||
LOG_VERBOSE("send new result", {{"task_id", result.id}});
|
|
||||||
for (auto& task_id : waiting_task_ids) {
|
|
||||||
// LOG_TEE("waiting task id %i \n", task_id);
|
|
||||||
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
|
|
||||||
if (result.multitask_id == task_id)
|
|
||||||
{
|
|
||||||
LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}});
|
|
||||||
callback_update_multitask(task_id, result.id, result);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (result.id == task_id)
|
|
||||||
{
|
|
||||||
LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}});
|
|
||||||
queue_results.push_back(result);
|
|
||||||
condition_results.notify_all();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// base64 utils (TODO: move to common in the future)
|
// base64 utils (TODO: move to common in the future)
|
||||||
//
|
//
|
||||||
|
@ -447,13 +143,11 @@ static const std::string base64_chars =
|
||||||
"abcdefghijklmnopqrstuvwxyz"
|
"abcdefghijklmnopqrstuvwxyz"
|
||||||
"0123456789+/";
|
"0123456789+/";
|
||||||
|
|
||||||
static inline bool is_base64(uint8_t c)
|
static inline bool is_base64(uint8_t c) {
|
||||||
{
|
|
||||||
return (isalnum(c) || (c == '+') || (c == '/'));
|
return (isalnum(c) || (c == '+') || (c == '/'));
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
|
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string) {
|
||||||
{
|
|
||||||
int i = 0;
|
int i = 0;
|
||||||
int j = 0;
|
int j = 0;
|
||||||
int in_ = 0;
|
int in_ = 0;
|
||||||
|
@ -465,13 +159,10 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
|
||||||
|
|
||||||
std::vector<uint8_t> ret;
|
std::vector<uint8_t> ret;
|
||||||
|
|
||||||
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
|
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
|
||||||
{
|
|
||||||
char_array_4[i++] = encoded_string[in_]; in_++;
|
char_array_4[i++] = encoded_string[in_]; in_++;
|
||||||
if (i == 4)
|
if (i == 4) {
|
||||||
{
|
for (i = 0; i < 4; i++) {
|
||||||
for (i = 0; i <4; i++)
|
|
||||||
{
|
|
||||||
char_array_4[i] = base64_chars.find(char_array_4[i]);
|
char_array_4[i] = base64_chars.find(char_array_4[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -479,23 +170,20 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
|
||||||
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
||||||
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
||||||
|
|
||||||
for (i = 0; (i < 3); i++)
|
for (i = 0; (i < 3); i++) {
|
||||||
{
|
|
||||||
ret.push_back(char_array_3[i]);
|
ret.push_back(char_array_3[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
i = 0;
|
i = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i)
|
if (i) {
|
||||||
{
|
for (j = i; j < 4; j++) {
|
||||||
for (j = i; j <4; j++)
|
|
||||||
{
|
|
||||||
char_array_4[j] = 0;
|
char_array_4[j] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (j = 0; j <4; j++)
|
for (j = 0; j < 4; j++) {
|
||||||
{
|
|
||||||
char_array_4[j] = base64_chars.find(char_array_4[j]);
|
char_array_4[j] = base64_chars.find(char_array_4[j]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -503,8 +191,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
|
||||||
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
||||||
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
||||||
|
|
||||||
for (j = 0; (j < i - 1); j++)
|
for (j = 0; j < i - 1; j++) {
|
||||||
{
|
|
||||||
ret.push_back(char_array_3[j]);
|
ret.push_back(char_array_3[j]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -516,8 +203,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
|
||||||
// random string / id
|
// random string / id
|
||||||
//
|
//
|
||||||
|
|
||||||
static std::string random_string()
|
static std::string random_string() {
|
||||||
{
|
|
||||||
static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
|
static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
|
||||||
|
|
||||||
std::random_device rd;
|
std::random_device rd;
|
||||||
|
@ -532,10 +218,10 @@ static std::string random_string()
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string gen_chatcmplid()
|
static std::string gen_chatcmplid() {
|
||||||
{
|
|
||||||
std::stringstream chatcmplid;
|
std::stringstream chatcmplid;
|
||||||
chatcmplid << "chatcmpl-" << random_string();
|
chatcmplid << "chatcmpl-" << random_string();
|
||||||
|
|
||||||
return chatcmplid.str();
|
return chatcmplid.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -543,91 +229,316 @@ static std::string gen_chatcmplid()
|
||||||
// other common utils
|
// other common utils
|
||||||
//
|
//
|
||||||
|
|
||||||
static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
|
static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
|
||||||
{
|
|
||||||
size_t i;
|
size_t i;
|
||||||
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
|
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
|
||||||
{
|
|
||||||
}
|
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ends_with(const std::string &str, const std::string &suffix)
|
static bool ends_with(const std::string & str, const std::string & suffix) {
|
||||||
{
|
return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
|
||||||
return str.size() >= suffix.size() &&
|
|
||||||
0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t find_partial_stop_string(const std::string &stop,
|
static size_t find_partial_stop_string(const std::string &stop, const std::string &text) {
|
||||||
const std::string &text)
|
if (!text.empty() && !stop.empty()) {
|
||||||
{
|
|
||||||
if (!text.empty() && !stop.empty())
|
|
||||||
{
|
|
||||||
const char text_last_char = text.back();
|
const char text_last_char = text.back();
|
||||||
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
|
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
|
||||||
{
|
if (stop[char_index] == text_last_char) {
|
||||||
if (stop[char_index] == text_last_char)
|
|
||||||
{
|
|
||||||
const std::string current_partial = stop.substr(0, char_index + 1);
|
const std::string current_partial = stop.substr(0, char_index + 1);
|
||||||
if (ends_with(text, current_partial))
|
if (ends_with(text, current_partial)) {
|
||||||
{
|
|
||||||
return text.size() - char_index - 1;
|
return text.size() - char_index - 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return std::string::npos;
|
return std::string::npos;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: reuse llama_detokenize
|
// TODO: reuse llama_detokenize
|
||||||
template <class Iter>
|
template <class Iter>
|
||||||
static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
|
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
|
||||||
{
|
|
||||||
std::string ret;
|
std::string ret;
|
||||||
for (; begin != end; ++begin)
|
for (; begin != end; ++begin) {
|
||||||
{
|
|
||||||
ret += llama_token_to_piece(ctx, *begin);
|
ret += llama_token_to_piece(ctx, *begin);
|
||||||
}
|
}
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
// format incomplete utf-8 multibyte character for output
|
// format incomplete utf-8 multibyte character for output
|
||||||
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
|
static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
|
||||||
{
|
|
||||||
std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
|
std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
|
||||||
|
|
||||||
// if the size is 1 and first bit is 1, meaning it's a partial character
|
// if the size is 1 and first bit is 1, meaning it's a partial character
|
||||||
// (size > 1 meaning it's already a known token)
|
// (size > 1 meaning it's already a known token)
|
||||||
if (out.size() == 1 && (out[0] & 0x80) == 0x80)
|
if (out.size() == 1 && (out[0] & 0x80) == 0x80) {
|
||||||
{
|
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
ss << std::hex << (out[0] & 0xff);
|
ss << std::hex << (out[0] & 0xff);
|
||||||
std::string res(ss.str());
|
std::string res(ss.str());
|
||||||
out = "byte: \\x" + res;
|
out = "byte: \\x" + res;
|
||||||
}
|
}
|
||||||
|
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct completion_token_output {
|
||||||
|
llama_token tok;
|
||||||
|
std::string text_to_send;
|
||||||
|
|
||||||
|
struct token_prob {
|
||||||
|
llama_token tok;
|
||||||
|
float prob;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<token_prob> probs;
|
||||||
|
};
|
||||||
|
|
||||||
// convert a vector of completion_token_output to json
|
// convert a vector of completion_token_output to json
|
||||||
static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
|
static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
|
||||||
{
|
|
||||||
json out = json::array();
|
json out = json::array();
|
||||||
for (const auto &prob : probs)
|
|
||||||
{
|
for (const auto & prob : probs) {
|
||||||
json probs_for_token = json::array();
|
json probs_for_token = json::array();
|
||||||
for (const auto &p : prob.probs)
|
|
||||||
{
|
for (const auto & p : prob.probs) {
|
||||||
std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
|
const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
|
||||||
probs_for_token.push_back(json
|
probs_for_token.push_back(json {
|
||||||
{
|
|
||||||
{"tok_str", tok_str},
|
{"tok_str", tok_str},
|
||||||
{"prob", p.prob},
|
{"prob", p.prob},
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
|
|
||||||
|
const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
|
||||||
out.push_back(json {
|
out.push_back(json {
|
||||||
{"content", tok_str},
|
{"content", tok_str},
|
||||||
{"probs", probs_for_token},
|
{"probs", probs_for_token},
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// OAI utils
|
||||||
|
//
|
||||||
|
|
||||||
|
// Translate an OpenAI-style chat completion request body into the parameter
// object used by the llama.cpp completion handler.
//
//   model         - passed through to format_chat() together with chat_template
//   body          - request JSON using OpenAI API semantics; must contain "messages"
//   chat_template - template handed to format_chat() to render the prompt
//
// Returns the llama.cpp parameter object. Throws the JSON library's
// out-of-range exception when "messages" is absent.
static json oaicompat_completion_params_parse(
    const struct llama_model * model,
    const json & body, /* openai api json semantics */
    const std::string & chat_template) {
    json llama_params;

    llama_params["__oaicompat"] = true;

    // Map OpenAI parameters to llama.cpp parameters
    //
    // For parameters that are defined by the OpenAI documentation (e.g.
    // temperature), we explicitly specify OpenAI's intended default; we
    // need to do that because sometimes OpenAI disagrees with llama.cpp
    //
    // https://platform.openai.com/docs/api-reference/chat/create
    llama_sampling_params default_sparams;
    llama_params["model"]             = json_value(body, "model", std::string("unknown"));
    // `body` is const: operator[] on a missing key is undefined behaviour there,
    // so use the checked accessor and let a missing "messages" raise an exception
    llama_params["prompt"]            = format_chat(model, chat_template, body.at("messages"));
    llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
    llama_params["temperature"]       = json_value(body, "temperature", 0.0);
    llama_params["top_k"]             = json_value(body, "top_k", default_sparams.top_k);
    llama_params["top_p"]             = json_value(body, "top_p", 1.0);
    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
    llama_params["logit_bias"]        = json_value(body, "logit_bias", json::object());
    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
    llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
    llama_params["stream"]            = json_value(body, "stream", false);
    llama_params["mirostat"]          = json_value(body, "mirostat", default_sparams.mirostat);
    llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
    llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
    llama_params["penalize_nl"]       = json_value(body, "penalize_nl", default_sparams.penalize_nl);
    llama_params["typical_p"]         = json_value(body, "typical_p", default_sparams.typical_p);
    llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
    llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
    llama_params["tfs_z"]             = json_value(body, "tfs_z", default_sparams.tfs_z);

    // "grammar" is only forwarded when the request actually carries it
    if (body.count("grammar") != 0) {
        llama_params["grammar"] = json_value(body, "grammar", json::object());
    }

    // Handle 'stop' field: a single string is wrapped into a one-element array
    if (body.contains("stop") && body["stop"].is_string()) {
        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
    } else {
        llama_params["stop"] = json_value(body, "stop", json::array());
    }

    // Ensure there is ChatML-specific end sequence among stop words
    llama_params["stop"].push_back("<|im_end|>");

    return llama_params;
}
|
||||||
|
|
||||||
|
// Build the final OpenAI-style chat completion object from a finished result.
// With streaming == true the single choice carries an empty "delta" and the
// object type is "chat.completion.chunk"; otherwise it carries the full
// assistant "message" and the type is "chat.completion".
static json format_final_response_oaicompat(const json & request, json result, bool streaming = false) {
    const bool hit_stop_word = result.count("stopped_word") != 0;
    const bool hit_eos       = json_value(result, "stopped_eos", false);
    const int  n_predicted   = json_value(result, "tokens_predicted", 0);
    const int  n_prompt      = json_value(result, "tokens_evaluated", 0);
    const std::string content = json_value(result, "content", std::string(""));

    // anything that did not end on a stop word or EOS is reported as "length"
    const std::string finish_reason = (hit_stop_word || hit_eos) ? "stop" : "length";

    json choice = json{{"finish_reason", finish_reason},
                       {"index", 0}};
    if (streaming) {
        choice["delta"] = json::object();
    } else {
        choice["message"] = json{{"content", content},
                                 {"role", "assistant"}};
    }
    json choices = json::array({choice});

    const std::time_t created = std::time(0);

    const json usage = json {
        {"completion_tokens", n_predicted},
        {"prompt_tokens",     n_prompt},
        {"total_tokens",      n_predicted + n_prompt}
    };

    json res = json {
        {"choices", choices},
        {"created", created},
        {"model",   json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
        {"object",  streaming ? "chat.completion.chunk" : "chat.completion"},
        {"usage",   usage},
        {"id",      gen_chatcmplid()}
    };

    // in verbose mode the raw result is attached for inspection
    if (server_verbose) {
        res["__verbose"] = result;
    }

    // forward token probabilities only when the result has them
    if (result.contains("completion_probabilities")) {
        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
    }

    return res;
}
|
||||||
|
|
||||||
|
// return value is vector as there is one case where we might need to generate two responses
|
||||||
|
static std::vector<json> format_partial_response_oaicompat(json result) {
|
||||||
|
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
|
||||||
|
return std::vector<json>({result});
|
||||||
|
}
|
||||||
|
|
||||||
|
bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
|
||||||
|
std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
|
||||||
|
|
||||||
|
bool stopped_word = json_value(result, "stopped_word", false);
|
||||||
|
bool stopped_eos = json_value(result, "stopped_eos", false);
|
||||||
|
bool stopped_limit = json_value(result, "stopped_limit", false);
|
||||||
|
std::string content = json_value(result, "content", std::string(""));
|
||||||
|
|
||||||
|
std::string finish_reason;
|
||||||
|
if (stopped_word || stopped_eos) {
|
||||||
|
finish_reason = "stop";
|
||||||
|
}
|
||||||
|
if (stopped_limit) {
|
||||||
|
finish_reason = "length";
|
||||||
|
}
|
||||||
|
|
||||||
|
std::time_t t = std::time(0);
|
||||||
|
|
||||||
|
json choices;
|
||||||
|
|
||||||
|
if (!finish_reason.empty()) {
|
||||||
|
choices = json::array({json{{"finish_reason", finish_reason},
|
||||||
|
{"index", 0},
|
||||||
|
{"delta", json::object()}}});
|
||||||
|
} else {
|
||||||
|
if (first) {
|
||||||
|
if (content.empty()) {
|
||||||
|
choices = json::array({json{{"finish_reason", nullptr},
|
||||||
|
{"index", 0},
|
||||||
|
{"delta", json{{"role", "assistant"}}}}});
|
||||||
|
} else {
|
||||||
|
// We have to send this as two updates to conform to openai behavior
|
||||||
|
json initial_ret = json{{"choices", json::array({json{
|
||||||
|
{"finish_reason", nullptr},
|
||||||
|
{"index", 0},
|
||||||
|
{"delta", json{
|
||||||
|
{"role", "assistant"}
|
||||||
|
}}}})},
|
||||||
|
{"created", t},
|
||||||
|
{"id", gen_chatcmplid()},
|
||||||
|
{"model", modelname},
|
||||||
|
{"object", "chat.completion.chunk"}};
|
||||||
|
|
||||||
|
json second_ret = json{
|
||||||
|
{"choices", json::array({json{{"finish_reason", nullptr},
|
||||||
|
{"index", 0},
|
||||||
|
{"delta", json{
|
||||||
|
{"content", content}}}
|
||||||
|
}})},
|
||||||
|
{"created", t},
|
||||||
|
{"id", gen_chatcmplid()},
|
||||||
|
{"model", modelname},
|
||||||
|
{"object", "chat.completion.chunk"}};
|
||||||
|
|
||||||
|
return std::vector<json>({initial_ret, second_ret});
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Some idiosyncrasy in task processing logic makes several trailing calls
|
||||||
|
// with empty content, we ignore these at the calee site.
|
||||||
|
if (content.empty()) {
|
||||||
|
return std::vector<json>({json::object()});
|
||||||
|
}
|
||||||
|
|
||||||
|
choices = json::array({json{
|
||||||
|
{"finish_reason", nullptr},
|
||||||
|
{"index", 0},
|
||||||
|
{"delta",
|
||||||
|
json{
|
||||||
|
{"content", content},
|
||||||
|
}},
|
||||||
|
}});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
json ret = json {
|
||||||
|
{"choices", choices},
|
||||||
|
{"created", t},
|
||||||
|
{"id", gen_chatcmplid()},
|
||||||
|
{"model", modelname},
|
||||||
|
{"object", "chat.completion.chunk"}
|
||||||
|
};
|
||||||
|
|
||||||
|
return std::vector<json>({ret});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wrap already-formatted embedding entries into an OpenAI-style "list" response.
// Token usage is always reported as zero.
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
    const std::string model_name = json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL));

    const json usage = json {
        {"prompt_tokens", 0},
        {"total_tokens", 0}
    };

    return json {
        {"model", model_name},
        {"object", "list"},
        {"usage", usage},
        {"data", embeddings}
    };
}
|
||||||
|
|
||||||
|
// Response body for tokenization: an object with a single "tokens" array.
static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
    json response = json::object();
    response["tokens"] = tokens;
    return response;
}
|
||||||
|
|
||||||
|
// Response body for detokenization: an object with a single "content" string.
static json format_detokenized_response(const std::string & content) {
    json response = json::object();
    response["content"] = content;
    return response;
}
|
||||||
|
|
|
@ -464,8 +464,8 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// NOTE: not tested
|
// NOTE: not tested
|
||||||
inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
||||||
int8x16_t res;
|
uint8x16_t res;
|
||||||
|
|
||||||
res[ 0] = a[b[ 0]];
|
res[ 0] = a[b[ 0]];
|
||||||
res[ 1] = a[b[ 1]];
|
res[ 1] = a[b[ 1]];
|
||||||
|
|
|
@ -3769,8 +3769,42 @@ void log_ggml_var_device(const char*name, float *src, size_t total_elements, boo
|
||||||
std::ofstream logfile;
|
std::ofstream logfile;
|
||||||
logfile.open(filename);
|
logfile.open(filename);
|
||||||
for(size_t i=0; i<total_elements; i++){
|
for(size_t i=0; i<total_elements; i++){
|
||||||
|
logfile << local_buf[i] <<" ";
|
||||||
|
if((i+1)%20 ==0) logfile <<std::endl;
|
||||||
|
}
|
||||||
|
logfile <<std::endl;
|
||||||
|
logfile.close();
|
||||||
|
|
||||||
|
if(src_on_device) ggml_sycl_host_free(local_buf);
|
||||||
|
}
|
||||||
|
|
||||||
|
void log_ggml_var_device_fp16(const char*name, sycl::half *src, size_t total_elements, bool src_on_device){
|
||||||
|
if(!g_ggml_sycl_debug) return;
|
||||||
|
if(!src){
|
||||||
|
printf("GGML Tensor:%s skip to save for NULL pointer\n", name);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
char filename[1024];
|
||||||
|
sprintf(filename, "%s.txt", name);
|
||||||
|
printf("GGML Tensor:%s save to %s\n", name, filename);
|
||||||
|
|
||||||
|
size_t total_size = total_elements*sizeof(sycl::half);
|
||||||
|
sycl::half *local_buf = NULL;
|
||||||
|
if(src_on_device) {
|
||||||
|
local_buf = (sycl::half *) ggml_sycl_host_malloc(total_size);
|
||||||
|
ggml_sycl_set_device(g_main_device);
|
||||||
|
dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
|
||||||
|
main_stream->memcpy(local_buf, src, total_size).wait();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
local_buf = (sycl::half *)src;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::ofstream logfile;
|
||||||
|
logfile.open(filename);
|
||||||
|
for(size_t i=0; i<total_elements; i++){
|
||||||
|
logfile << local_buf[i] <<" ";
|
||||||
if((i+1)%20 ==0) logfile <<std::endl;
|
if((i+1)%20 ==0) logfile <<std::endl;
|
||||||
else logfile << local_buf[i] <<" ";
|
|
||||||
}
|
}
|
||||||
logfile <<std::endl;
|
logfile <<std::endl;
|
||||||
logfile.close();
|
logfile.close();
|
||||||
|
@ -14126,7 +14160,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||||
src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
|
src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
|
||||||
dst_f16.get(), dpct::library_data_t::real_half, ldc,
|
dst_f16.get(), dpct::library_data_t::real_half, ldc,
|
||||||
dpct::library_data_t::real_half)));
|
dpct::library_data_t::real_half)));
|
||||||
|
g_sycl_handles[id]->wait();
|
||||||
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
|
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
|
||||||
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
|
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
|
||||||
}
|
}
|
||||||
|
@ -14159,6 +14193,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||||
dpct::get_value(&alpha, *g_sycl_handles[id]), src0_ddf_i, ne00,
|
dpct::get_value(&alpha, *g_sycl_handles[id]), src0_ddf_i, ne00,
|
||||||
src1_ddf1_i, ne10, dpct::get_value(&beta, *g_sycl_handles[id]),
|
src1_ddf1_i, ne10, dpct::get_value(&beta, *g_sycl_handles[id]),
|
||||||
dst_dd_i, ldc)));
|
dst_dd_i, ldc)));
|
||||||
|
g_sycl_handles[id]->wait();
|
||||||
}
|
}
|
||||||
(void) dst;
|
(void) dst;
|
||||||
(void) src1_ddq_i;
|
(void) src1_ddq_i;
|
||||||
|
@ -15295,8 +15330,8 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
||||||
sycl_pool_alloc<sycl::half> dst_f16;
|
sycl_pool_alloc<sycl::half> dst_f16;
|
||||||
char * dst_t;
|
char * dst_t;
|
||||||
|
|
||||||
dpct::library_data_t cu_compute_type = dpct::library_data_t::real_half;
|
dpct::library_data_t cu_compute_type = dpct::library_data_t::real_float;
|
||||||
dpct::library_data_t cu_data_type = dpct::library_data_t::real_half;
|
dpct::library_data_t cu_data_type = dpct::library_data_t::real_float;
|
||||||
|
|
||||||
// dst strides
|
// dst strides
|
||||||
size_t nbd2 = dst->nb[2];
|
size_t nbd2 = dst->nb[2];
|
||||||
|
@ -15308,15 +15343,13 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
||||||
const float alpha_f32 = 1.0f;
|
const float alpha_f32 = 1.0f;
|
||||||
const float beta_f32 = 0.0f;
|
const float beta_f32 = 0.0f;
|
||||||
|
|
||||||
const void * alpha = &alpha_f16;
|
const void * alpha = &alpha_f32;
|
||||||
const void * beta = &beta_f16;
|
const void * beta = &beta_f32;
|
||||||
|
|
||||||
// TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
|
// TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
|
||||||
// once oneMKL open source supports half, half, float, float: datatypes
|
// oneMKL open source supports half, half, float, float: datatypes
|
||||||
dst_t = (char *) dst_f16.alloc(ne_dst);
|
|
||||||
|
|
||||||
nbd2 /= sizeof(float) / sizeof(sycl::half);
|
dst_t = (char *) dst_ddf;
|
||||||
nbd3 /= sizeof(float) / sizeof(sycl::half);
|
|
||||||
|
|
||||||
GGML_ASSERT(ne12 % ne02 == 0);
|
GGML_ASSERT(ne12 % ne02 == 0);
|
||||||
GGML_ASSERT(ne13 % ne03 == 0);
|
GGML_ASSERT(ne13 % ne03 == 0);
|
||||||
|
@ -15356,6 +15389,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
||||||
nb11 / nb10, nb12 / nb10, beta,
|
nb11 / nb10, nb12 / nb10, beta,
|
||||||
(char *)dst_t, cu_data_type, ne01, nb2 / nb0,
|
(char *)dst_t, cu_data_type, ne01, nb2 / nb0,
|
||||||
ne12 * ne13, cu_compute_type)));
|
ne12 * ne13, cu_compute_type)));
|
||||||
|
g_sycl_handles[g_main_device]->wait();
|
||||||
} else {
|
} else {
|
||||||
const int ne23 = ne12*ne13;
|
const int ne23 = ne12*ne13;
|
||||||
|
|
||||||
|
@ -15386,7 +15420,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
||||||
nb02, nb03, nb12_scaled, nb13_scaled,
|
nb02, nb03, nb12_scaled, nb13_scaled,
|
||||||
nbd2, nbd3, r2, r3, item_ct1);
|
nbd2, nbd3, r2, r3, item_ct1);
|
||||||
});
|
});
|
||||||
});
|
}).wait();
|
||||||
}
|
}
|
||||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
||||||
*g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
|
*g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
|
||||||
|
@ -15397,11 +15431,10 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
||||||
dpct::library_data_t::real_half, nb11 / nb10, beta,
|
dpct::library_data_t::real_half, nb11 / nb10, beta,
|
||||||
(void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
|
(void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
|
||||||
cu_compute_type)));
|
cu_compute_type)));
|
||||||
|
g_sycl_handles[g_main_device]->wait();
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
|
|
||||||
to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
|
|
||||||
}
|
}
|
||||||
catch (sycl::exception const &exc) {
|
catch (sycl::exception const &exc) {
|
||||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||||
|
|
3
ggml.c
3
ggml.c
|
@ -2158,6 +2158,9 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
|
||||||
getcpu_ret = getcpu(¤t_cpu, &g_state.numa.current_node);
|
getcpu_ret = getcpu(¤t_cpu, &g_state.numa.current_node);
|
||||||
#else
|
#else
|
||||||
// old glibc doesn't have a wrapper for this call. Fall back on direct syscall
|
// old glibc doesn't have a wrapper for this call. Fall back on direct syscall
|
||||||
|
# if !defined(SYS_getcpu) && defined(SYS_get_cpu)
|
||||||
|
# define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
|
||||||
|
# endif
|
||||||
getcpu_ret = syscall(SYS_getcpu, ¤t_cpu, &g_state.numa.current_node);
|
getcpu_ret = syscall(SYS_getcpu, ¤t_cpu, &g_state.numa.current_node);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -14146,18 +14146,22 @@ LLAMA_API int32_t llama_chat_apply_template(
|
||||||
curr_tmpl = std::string(model_template.data(), model_template.size());
|
curr_tmpl = std::string(model_template.data(), model_template.size());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// format the chat to string
|
// format the chat to string
|
||||||
std::vector<const llama_chat_message *> chat_vec;
|
std::vector<const llama_chat_message *> chat_vec;
|
||||||
chat_vec.resize(n_msg);
|
chat_vec.resize(n_msg);
|
||||||
for (size_t i = 0; i < n_msg; i++) {
|
for (size_t i = 0; i < n_msg; i++) {
|
||||||
chat_vec[i] = &chat[i];
|
chat_vec[i] = &chat[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string formatted_chat;
|
std::string formatted_chat;
|
||||||
int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
|
int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
|
||||||
if (res < 0) {
|
if (res < 0) {
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
if (buf && length > 0) {
|
||||||
strncpy(buf, formatted_chat.c_str(), length);
|
strncpy(buf, formatted_chat.c_str(), length);
|
||||||
|
}
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,7 @@ except ImportError as e:
|
||||||
KEY_PROPERTIES = [
|
KEY_PROPERTIES = [
|
||||||
"cpu_info", "gpu_info", "n_gpu_layers", "main_gpu", "cuda", "opencl", "metal", "gpu_blas",
|
"cpu_info", "gpu_info", "n_gpu_layers", "main_gpu", "cuda", "opencl", "metal", "gpu_blas",
|
||||||
"blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_threads",
|
"blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_threads",
|
||||||
"type_k", "type_v", "no_kv_offload", "mul_mat_q", "tensor_split", "n_prompt", "n_gen"
|
"type_k", "type_v", "no_kv_offload", "tensor_split", "n_prompt", "n_gen"
|
||||||
]
|
]
|
||||||
|
|
||||||
# Properties that are boolean and are converted to Yes/No for the table:
|
# Properties that are boolean and are converted to Yes/No for the table:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue