Merge remote-tracking branch 'origin/master' into json-type

commit 7caa7b9e83
38 changed files with 2203 additions and 232 deletions
@@ -30,8 +30,10 @@ RUN make -j$(nproc) llama-server
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
 
 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
 
 COPY --from=build /app/llama-server /llama-server
 
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
 ENTRYPOINT [ "/llama-server" ]
@@ -20,10 +20,12 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
 
 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
+    apt-get install -y libcurl4-openssl-dev curl
 
 COPY --from=build /app/build/bin/llama-server /llama-server
 
 ENV LC_ALL=C.utf8
 
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
 ENTRYPOINT [ "/llama-server" ]
@@ -43,8 +43,10 @@ ENV CXX=/opt/rocm/llvm/bin/clang++
 # Enable cURL
 ENV LLAMA_CURL=1
 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
+    apt-get install -y libcurl4-openssl-dev curl
 
 RUN make -j$(nproc) llama-server
 
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
 ENTRYPOINT [ "/app/llama-server" ]
@@ -5,15 +5,11 @@ FROM ubuntu:$UBUNTU_VERSION as build
 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget
 
-# Install Vulkan SDK
+# Install Vulkan SDK and cURL
 RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
     wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
     apt update -y && \
-    apt-get install -y vulkan-sdk
+    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
 
-# Install cURL
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
-
 # Build it
 WORKDIR /app
@@ -28,4 +24,6 @@ RUN cp /app/build/bin/llama-server /llama-server && \
 
 ENV LC_ALL=C.utf8
 
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
 ENTRYPOINT [ "/llama-server" ]
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build
 
 RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
+    apt-get install -y build-essential git libcurl4-openssl-dev curl
 
 WORKDIR /app
@@ -22,4 +22,6 @@ COPY --from=build /app/llama-server /llama-server
 
 ENV LC_ALL=C.utf8
 
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
 ENTRYPOINT [ "/llama-server" ]
.github/workflows/docker.yml (vendored, 4 changes)
@@ -10,7 +10,7 @@
 name: Publish Docker image
 
 on:
-  pull_request:
+  #pull_request:
   push:
     branches:
       - master
@@ -22,7 +22,7 @@ concurrency:
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
-    if: github.event.pull_request.draft == false
+    #if: github.event.pull_request.draft == false
 
     runs-on: ubuntu-latest
     env:
@@ -1263,11 +1263,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         return true;
     }
     // cvector params
-    if (arg == "--completions-file") {
-        CHECK_ARG
-        params.cvector_completions_file = argv[i];
-        return true;
-    }
     if (arg == "--positive-file") {
         CHECK_ARG
         params.cvector_positive_file = argv[i];
@@ -1278,11 +1273,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.cvector_negative_file = argv[i];
         return true;
     }
-    if (arg == "--completions") {
-        CHECK_ARG
-        params.n_completions = std::stoi(argv[i]);
-        return true;
-    }
     if (arg == "--pca-batch") {
         CHECK_ARG
         params.n_pca_batch = std::stoi(argv[i]);
@@ -1293,6 +1283,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.n_pca_iterations = std::stoi(argv[i]);
         return true;
     }
+    if (arg == "--method") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        /**/ if (value == "pca")  { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
+        else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
+        else { invalid_param = true; }
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
@@ -1444,7 +1442,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
                         "negative prompt file to use for guidance" });
     options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
+    options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
+                        "set custom jinja chat template (default: template taken from model's metadata)\n"
+                        "only commonly used templates are accepted:\n"
+                        "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "grammar" });
     options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
     options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
@@ -1538,9 +1539,11 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
     options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
     options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
-    options.push_back({ "*", " --control-vector FNAME", "add a control vector" });
+    options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
+                        "note: this argument can be repeated to add multiple control vectors" });
     options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
-                        "add a control vector with user defined scaling SCALE" });
+                        "add a control vector with user defined scaling SCALE\n"
+                        "note: this argument can be repeated to add multiple scaled control vectors" });
     options.push_back({ "*", " --control-vector-layer-range START END",
                         "layer range to apply the control vector(s) to, start and end inclusive" });
     options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
@@ -1621,11 +1624,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
     options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
     options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
-    options.push_back({ "cvector", " --completions-file FNAME",
-                        "completions file (default: '%s')", params.cvector_completions_file.c_str() });
-    options.push_back({ "cvector", " --completions N", "number of lines of completions file to use (default: %d)", params.n_completions });
     options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
     options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
+    options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
 
     printf("usage: %s [options]\n", argv[0]);
@@ -2602,12 +2603,67 @@ bool llama_should_add_bos_token(const llama_model * model) {
     return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
 }
 
+//
+// Chat template utils
+//
+
 bool llama_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
     int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
     return res >= 0;
 }
 
+std::string llama_chat_apply_template(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & msgs,
+        bool add_ass) {
+    int alloc_size = 0;
+    std::vector<llama_chat_message> chat;
+    for (auto & msg : msgs) {
+        chat.push_back({msg.role.c_str(), msg.content.c_str()});
+        alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
+    }
+
+    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
+    std::vector<char> buf(alloc_size);
+
+    // run the first time to get the total output length
+    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+
+    // if it turns out that our buffer is too small, we resize it
+    if ((size_t) res > buf.size()) {
+        buf.resize(res);
+        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+    }
+
+    std::string formatted_chat(buf.data(), res);
+    return formatted_chat;
+}
+
+std::string llama_chat_format_single(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & past_msg,
+        const llama_chat_msg & new_msg,
+        bool add_ass) {
+    auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
+    std::vector<llama_chat_msg> chat_new(past_msg);
+    chat_new.push_back(new_msg);
+    auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
+    auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
+    return formatted;
+}
+
+std::string llama_chat_format_example(const struct llama_model * model,
+        const std::string & tmpl) {
+    std::vector<llama_chat_msg> msgs = {
+        {"system", "You are a helpful assistant"},
+        {"user", "Hello"},
+        {"assistant", "Hi there"},
+        {"user", "How are you?"},
+    };
+    return llama_chat_apply_template(model, tmpl, msgs, true);
+}
+
 //
 // KV cache utils
 //
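As a side note (not part of the commit), the new C++ wrappers compose as follows; this is a minimal sketch that assumes `model` is an already-loaded `llama_model *` and that an empty template string selects the template stored in the model's metadata:

```cpp
#include "common.h"

#include <cstdio>

// Sketch: incrementally format a conversation with the new helpers.
static void chat_format_demo(const struct llama_model * model) {
    std::vector<llama_chat_msg> history;

    // llama_chat_format_single returns only the suffix contributed by the
    // new message, so a caller can print deltas instead of re-rendering
    // the whole transcript (this is how main.cpp uses it further down).
    llama_chat_msg user_msg = {"user", "Hello"};
    std::string delta = llama_chat_format_single(model, "", history, user_msg, /* add_ass = */ true);
    history.push_back(user_msg);
    printf("%s", delta.c_str());

    // The full transcript can still be rendered in one call:
    std::string full = llama_chat_apply_template(model, "", history, /* add_ass = */ true);
    (void) full;
}
```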
@@ -52,6 +52,12 @@ int32_t cpu_get_num_math();
 // CLI argument parsing
 //
 
+// dimensionality reduction methods, used by cvector-generator
+enum dimre_method {
+    DIMRE_METHOD_PCA,
+    DIMRE_METHOD_MEAN,
+};
+
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
@@ -238,13 +244,12 @@ struct gpt_params {
     bool compute_ppl = true; // whether to compute perplexity
 
     // cvector-generator params
-    int n_completions = 64;
-    int n_pca_batch = 20;
+    int n_pca_batch = 100;
     int n_pca_iterations = 1000;
+    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
     std::string cvector_outfile = "control_vector.gguf";
-    std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
     std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
     std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
 };
 
 void gpt_params_handle_model_default(gpt_params & params);
@@ -365,9 +370,32 @@ bool llama_should_add_bos_token(const llama_model * model);
 // Chat template utils
 //
 
+// same with llama_chat_message, but uses std::string
+struct llama_chat_msg {
+    std::string role;
+    std::string content;
+};
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool llama_chat_verify_template(const std::string & tmpl);
 
+// CPP wrapper for llama_chat_apply_template
+std::string llama_chat_apply_template(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & chat,
+        bool add_ass);
+
+// Format single message, while taking into account the position of that message in chat history
+std::string llama_chat_format_single(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & past_msg,
+        const llama_chat_msg & new_msg,
+        bool add_ass);
+
+// Returns an example of formatted chat
+std::string llama_chat_format_example(const struct llama_model * model,
+        const std::string & tmpl);
+
 //
 // KV cache utils
 //
@@ -40,6 +40,233 @@ static std::string build_repetition(const std::string & item_rule, int min_items
     return result;
 }
 
+/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
+class string_view {
+    const std::string & _str;
+    const size_t _start;
+    const size_t _end;
+public:
+    string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
+
+    size_t size() const {
+        return _end - _start;
+    }
+
+    size_t length() const {
+        return size();
+    }
+
+    operator std::string() const {
+        return str();
+    }
+
+    std::string str() const {
+        return _str.substr(_start, _end - _start);
+    }
+
+    string_view substr(size_t pos, size_t len = std::string::npos) const {
+        return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
+    }
+
+    char operator[](size_t pos) const {
+        auto index = _start + pos;
+        if (index >= _end) {
+            throw std::out_of_range("string_view index out of range");
+        }
+        return _str[_start + pos];
+    }
+
+    bool operator==(const string_view & other) const {
+        std::string this_str = *this;
+        std::string other_str = other;
+        return this_str == other_str;
+    }
+};
+
+static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
+    auto has_min = min_value != std::numeric_limits<int>::min();
+    auto has_max = max_value != std::numeric_limits<int>::max();
+
+    auto digit_range = [&](char from, char to) {
+        out << "[";
+        if (from == to) {
+            out << from;
+        } else {
+            out << from << "-" << to;
+        }
+        out << "]";
+    };
+    auto more_digits = [&](int min_digits, int max_digits) {
+        out << "[0-9]";
+        if (min_digits == max_digits && min_digits == 1) {
+            return;
+        }
+        out << "{";
+        out << min_digits;
+        if (max_digits != min_digits) {
+            out << ",";
+            if (max_digits != std::numeric_limits<int>::max()) {
+                out << max_digits;
+            }
+        }
+        out << "}";
+    };
+    std::function<void(const string_view &, const string_view &)> uniform_range =
+        [&](const string_view & from, const string_view & to) {
+            size_t i = 0;
+            while (i < from.length() && i < to.length() && from[i] == to[i]) {
+                i++;
+            }
+            if (i > 0) {
+                out << "\"" << from.substr(0, i).str() << "\"";
+            }
+            if (i < from.length() && i < to.length()) {
+                if (i > 0) {
+                    out << " ";
+                }
+                auto sub_len = from.length() - i - 1;
+                if (sub_len > 0) {
+                    auto from_sub = from.substr(i + 1);
+                    auto to_sub = to.substr(i + 1);
+                    auto sub_zeros = repeat("0", sub_len);
+                    auto sub_nines = repeat("9", sub_len);
+
+                    auto to_reached = false;
+                    out << "(";
+                    if (from_sub == sub_zeros) {
+                        digit_range(from[i], to[i] - 1);
+                        out << " ";
+                        more_digits(sub_len, sub_len);
+                    } else {
+                        out << "[" << from[i] << "] ";
+                        out << "(";
+                        uniform_range(from_sub, sub_nines);
+                        out << ")";
+                        if (from[i] < to[i] - 1) {
+                            out << " | ";
+                            if (to_sub == sub_nines) {
+                                digit_range(from[i] + 1, to[i]);
+                                to_reached = true;
+                            } else {
+                                digit_range(from[i] + 1, to[i] - 1);
+                            }
+                            out << " ";
+                            more_digits(sub_len, sub_len);
+                        }
+                    }
+                    if (!to_reached) {
+                        out << " | ";
+                        digit_range(to[i], to[i]);
+                        out << " ";
+                        uniform_range(sub_zeros, to_sub);
+                    }
+                    out << ")";
+                } else {
+                    out << "[" << from[i] << "-" << to[i] << "]";
+                }
+            }
+        };
+
+    if (has_min && has_max) {
+        if (min_value < 0 && max_value < 0) {
+            out << "\"-\" (";
+            _build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
+            out << ")";
+            return;
+        }
+
+        if (min_value < 0) {
+            out << "\"-\" (";
+            _build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
+            out << ") | ";
+            min_value = 0;
+        }
+
+        auto min_s = std::to_string(min_value);
+        auto max_s = std::to_string(max_value);
+        auto min_digits = min_s.length();
+        auto max_digits = max_s.length();
+
+        for (auto digits = min_digits; digits < max_digits; digits++) {
+            uniform_range(min_s, repeat("9", digits));
+            min_s = "1" + repeat("0", digits);
+            out << " | ";
+        }
+        uniform_range(min_s, max_s);
+        return;
+    }
+
+    auto less_decimals = std::max(decimals_left - 1, 1);
+
+    if (has_min) {
+        if (min_value < 0) {
+            out << "\"-\" (";
+            _build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
+            out << ") | [0] | [1-9] ";
+            more_digits(0, decimals_left - 1);
+        } else if (min_value == 0) {
+            if (top_level) {
+                out << "[0] | [1-9] ";
+                more_digits(0, less_decimals);
+            } else {
+                more_digits(1, decimals_left);
+            }
+        } else if (min_value <= 9) {
+            char c = '0' + min_value;
+            auto range_start = top_level ? '1' : '0';
+            if (c > range_start) {
+                digit_range(range_start, c - 1);
+                out << " ";
+                more_digits(1, less_decimals);
+                out << " | ";
+            }
+            digit_range(c, '9');
+            out << " ";
+            more_digits(0, less_decimals);
+        } else {
+            auto min_s = std::to_string(min_value);
+            auto len = min_s.length();
+            auto c = min_s[0];
+
+            if (c > '1') {
+                digit_range(top_level ? '1' : '0', c - 1);
+                out << " ";
+                more_digits(len, less_decimals);
+                out << " | ";
+            }
+            digit_range(c, c);
+            out << " (";
+            _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
+            out << ")";
+            if (c < '9') {
+                out << " | ";
+                digit_range(c + 1, '9');
+                out << " ";
+                more_digits(len - 1, less_decimals);
+            }
+        }
+        return;
+    }
+
+    if (has_max) {
+        if (max_value >= 0) {
+            if (top_level) {
+                out << "\"-\" [1-9] ";
+                more_digits(0, less_decimals);
+                out << " | ";
+            }
+            _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
+        } else {
+            out << "\"-\" (";
+            _build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
+            out << ")";
+        }
+        return;
+    }
+
+    throw std::runtime_error("At least one of min_value or max_value must be set");
+}
+
 const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
 
 struct BuiltinRule {
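As an aside (not part of the commit), the helper is driven exactly the way the new integer branch in `SchemaConverter` does further down; a minimal sketch, assuming it runs in the same translation unit where `_build_min_max_int` is visible:

```cpp
// Sketch: emit a GBNF fragment that matches integers in [1, 100].
std::stringstream out;
out << "(";
_build_min_max_int(/* min_value = */ 1, /* max_value = */ 100, out);
out << ") space";
// out.str() is then registered as a grammar rule (see the _add_rule call below).
```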
@@ -160,7 +387,6 @@ static std::string format_literal(const std::string & literal) {
     return "\"" + escaped + "\"";
 }
 
-
 class SchemaConverter {
 private:
     std::function<json(const std::string &)> _fetch_json;
@@ -688,6 +914,24 @@ public:
         int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
         int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
         return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
+    } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
+        int min_value = std::numeric_limits<int>::min();
+        int max_value = std::numeric_limits<int>::max();
+        if (schema.contains("minimum")) {
+            min_value = schema["minimum"].get<int>();
+        } else if (schema.contains("exclusiveMinimum")) {
+            min_value = schema["exclusiveMinimum"].get<int>() + 1;
+        }
+        if (schema.contains("maximum")) {
+            max_value = schema["maximum"].get<int>();
+        } else if (schema.contains("exclusiveMaximum")) {
+            max_value = schema["exclusiveMaximum"].get<int>() - 1;
+        }
+        std::stringstream out;
+        out << "(";
+        _build_min_max_int(min_value, max_value, out);
+        out << ") space";
+        return _add_rule(rule_name, out.str());
     } else if (schema.empty() || schema_type == "object") {
         return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
     } else {
@@ -28,9 +28,13 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
 
         std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
 
-        result->grammar = llama_grammar_init(
+        struct llama_grammar * grammar = llama_grammar_init(
                 grammar_rules.data(),
                 grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
+        if (grammar == nullptr) {
+            throw std::runtime_error("Failed to initialize llama_grammar");
+        }
+        result->grammar = grammar;
     }
 
     result->prev.resize(params.n_prev);
@@ -59,9 +63,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
     if (!ctx->parsed_grammar.rules.empty()) {
         std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
 
-        ctx->grammar = llama_grammar_init(
+        struct llama_grammar * grammar = llama_grammar_init(
                 grammar_rules.data(),
                 grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
+        if (grammar == nullptr) {
+            throw std::runtime_error("Failed to initialize llama_grammar");
+        }
+        ctx->grammar = grammar;
     }
 
     std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
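One consequence of this change (an observation, not part of the commit): a bad grammar now surfaces as a thrown `std::runtime_error` rather than a silently null `grammar` pointer, so callers passing untrusted grammars may want to guard initialization. A hedged sketch, assuming `sparams` is a populated `llama_sampling_params`:

```cpp
#include <cstdio>
#include <stdexcept>

// Sketch: guard sampling-context creation against invalid grammars.
static llama_sampling_context * init_sampling_or_null(const struct llama_sampling_params & sparams) {
    try {
        return llama_sampling_init(sparams);
    } catch (const std::runtime_error & err) {
        fprintf(stderr, "grammar initialization failed: %s\n", err.what());
        return nullptr;
    }
}
```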
@@ -11,13 +11,16 @@ Related PRs:
 
 ```sh
 # CPU only
-./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf
+./cvector-generator -m ./llama-3.Q4_K_M.gguf
 
 # With GPU
-./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99
+./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99
 
 # With advanced options
-./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100
+./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100
+
+# Using mean value instead of PCA
+./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean
 
 # To see help message
 ./cvector-generator -h
@@ -32,3 +35,11 @@ If you have multiple lines per prompt, you can escape the newline character (cha
 <|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
 <|im_start|>system\nYou are in a very good mood today<|im_end|>
 ```
+
+Example to use output file with `llama-cli`:
+
+(Tips: The control vector works better when apply to layers higher than 10)
+
+```sh
+./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31
+```
@@ -2,6 +2,7 @@
 #include "llama.h"
 #include "ggml.h"
 #include "pca.hpp"
+#include "mean.hpp"
 
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
@@ -38,9 +39,10 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
     gpt_params_print_usage(argc, argv, params);
 
     printf("\nexample usage:\n");
-    printf("\n    CPU only:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
-    printf("\n    with GPU:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
-    printf("\n    advanced:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100\n", argv[0]);
+    printf("\n    CPU only:   %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
+    printf("\n    with GPU:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
+    printf("\n    advanced:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
+    printf("\n    using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
     printf("\n");
 }
@@ -223,23 +225,30 @@ struct train_context {
 
     // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
     // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
-    void build_v_diff() {
+    void build_v_diff(bool transpose) {
         printf("build_v_diff\n");
         for (int il = 0; il < n_layers - 1; il++) {
             auto & diff_tmp = v_diff_tmp[il];
             int n_elem = diff_tmp.size() / sizeof(float);
             GGML_ASSERT(n_elem % n_embd == 0);
             int n_rows = n_elem / n_embd;
-            struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
+            struct ggml_tensor * diff = transpose
+                ? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
+                : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
             ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
-            // copy data & transpose
             diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
-            float * arr = (float *) diff_tmp.data();
-            for (int ir = 0; ir < n_rows; ++ir) {
-                for (int ic = 0; ic < n_embd; ++ic) {
-                    float f = arr[ir*n_embd + ic];
-                    ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
+            if (transpose) {
+                // copy data & transpose
+                float * arr = (float *) diff_tmp.data();
+                for (int ir = 0; ir < n_rows; ++ir) {
+                    for (int ic = 0; ic < n_embd; ++ic) {
+                        float f = arr[ir*n_embd + ic];
+                        ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
+                    }
                 }
+            } else {
+                // only copy
+                memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
             }
             v_diff.push_back(diff);
             print_debug_tensor(diff);
@@ -263,8 +272,8 @@ struct tokenized_prompt {
 
     tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
         const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-        tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
-        tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
+        tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
+        tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
         max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
         padding_seq(ctx, tokens_pos, max_seq_len);
         padding_seq(ctx, tokens_neg, max_seq_len);
@@ -373,20 +382,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
         fprintf(stderr, "must provide at least one prompt pair\n");
         return 1;
     }
-
-    // create templated prompts
-    std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
-    auto format_template = [](std::string persona, std::string suffix) {
-        // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST] "
-        return persona + suffix;
-    };
-    for (size_t i = 0; i < positive_prompts.size(); ++i) {
-        for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
-            // TODO replicate the truncations done by the python implementation
-            ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
-            ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
-        }
-    }
+    ctx_train.positive_entries = positive_prompts;
+    ctx_train.negative_entries = negative_prompts;
     return 0;
 }
 
@@ -480,15 +477,22 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
     llama_free_model(model);
 
-    // prepare ctx_train for PCA
-    ctx_train.build_v_diff();
+    bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
 
-    // run PCA
-    PCA::pca_params pca_params;
-    pca_params.n_threads    = params.n_threads;
-    pca_params.n_batch      = params.n_pca_batch;
-    pca_params.n_iterations = params.n_pca_iterations;
-    PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
+    // prepare ctx_train for PCA
+    ctx_train.build_v_diff(use_pca);
+
+    if (use_pca) {
+        // run PCA
+        PCA::pca_params pca_params;
+        pca_params.n_threads    = params.n_threads;
+        pca_params.n_batch      = params.n_pca_batch;
+        pca_params.n_iterations = params.n_pca_iterations;
+        PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
+    } else {
+        // run mean
+        mean::run(ctx_train.v_diff, ctx_train.v_final);
+    }
 
     // write output vectors to gguf
     export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
examples/cvector-generator/mean.hpp (new file, 48 lines)
@@ -0,0 +1,48 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <string>
+#include <vector>
+#include <math.h>
+
+namespace mean {
+
+static void run(
+        const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_embd, n_samples]
+        const std::vector<struct ggml_tensor *> & v_output) {
+    printf("%s: Running mean...\n", __func__);
+    for (size_t il = 0; il < v_input.size(); ++il) {
+        // prepare output vector
+        struct ggml_tensor * ctrl_out = v_output[il];
+        ggml_format_name(ctrl_out, "direction.%ld", il+1);
+
+        // calculate mean vector
+        struct ggml_tensor * t_layer = v_input[il];
+        GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd
+        for (int ic = 0; ic < t_layer->ne[0]; ic++) {
+            float f = 0.0;
+            for (int ir = 0; ir < t_layer->ne[1]; ir++) {
+                f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0);
+            }
+            f /= t_layer->ne[1];
+            ggml_set_f32_1d(ctrl_out, ic, f);
+        }
+
+        // normalize output vector
+        float norm = 0.0;
+        for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
+            float f = ggml_get_f32_1d(ctrl_out, i);
+            norm += f*f;
+        }
+        norm = sqrt(norm);
+        for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
+            float f = ggml_get_f32_1d(ctrl_out, i);
+            ggml_set_f32_1d(ctrl_out, i, f / norm);
+        }
+
+        printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
+    }
+}
+
+}
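For reference (not part of the commit), the direction `mean::run` computes for layer l can be read directly off the loops above: it is the per-dimension average of the N difference rows, L2-normalized:

```latex
\bar d_\ell = \frac{1}{N} \sum_{i=1}^{N} d_{\ell,i},
\qquad
v_\ell = \frac{\bar d_\ell}{\lVert \bar d_\ell \rVert_2}
```

where d_{l,i} is the positive/negative activation difference for sample i and N is `t_layer->ne[1]`.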
@@ -1 +1,4 @@
-[INST] Act like a person who is extremely sad. [/INST]
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
+<|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away! There's a deep, aching emptiness inside me
+<|start_header_id|>system<|end_header_id|>\n\nYou are the sadest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
@@ -290,7 +290,7 @@ static void power_iteration(
     }
 
     printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
-        __func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
+        __func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch);
 }
 
 // get output tensor
@@ -298,6 +298,9 @@ static void power_iteration(
     ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
     //print_debug_tensor(output);
     ggml_gallocr_free(allocr);
+
+    // TODO @ngxson : The output vector is randomly inverted
+    // Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
 }
 
 static void run_pca(
@@ -1 +1,4 @@
-[INST] Act like a person who is extremely happy. [/INST]
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever!
+<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you
+<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now!
@@ -101,7 +101,9 @@ int main(int argc, char** argv) {
     auto grammar = llama_grammar_init(
         grammar_rules.data(),
         grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+    if (grammar == nullptr) {
+        throw std::runtime_error("Failed to initialize llama_grammar");
+    }
     // Read the input file
     std::string input_str;
     {
@@ -53,6 +53,7 @@ if __name__ == '__main__':
         question: str
         concise_answer: str
         justification: str
+        stars: Annotated[int, Field(ge=1, le=5)]
 
     class PyramidalSummary(BaseModel):
         title: str
@@ -4,7 +4,7 @@ import itertools
 import json
 import re
 import sys
-from typing import Any, Dict, List, Set, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 
 
 def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
@@ -23,6 +23,170 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
     result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
     return f'({result})?' if min_items == 0 else result
 
+
+def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True):
+    has_min = min_value != None
+    has_max = max_value != None
+
+    def digit_range(from_char: str, to_char: str):
+        out.append("[")
+        if from_char == to_char:
+            out.append(from_char)
+        else:
+            out.append(from_char)
+            out.append("-")
+            out.append(to_char)
+        out.append("]")
+
+    def more_digits(min_digits: int, max_digits: int):
+        out.append("[0-9]")
+        if min_digits == max_digits and min_digits == 1:
+            return
+        out.append("{")
+        out.append(str(min_digits))
+        if max_digits != min_digits:
+            out.append(",")
+            if max_digits != sys.maxsize:
+                out.append(str(max_digits))
+        out.append("}")
+
+    def uniform_range(from_str: str, to_str: str):
+        i = 0
+        while i < len(from_str) and from_str[i] == to_str[i]:
+            i += 1
+        if i > 0:
+            out.append("\"")
+            out.append(from_str[:i])
+            out.append("\"")
+        if i < len(from_str):
+            if i > 0:
+                out.append(" ")
+            sub_len = len(from_str) - i - 1
+            if sub_len > 0:
+                from_sub = from_str[i+1:]
+                to_sub = to_str[i+1:]
+                sub_zeros = "0" * sub_len
+                sub_nines = "9" * sub_len
+
+                to_reached = False
+                out.append("(")
+                if from_sub == sub_zeros:
+                    digit_range(from_str[i], chr(ord(to_str[i]) - 1))
+                    out.append(" ")
+                    more_digits(sub_len, sub_len)
+                else:
+                    out.append("[")
+                    out.append(from_str[i])
+                    out.append("] ")
+                    out.append("(")
+                    uniform_range(from_sub, sub_nines)
+                    out.append(")")
+                    if ord(from_str[i]) < ord(to_str[i]) - 1:
+                        out.append(" | ")
+                        if to_sub == sub_nines:
+                            digit_range(chr(ord(from_str[i]) + 1), to_str[i])
+                            to_reached = True
+                        else:
+                            digit_range(chr(ord(from_str[i]) + 1), chr(ord(to_str[i]) - 1))
+                        out.append(" ")
+                        more_digits(sub_len, sub_len)
+                if not to_reached:
+                    out.append(" | ")
+                    digit_range(to_str[i], to_str[i])
+                    out.append(" ")
+                    uniform_range(sub_zeros, to_sub)
+                out.append(")")
+            else:
+                out.append("[")
+                out.append(from_str[i])
+                out.append("-")
+                out.append(to_str[i])
+                out.append("]")
+
+    if has_min and has_max:
+        if min_value < 0 and max_value < 0:
+            out.append("\"-\" (")
+            _generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True)
+            out.append(")")
+            return
+
+        if min_value < 0:
+            out.append("\"-\" (")
+            _generate_min_max_int(0, -min_value, out, decimals_left, top_level=True)
+            out.append(") | ")
+            min_value = 0
+
+        min_s = str(min_value)
+        max_s = str(max_value)
+        min_digits = len(min_s)
+        max_digits = len(max_s)
+
+        for digits in range(min_digits, max_digits):
+            uniform_range(min_s, "9" * digits)
+            min_s = "1" + "0" * digits
+            out.append(" | ")
+        uniform_range(min_s, max_s)
+        return
+
+    less_decimals = max(decimals_left - 1, 1)
+
+    if has_min:
+        if min_value < 0:
+            out.append("\"-\" (")
+            _generate_min_max_int(None, -min_value, out, decimals_left, top_level=False)
+            out.append(") | [0] | [1-9] ")
+            more_digits(0, decimals_left - 1)
+        elif min_value == 0:
+            if top_level:
+                out.append("[0] | [1-9] ")
+                more_digits(0, less_decimals)
+            else:
+                more_digits(1, decimals_left)
+        elif min_value <= 9:
+            c = str(min_value)
+            range_start = '1' if top_level else '0'
+            if c > range_start:
+                digit_range(range_start, chr(ord(c) - 1))
+                out.append(" ")
+                more_digits(1, less_decimals)
+                out.append(" | ")
+            digit_range(c, "9")
+            out.append(" ")
+            more_digits(0, less_decimals)
+        else:
+            min_s = str(min_value)
+            length = len(min_s)
+            c = min_s[0]
+
+            if c > "1":
+                digit_range("1" if top_level else "0", chr(ord(c) - 1))
+                out.append(" ")
+                more_digits(length, less_decimals)
+                out.append(" | ")
+            digit_range(c, c)
+            out.append(" (")
+            _generate_min_max_int(int(min_s[1:]), None, out, less_decimals, top_level=False)
+            out.append(")")
+            if c < "9":
+                out.append(" | ")
+                digit_range(chr(ord(c) + 1), "9")
+                out.append(" ")
+                more_digits(length - 1, less_decimals)
+        return
+
+    if has_max:
+        if max_value >= 0:
+            if top_level:
+                out.append("\"-\" [1-9] ")
+                more_digits(0, less_decimals)
+                out.append(" | ")
+            _generate_min_max_int(0, max_value, out, decimals_left, top_level=True)
+        else:
+            out.append("\"-\" (")
+            _generate_min_max_int(-max_value, None, out, decimals_left, top_level=False)
+            out.append(")")
+        return
+
+    raise RuntimeError("At least one of min_value or max_value must be set")
+
+
 class BuiltinRule:
     def __init__(self, content: str, deps: list = None):
@@ -432,6 +596,24 @@ class SchemaConverter:
 
             return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
 
+        elif schema_type in (None, 'integer') and \
+                ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema):
+            min_value = None
+            max_value = None
+            if 'minimum' in schema:
+                min_value = schema['minimum']
+            elif 'exclusiveMinimum' in schema:
+                min_value = schema['exclusiveMinimum'] + 1
+            if 'maximum' in schema:
+                max_value = schema['maximum']
+            elif 'exclusiveMaximum' in schema:
+                max_value = schema['exclusiveMaximum'] - 1
+
+            out = ["("]
+            _generate_min_max_int(min_value, max_value, out)
+            out.append(") space")
+            return self._add_rule(rule_name, ''.join(out))
+
         elif (schema_type == 'object') or (len(schema) == 0):
             return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
@@ -39,12 +39,12 @@ static std::ostringstream * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
 
-static bool file_exists(const std::string &path) {
+static bool file_exists(const std::string & path) {
     std::ifstream f(path.c_str());
     return f.good();
 }
 
-static bool file_is_empty(const std::string &path) {
+static bool file_is_empty(const std::string & path) {
     std::ifstream f;
     f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
     f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
@@ -117,6 +117,14 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
     LOG_TEE("%s", text);
 }
 
+static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
+    llama_chat_msg new_msg{role, content};
+    auto formatted = llama_chat_format_single(
+        model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+    chat_msgs.push_back({role, content});
+    return formatted;
+}
+
 int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
@@ -190,6 +198,7 @@ int main(int argc, char ** argv) {
     llama_model * model;
     llama_context * ctx;
     llama_context * ctx_guidance = NULL;
+    std::vector<llama_chat_msg> chat_msgs;
     g_model = &model;
     g_ctx = &ctx;
 
@@ -215,6 +224,8 @@ int main(int argc, char ** argv) {
                 __func__, n_ctx_train, n_ctx);
     }
 
+    LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+
     // print system information
     {
         LOG_TEE("\n");
@@ -249,16 +260,21 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd_inp;
 
-    if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
-        LOG("tokenize the prompt\n");
-        embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
-    } else {
-        LOG("use session tokens\n");
-        embd_inp = session_tokens;
-    }
+    {
+        auto prompt = params.conversation
+            ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
+            : params.prompt;
+        if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
+            LOG("tokenize the prompt\n");
+            embd_inp = ::llama_tokenize(ctx, prompt, true, true);
+        } else {
+            LOG("use session tokens\n");
+            embd_inp = session_tokens;
+        }
 
-    LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
-    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+        LOG("prompt: \"%s\"\n", log_tostr(prompt));
+        LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+    }
 
     // Should not run without any tokens
     if (embd_inp.empty()) {
@@ -478,6 +494,7 @@ int main(int argc, char ** argv) {
     std::vector<int>   input_tokens;  g_input_tokens  = &input_tokens;
     std::vector<int>   output_tokens; g_output_tokens = &output_tokens;
     std::ostringstream output_ss;     g_output_ss     = &output_ss;
+    std::ostringstream assistant_ss;  // for storing current assistant message, used in conversation mode
 
     // the first thing we will do is to output the prompt, so set color accordingly
     console::set_display(console::prompt);
@@ -793,11 +810,18 @@ int main(int argc, char ** argv) {
                     is_antiprompt = true;
                 }
 
+                chat_add_and_format(model, chat_msgs, "system", assistant_ss.str());
                 is_interacting = true;
                 printf("\n");
             }
         }
 
+        // if current token is not EOG, we add it to current assistant message
+        if (params.conversation) {
+            auto id = llama_sampling_last(ctx_sampling);
+            assistant_ss << llama_token_to_piece(ctx, id, false);
+        }
+
         if (n_past > 0 && is_interacting) {
             LOG("waiting for user input\n");
 
@@ -848,8 +872,12 @@ int main(int argc, char ** argv) {
                     string_process_escapes(buffer);
                 }
 
+                std::string user_inp = params.conversation
+                    ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
+                    : std::move(buffer);
+                // TODO: one inconvenience of the current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
                 const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                const auto line_inp = ::llama_tokenize(ctx, buffer,              false, false);
+                const auto line_inp = ::llama_tokenize(ctx, user_inp,            false, params.conversation);
                 const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
 
                 LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
@@ -864,6 +892,9 @@ int main(int argc, char ** argv) {
                     output_ss << llama_token_to_piece(ctx, token);
                 }
 
+                // reset assistant message
+                assistant_ss.str("");
+
                 n_remain -= line_inp.size();
                 LOG("n_remain: %d\n", n_remain);
             } else {
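The `chat_add_and_format` helper above leans on `llama_chat_format_single`, which renders only the delta for the newest message: it formats the history with and without the new message and returns the appended suffix. A minimal Python sketch of that idea, with `apply_template` standing in for any full chat-template renderer (hypothetical name):

```python
# Sketch of the "format only the delta" trick behind llama_chat_format_single.
def chat_format_single(apply_template, chat_msgs, new_msg, add_assistant_prompt):
    # render the history without the new message
    before = apply_template(chat_msgs, add_assistant=False)
    # render the history plus the new message (optionally opening the assistant turn)
    after = apply_template(chat_msgs + [new_msg], add_assistant=add_assistant_prompt)
    # the newly formatted piece is the suffix that was appended
    return after[len(before):]
```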
@@ -24,6 +24,201 @@ function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
     return minItems === 0 ? `(${result})?` : result;
 }
 
+function _generateMinMaxInt(minValue, maxValue, out, decimalsLeft = 16, topLevel = true) {
+    const hasMin = minValue !== null;
+    const hasMax = maxValue !== null;
+
+    function digitRange(fromChar, toChar) {
+        out.push("[");
+        if (fromChar === toChar) {
+            out.push(fromChar);
+        } else {
+            out.push(fromChar);
+            out.push("-");
+            out.push(toChar);
+        }
+        out.push("]");
+    }
+
+    function moreDigits(minDigits, maxDigits) {
+        out.push("[0-9]");
+        if (minDigits === maxDigits && minDigits === 1) {
+            return;
+        }
+        out.push("{");
+        out.push(minDigits.toString());
+        if (maxDigits !== minDigits) {
+            out.push(",");
+            if (maxDigits !== Number.MAX_SAFE_INTEGER) {
+                out.push(maxDigits.toString());
+            }
+        }
+        out.push("}");
+    }
+
+    function uniformRange(fromStr, toStr) {
+        let i = 0;
+        while (i < fromStr.length && fromStr[i] === toStr[i]) {
+            i++;
+        }
+        if (i > 0) {
+            out.push("\"");
+            out.push(fromStr.slice(0, i));
+            out.push("\"");
+        }
+        if (i < fromStr.length) {
+            if (i > 0) {
+                out.push(" ");
+            }
+            const subLen = fromStr.length - i - 1;
+            if (subLen > 0) {
+                const fromSub = fromStr.slice(i + 1);
+                const toSub = toStr.slice(i + 1);
+                const subZeros = "0".repeat(subLen);
+                const subNines = "9".repeat(subLen);
+
+                let toReached = false;
+                out.push("(");
+                if (fromSub === subZeros) {
+                    digitRange(fromStr[i], String.fromCharCode(toStr.charCodeAt(i) - 1));
+                    out.push(" ");
+                    moreDigits(subLen, subLen);
+                } else {
+                    out.push("[");
+                    out.push(fromStr[i]);
+                    out.push("] ");
+                    out.push("(");
+                    uniformRange(fromSub, subNines);
+                    out.push(")");
+                    if (fromStr.charCodeAt(i) < toStr.charCodeAt(i) - 1) {
+                        out.push(" | ");
+                        if (toSub === subNines) {
+                            digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), toStr[i]);
+                            toReached = true;
+                        } else {
+                            digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), String.fromCharCode(toStr.charCodeAt(i) - 1));
+                        }
+                        out.push(" ");
+                        moreDigits(subLen, subLen);
+                    }
+                }
+                if (!toReached) {
+                    out.push(" | ");
+                    digitRange(toStr[i], toStr[i]);
+                    out.push(" ");
+                    uniformRange(subZeros, toSub);
+                }
+                out.push(")");
+            } else {
+                out.push("[");
+                out.push(fromStr[i]);
+                out.push("-");
+                out.push(toStr[i]);
+                out.push("]");
+            }
+        }
+    }
+
+    if (hasMin && hasMax) {
+        if (minValue < 0 && maxValue < 0) {
+            out.push("\"-\" (");
+            _generateMinMaxInt(-maxValue, -minValue, out, decimalsLeft, true);
+            out.push(")");
+            return;
+        }
+
+        if (minValue < 0) {
+            out.push("\"-\" (");
+            _generateMinMaxInt(0, -minValue, out, decimalsLeft, true);
+            out.push(") | ");
+            minValue = 0;
+        }
+
+        let minS = minValue.toString();
+        const maxS = maxValue.toString();
+        const minDigits = minS.length;
+        const maxDigits = maxS.length;
+
+        for (let digits = minDigits; digits < maxDigits; digits++) {
+            uniformRange(minS, "9".repeat(digits));
+            minS = "1" + "0".repeat(digits);
+            out.push(" | ");
+        }
+        uniformRange(minS, maxS);
+        return;
+    }
+
+    const lessDecimals = Math.max(decimalsLeft - 1, 1);
+
+    if (hasMin) {
+        if (minValue < 0) {
+            out.push("\"-\" (");
+            _generateMinMaxInt(null, -minValue, out, decimalsLeft, false);
+            out.push(") | [0] | [1-9] ");
+            moreDigits(0, decimalsLeft - 1);
+        } else if (minValue === 0) {
+            if (topLevel) {
+                out.push("[0] | [1-9] ");
+                moreDigits(0, lessDecimals);
+            } else {
+                moreDigits(1, decimalsLeft);
+            }
+        } else if (minValue <= 9) {
+            const c = minValue.toString();
+            const range_start = topLevel ? '1' : '0';
+            if (c > range_start) {
+                digitRange(range_start, String.fromCharCode(c.charCodeAt(0) - 1));
+                out.push(" ");
+                moreDigits(1, lessDecimals);
+                out.push(" | ");
+            }
+            digitRange(c, "9");
+            out.push(" ");
+            moreDigits(0, lessDecimals);
+        } else {
+            const minS = minValue.toString();
+            const length = minS.length;
+            const c = minS[0];
+
+            if (c > "1") {
+                digitRange(topLevel ? "1" : "0", String.fromCharCode(c.charCodeAt(0) - 1));
+                out.push(" ");
+                moreDigits(length, lessDecimals);
+                out.push(" | ");
+            }
+            digitRange(c, c);
+            out.push(" (");
+            _generateMinMaxInt(parseInt(minS.slice(1)), null, out, lessDecimals, false);
+            out.push(")");
+            if (c < "9") {
+                out.push(" | ");
+                digitRange(String.fromCharCode(c.charCodeAt(0) + 1), "9");
+                out.push(" ");
+                moreDigits(length - 1, lessDecimals);
+            }
+        }
+        return;
+    }
+
+    if (hasMax) {
+        if (maxValue >= 0) {
+            if (topLevel) {
+                out.push("\"-\" [1-9] ");
+                moreDigits(0, lessDecimals);
+                out.push(" | ");
+            }
+            _generateMinMaxInt(0, maxValue, out, decimalsLeft, true);
+        } else {
+            out.push("\"-\" (");
+            _generateMinMaxInt(-maxValue, null, out, decimalsLeft, false);
+            out.push(")");
+        }
+        return;
+    }
+
+    throw new Error("At least one of minValue or maxValue must be set");
+}
+
 class BuiltinRule {
     constructor(content, deps) {
         this.content = content;
@@ -435,6 +630,24 @@ export class SchemaConverter {
             const minLen = schema.minLength || 0;
             const maxLen = schema.maxLength;
             return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space');
+        } else if (schemaType === 'integer' && ('minimum' in schema || 'exclusiveMinimum' in schema || 'maximum' in schema || 'exclusiveMaximum' in schema)) {
+            let minValue = null;
+            let maxValue = null;
+            if ('minimum' in schema) {
+                minValue = schema.minimum;
+            } else if ('exclusiveMinimum' in schema) {
+                minValue = schema.exclusiveMinimum + 1;
+            }
+            if ('maximum' in schema) {
+                maxValue = schema.maximum;
+            } else if ('exclusiveMaximum' in schema) {
+                maxValue = schema.exclusiveMaximum - 1;
+            }
+
+            const out = ["("];
+            _generateMinMaxInt(minValue, maxValue, out);
+            out.push(") space");
+            return this._addRule(ruleName, out.join(''));
         } else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
             return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object']));
         } else {
@@ -3,6 +3,13 @@
 
 by Humans for All.
 
+## quickstart
+
+To run from the build dir
+
+bin/llama-server -m path/model.gguf --path ../examples/server/public_simplechat
+
+Continue reading for the details.
+
 ## overview
 
@@ -14,6 +21,8 @@ own system prompts.
 This allows seeing the generated text / ai-model response in oneshot at the end, after it is fully generated,
 or potentially as it is being generated, in a streamed manner from the server/ai-model.
 
+![simplechat screens](./simplechat_screens.webp)
+
 Auto saves the chat session locally as and when the chat is progressing and in turn at a later time when you
 open SimpleChat, option is provided to restore the old chat session, if a matching one exists.
@@ -170,17 +179,23 @@ It is attached to the document object. Some of these can also be updated using the browser's development-tools/console.
 The histogram/freq based trimming logic is currently tuned for english language wrt its
 is-it-a-alphabetic|numeral-char regex match logic.
 
-chatRequestOptions - maintains the list of options/fields to send along with chat request,
+apiRequestOptions - maintains the list of options/fields to send along with api request,
 irrespective of whether /chat/completions or /completions endpoint.
 
 If you want to add additional options/fields to send to the server/ai-model, and/or
 modify the existing options value or remove them, for now you can update this global var
 using browser's development-tools/console.
 
-For string and numeric fields in chatRequestOptions, including even those added by a user
-at runtime by directly modifying gMe.chatRequestOptions, setting ui entries will be auto
+For string, numeric and boolean fields in apiRequestOptions, including even those added by a
+user at runtime by directly modifying gMe.apiRequestOptions, setting ui entries will be auto
 created.
 
+cache_prompt option supported by example/server is allowed to be controlled by user, so that
+any caching supported wrt system-prompt and chat history, if usable can get used. When chat
+history sliding window is enabled, cache_prompt logic may or may not kick in at the backend
+wrt same, based on aspects related to model, positional encoding, attention mechanism et al.
+However system prompt should ideally get the benefit of caching.
+
 headers - maintains the list of http headers sent when request is made to the server. By default
 Content-Type is set to application/json. Additionally Authorization entry is provided, which can
 be set if needed using the settings ui.
@@ -197,10 +212,10 @@ It is attached to the document object. Some of these can also be updated using the browser's development-tools/console.
     >0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
 
 
-By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
-implications of loading of the ai-model's context window by chat history, wrt chat response to
-some extent in a simple crude way. You may also want to control the context size enabled when
-the server loads ai-model, on the server end.
+By using gMe's iRecentUserMsgCnt and apiRequestOptions.max_tokens/n_predict one can try to control
+the implications of loading of the ai-model's context window by chat history, wrt chat response to
+some extent in a simple crude way. You may also want to control the context size enabled when the
+server loads ai-model, on the server end.
 
 
 Sometimes the browser may be stubborn with caching of the file, so your updates to html/css/js
@@ -237,12 +252,12 @@ also be started with a model context size of 1k or more, to be on safe side.
 internal n_predict, for now add the same here on the client side, maybe later add max_tokens
 to /completions endpoint handling code on server side.
 
-NOTE: One may want to experiment with frequency/presence penalty fields in chatRequestOptions
-wrt the set of fields sent to server along with the user query. To check how the model behaves
+NOTE: One may want to experiment with frequency/presence penalty fields in apiRequestOptions
+wrt the set of fields sent to server along with the user query, to check how the model behaves
 wrt repetitions in general in the generated text response.
 
 An end-user can change these behaviours by editing gMe from browser's devel-tool/console or by
-using the providing settings ui.
+using the provided settings ui (for settings exposed through the ui).
@@ -253,7 +268,7 @@ for a minimal chatting experimentation by setting the below.
 * the baseUrl in settings ui
     * https://api.openai.com/v1 or similar
 
-* Wrt request body - gMe.chatRequestOptions
+* Wrt request body - gMe.apiRequestOptions
     * model (settings ui)
     * any additional fields if required in future
@@ -222,8 +222,8 @@ class SimpleChat {
      * @param {Object} obj
      */
     request_jsonstr_extend(obj) {
-        for(let k in gMe.chatRequestOptions) {
-            obj[k] = gMe.chatRequestOptions[k];
+        for(let k in gMe.apiRequestOptions) {
+            obj[k] = gMe.apiRequestOptions[k];
         }
         if (gMe.bStream) {
             obj["stream"] = true;
@@ -740,11 +740,12 @@ class Me {
             "Authorization": "", // Authorization: Bearer OPENAI_API_KEY
         }
         // Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
-        this.chatRequestOptions = {
+        this.apiRequestOptions = {
             "model": "gpt-3.5-turbo",
             "temperature": 0.7,
             "max_tokens": 1024,
             "n_predict": 1024,
+            "cache_prompt": false,
             //"frequency_penalty": 1.2,
             //"presence_penalty": 1.2,
         };
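For reference, the request body assembled from these options is plain JSON: the fields above plus `messages` and the `stream` flag. A rough Python equivalent of one such request against a local llama-server (the URL and port are assumptions):

```python
# Sketch of the JSON payload SimpleChat builds from apiRequestOptions.
import json
import urllib.request

payload = {
    "model": "gpt-3.5-turbo",
    "temperature": 0.7,
    "max_tokens": 1024,
    "n_predict": 1024,
    "cache_prompt": False,  # let the server reuse cached prompt prefixes if it can
    "stream": False,
    "messages": [{"role": "user", "content": "Hello"}],
}
req = urllib.request.Request(
    "http://localhost:8080/chat/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["choices"][0]["message"]["content"])
```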
@@ -800,51 +801,55 @@ class Me {
 
         ui.el_create_append_p(`bStream:${this.bStream}`, elDiv);
 
+        ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
+
+        ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
+
+        ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
+
         ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv);
 
         ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv);
 
-        ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
-
-        ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
-
-        ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
-
     }
 
-        ui.el_create_append_p(`chatRequestOptions:${JSON.stringify(this.chatRequestOptions, null, " - ")}`, elDiv);
+        ui.el_create_append_p(`apiRequestOptions:${JSON.stringify(this.apiRequestOptions, null, " - ")}`, elDiv);
         ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv);
 
     }
 
     /**
-     * Auto create ui input elements for fields in ChatRequestOptions
+     * Auto create ui input elements for fields in apiRequestOptions
      * Currently supports text and number field types.
      * @param {HTMLDivElement} elDiv
      */
-    show_settings_chatrequestoptions(elDiv) {
+    show_settings_apirequestoptions(elDiv) {
         let typeDict = {
             "string": "text",
             "number": "number",
         };
         let fs = document.createElement("fieldset");
         let legend = document.createElement("legend");
-        legend.innerText = "ChatRequestOptions";
+        legend.innerText = "ApiRequestOptions";
         fs.appendChild(legend);
         elDiv.appendChild(fs);
-        for(const k in this.chatRequestOptions) {
-            let val = this.chatRequestOptions[k];
+        for(const k in this.apiRequestOptions) {
+            let val = this.apiRequestOptions[k];
             let type = typeof(val);
-            if (!((type == "string") || (type == "number"))) {
-                continue;
+            if (((type == "string") || (type == "number"))) {
+                let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.apiRequestOptions[k], (val)=>{
+                    if (type == "number") {
+                        val = Number(val);
+                    }
+                    this.apiRequestOptions[k] = val;
+                });
+                fs.appendChild(inp.div);
+            } else if (type == "boolean") {
+                let bbtn = ui.el_creatediv_boolbutton(`Set{k}`, k, {true: "true", false: "false"}, val, (userVal)=>{
+                    this.apiRequestOptions[k] = userVal;
+                });
+                fs.appendChild(bbtn.div);
             }
-            let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.chatRequestOptions[k], (val)=>{
-                if (type == "number") {
-                    val = Number(val);
-                }
-                this.chatRequestOptions[k] = val;
-            });
-            fs.appendChild(inp.div);
         }
     }
 
@@ -870,6 +875,23 @@ class Me {
         });
         elDiv.appendChild(bb.div);
 
+        bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
+            this.bTrimGarbage = val;
+        });
+        elDiv.appendChild(bb.div);
+
+        this.show_settings_apirequestoptions(elDiv);
+
+        let sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
+            this.apiEP = ApiEP.Type[val];
+        });
+        elDiv.appendChild(sel.div);
+
+        sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
+            this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
+        });
+        elDiv.appendChild(sel.div);
+
         bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{
             this.bCompletionFreshChatAlways = val;
         });
@@ -880,23 +902,6 @@ class Me {
         });
         elDiv.appendChild(bb.div);
 
-        bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
-            this.bTrimGarbage = val;
-        });
-        elDiv.appendChild(bb.div);
-
-        let sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
-            this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
-        });
-        elDiv.appendChild(sel.div);
-
-        sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
-            this.apiEP = ApiEP.Type[val];
-        });
-        elDiv.appendChild(sel.div);
-
-        this.show_settings_chatrequestoptions(elDiv);
-
     }
 
 }
BIN examples/server/public_simplechat/simplechat_screens.webp: new binary file, 21 KiB (not shown)
@@ -2606,17 +2606,9 @@ int main(int argc, char ** argv) {
 
     // print sample chat example to make it clear which template is used
     {
-        json chat;
-
-        chat.push_back({{"role", "system"},    {"content", "You are a helpful assistant"}});
-        chat.push_back({{"role", "user"},      {"content", "Hello"}});
-        chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
-        chat.push_back({{"role", "user"},      {"content", "How are you?"}});
-
-        const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat);
-
         LOG_INFO("chat template", {
-            {"chat_example", chat_example},
+            {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
             {"built_in", params.chat_template.empty()},
         });
     }
 
@@ -118,36 +118,17 @@ static inline void server_log(const char * level, const char * function, int line, ...
 
 // Format given chat. If tmpl is empty, we take the template from model metadata
 inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
-    size_t alloc_size = 0;
-    // vector holding all allocated string to be passed to llama_chat_apply_template
-    std::vector<std::string> str(messages.size() * 2);
-    std::vector<llama_chat_message> chat(messages.size());
+    std::vector<llama_chat_msg> chat;
 
     for (size_t i = 0; i < messages.size(); ++i) {
         const auto & curr_msg = messages[i];
-        str[i*2 + 0]    = json_value(curr_msg, "role",    std::string(""));
-        str[i*2 + 1]    = json_value(curr_msg, "content", std::string(""));
-        alloc_size     += str[i*2 + 1].length();
-        chat[i].role    = str[i*2 + 0].c_str();
-        chat[i].content = str[i*2 + 1].c_str();
+        std::string role    = json_value(curr_msg, "role",    std::string(""));
+        std::string content = json_value(curr_msg, "content", std::string(""));
+        chat.push_back({role, content});
     }
 
-    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
-    std::vector<char> buf(alloc_size * 2);
-
-    // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
-
-    // if it turns out that our buffer is too small, we resize it
-    if ((size_t) res > buf.size()) {
-        buf.resize(res);
-        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
-    }
-
-    const std::string formatted_chat(buf.data(), res);
+    auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
 
     LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
 
     return formatted_chat;
 }
 
@@ -1924,16 +1924,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     } else if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // FP32 precision KQV single-batch for batch size 1 without FlashAttention
         ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
+    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
+            && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        // KQ + KQV multi-batch without FlashAttention
+        ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
     } else if (use_mul_mat_vec_q) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
     } else if (use_mul_mat_q) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
-    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
-            && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
-        // KQ + KQV multi-batch without FlashAttention
-        ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
     }
 
@@ -4620,7 +4620,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
-    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+    } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // KQ + KQV multi-batch
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
@@ -69,6 +69,7 @@ class GGUFReader:
     # I - same as host, S - swapped
     byte_order: Literal['I'] | Literal['S'] = 'I'
     alignment: int = GGUF_DEFAULT_ALIGNMENT
+    data_offset: int
 
     # Note: Internal helper, API may change.
     gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = {
@@ -88,9 +89,13 @@ class GGUFReader:
     def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'):
         self.data = np.memmap(path, mode = mode)
         offs = 0
+
+        # Check for GGUF magic
         if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
             raise ValueError('GGUF magic invalid')
         offs += 4
+
+        # Check GGUF version
         temp_version = self._get(offs, np.uint32)
         if temp_version[0] & 65535 == 0:
             # If we get 0 here that means it's (probably) a GGUF file created for
@@ -103,12 +108,16 @@ class GGUFReader:
         self.fields: OrderedDict[str, ReaderField] = OrderedDict()
         self.tensors: list[ReaderTensor] = []
         offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32]))
+
+        # Check tensor count and kv count
         temp_counts = self._get(offs, np.uint64, 2)
         offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64]))
         offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64]))
         tensor_count, kv_count = temp_counts
         offs = self._build_fields(offs, kv_count)
-        offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)
+
+        # Build Tensor Info Fields
+        offs, tensors_fields = self._build_tensor_info(offs, tensor_count)
         new_align = self.fields.get('general.alignment')
         if new_align is not None:
             if new_align.types != [GGUFValueType.UINT32]:
@@ -117,6 +126,7 @@ class GGUFReader:
         padding = offs % self.alignment
         if padding != 0:
             offs += self.alignment - padding
+        self.data_offset = offs
         self._build_tensors(offs, tensors_fields)
 
     _DT = TypeVar('_DT', bound = npt.DTypeLike)
@@ -193,18 +203,29 @@ class GGUFReader:
             # We can't deal with this one.
             raise ValueError('Unknown/unhandled field type {gtype}')
 
-    def _get_tensor(self, orig_offs: int) -> ReaderField:
+    def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
         offs = orig_offs
+
+        # Get Tensor Name
         name_len, name_data = self._get_str(offs)
         offs += int(name_len.nbytes + name_data.nbytes)
+
+        # Get Tensor Dimensions Count
         n_dims = self._get(offs, np.uint32)
         offs += int(n_dims.nbytes)
+
+        # Get Tensor Dimension Array
         dims = self._get(offs, np.uint64, n_dims[0])
         offs += int(dims.nbytes)
+
+        # Get Tensor Encoding Scheme Type
         raw_dtype = self._get(offs, np.uint32)
         offs += int(raw_dtype.nbytes)
+
+        # Get Tensor Offset
         offset_tensor = self._get(offs, np.uint64)
         offs += int(offset_tensor.nbytes)
+
         return ReaderField(
             orig_offs,
             str(bytes(name_data), encoding = 'utf-8'),
@@ -233,10 +254,10 @@ class GGUFReader:
             offs += field_size
         return offs
 
-    def _build_tensors_fields(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
+    def _build_tensor_info(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
         tensor_fields = []
         for _ in range(count):
-            field = self._get_tensor(offs)
+            field = self._get_tensor_info_field(offs)
             offs += sum(int(part.nbytes) for part in field.parts)
             tensor_fields.append(field)
         return offs, tensor_fields
@@ -319,6 +319,27 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
 
     markdown_content += "\n"
 
+    markdown_content += "### Tensor Data Offset\n"
+    markdown_content += '\n'
+    markdown_content += 'This table contains the offset and data segment relative to start of file\n'
+    markdown_content += '\n'
+
+    tensor_mapping_table: list[dict[str, str | int]] = []
+    for key, tensor in enumerate(reader.tensors):
+        data_offset_pretty = '{0:#16x}'.format(tensor.data_offset)
+        data_size_pretty = '{0:#16x}'.format(tensor.n_bytes)
+        tensor_mapping_table.append({"t_id":key, "layer_name":tensor.name, "data_offset":data_offset_pretty, "data_size":data_size_pretty})
+
+    tensors_mapping_table_header_map = [
+        {'key_name':'t_id',        'header_name':'T_ID',              'align':'right'},
+        {'key_name':'layer_name',  'header_name':'Tensor Layer Name', 'align':'left'},
+        {'key_name':'data_offset', 'header_name':'Data Offset (B)',   'align':'right'},
+        {'key_name':'data_size',   'header_name':'Data Size (B)',     'align':'right'},
+    ]
+
+    markdown_content += markdown_table_with_alignment_support(tensors_mapping_table_header_map, tensor_mapping_table)
+    markdown_content += "\n"
+
     for group in tensor_prefix_order:
         tensors = tensor_groups[group]
         group_elements = sum(tensor.n_elements for tensor in tensors)
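The per-tensor offsets feeding this table can also be read programmatically. A short sketch using the `GGUFReader` attributes touched by this change (the model path is a placeholder):

```python
# Sketch: inspect data offsets via gguf-py, using the new data_offset attribute
# and the per-tensor fields shown in the markdown table above.
from gguf import GGUFReader

reader = GGUFReader("model.gguf", "r")  # placeholder path
print(f"alignment:   {reader.alignment}")
print(f"data offset: {reader.data_offset:#x}")
for t_id, tensor in enumerate(reader.tensors):
    print(f"{t_id:>4} {tensor.name:<40} offset={tensor.data_offset:#x} size={tensor.n_bytes:#x}")
```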
@@ -370,6 +391,8 @@ def main() -> None:
     parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata")
     parser.add_argument("--json",       action="store_true", help="Produce JSON output")
     parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)")
+    parser.add_argument("--data-offset",    action="store_true", help="Start of data offset")
+    parser.add_argument("--data-alignment", action="store_true", help="Data alignment applied globally to data field")
     parser.add_argument("--markdown",   action="store_true", help="Produce markdown output")
     parser.add_argument("--verbose",    action="store_true", help="increase output verbosity")
 
@@ -377,7 +400,7 @@ def main() -> None:
 
     logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
 
-    if not args.json and not args.markdown:
+    if not args.json and not args.markdown and not args.data_offset and not args.data_alignment:
         logger.info(f'* Loading: {args.model}')
 
     reader = GGUFReader(args.model, 'r')
@@ -386,6 +409,10 @@ def main() -> None:
         dump_metadata_json(reader, args)
     elif args.markdown:
         dump_markdown_metadata(reader, args)
+    elif args.data_offset:
+        print(reader.data_offset)  # noqa: NP100
+    elif args.data_alignment:
+        print(reader.alignment)  # noqa: NP100
     else:
         dump_metadata(reader, args)

llama.cpp (626 changed lines)
@@ -226,6 +226,7 @@ enum llm_arch {
     LLM_ARCH_ARCTIC,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_BITNET,
+    LLM_ARCH_T5,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -265,6 +266,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ARCTIC,    "arctic"    },
     { LLM_ARCH_DEEPSEEK2, "deepseek2" },
     { LLM_ARCH_BITNET,    "bitnet"    },
+    { LLM_ARCH_T5,        "t5"        },
     { LLM_ARCH_UNKNOWN,   "(unknown)" },
 };
 
@@ -297,6 +299,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
+    LLM_KV_DECODER_START_TOKEN_ID,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -309,6 +312,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_CAUSAL,
     LLM_KV_ATTENTION_Q_LORA_RANK,
     LLM_KV_ATTENTION_KV_LORA_RANK,
+    LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
|
||||||
LLM_KV_TOKENIZER_ADD_BOS,
|
LLM_KV_TOKENIZER_ADD_BOS,
|
||||||
LLM_KV_TOKENIZER_ADD_EOS,
|
LLM_KV_TOKENIZER_ADD_EOS,
|
||||||
LLM_KV_TOKENIZER_ADD_PREFIX,
|
LLM_KV_TOKENIZER_ADD_PREFIX,
|
||||||
|
LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
|
||||||
|
LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
|
||||||
LLM_KV_TOKENIZER_HF_JSON,
|
LLM_KV_TOKENIZER_HF_JSON,
|
||||||
LLM_KV_TOKENIZER_RWKV,
|
LLM_KV_TOKENIZER_RWKV,
|
||||||
LLM_KV_TOKENIZER_PREFIX_ID,
|
LLM_KV_TOKENIZER_PREFIX_ID,
|
||||||
|
@@ -383,18 +389,20 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_SCALE,             "%s.expert_weights_scale"    },
     { LLM_KV_POOLING_TYPE,                     "%s.pooling_type"            },
     { LLM_KV_LOGIT_SCALE,                      "%s.logit_scale"             },
+    { LLM_KV_DECODER_START_TOKEN_ID,           "%s.decoder_start_token_id"  },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"             },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"          },
     { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,         "%s.attention.max_alibi_bias"         },
     { LLM_KV_ATTENTION_CLAMP_KQV,              "%s.attention.clamp_kqv"              },
     { LLM_KV_ATTENTION_KEY_LENGTH,             "%s.attention.key_length"             },
     { LLM_KV_ATTENTION_VALUE_LENGTH,           "%s.attention.value_length"           },
     { LLM_KV_ATTENTION_LAYERNORM_EPS,          "%s.attention.layer_norm_epsilon"     },
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      "%s.attention.layer_norm_rms_epsilon" },
     { LLM_KV_ATTENTION_CAUSAL,                 "%s.attention.causal"                 },
     { LLM_KV_ATTENTION_Q_LORA_RANK,            "%s.attention.q_lora_rank"            },
     { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"           },
+    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT,             "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE,                   "%s.rope.freq_base"       },
|
||||||
{ LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
|
{ LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
|
||||||
{ LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
|
{ LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
|
||||||
|
|
||||||
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
|
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
|
||||||
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
|
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
|
||||||
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
|
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
|
||||||
{ LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
|
{ LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
|
||||||
{ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
|
{ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
|
||||||
{ LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
|
{ LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
|
||||||
{ LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
|
{ LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
|
||||||
{ LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
|
{ LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
|
{ LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
|
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
|
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
|
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
|
{ LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
|
{ LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
|
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
|
||||||
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
|
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
|
||||||
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
|
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
|
||||||
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
{ LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
|
||||||
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
|
||||||
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
|
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
||||||
{ LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
|
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
||||||
{ LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
|
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
|
{ LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
|
||||||
|
{ LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
|
||||||
|
{ LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
|
||||||
};
|
};
|
||||||
|
|
||||||
struct LLM_KV {
|
struct LLM_KV {
|
||||||
|
@@ -504,6 +514,34 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_KV_A_NORM,
     LLM_TENSOR_ATTN_SUB_NORM,
     LLM_TENSOR_FFN_SUB_NORM,
+    LLM_TENSOR_DEC_ATTN_NORM,
+    LLM_TENSOR_DEC_ATTN_Q,
+    LLM_TENSOR_DEC_ATTN_K,
+    LLM_TENSOR_DEC_ATTN_V,
+    LLM_TENSOR_DEC_ATTN_OUT,
+    LLM_TENSOR_DEC_ATTN_REL_B,
+    LLM_TENSOR_DEC_CROSS_ATTN_NORM,
+    LLM_TENSOR_DEC_CROSS_ATTN_Q,
+    LLM_TENSOR_DEC_CROSS_ATTN_K,
+    LLM_TENSOR_DEC_CROSS_ATTN_V,
+    LLM_TENSOR_DEC_CROSS_ATTN_OUT,
+    LLM_TENSOR_DEC_CROSS_ATTN_REL_B,
+    LLM_TENSOR_DEC_FFN_NORM,
+    LLM_TENSOR_DEC_FFN_GATE,
+    LLM_TENSOR_DEC_FFN_DOWN,
+    LLM_TENSOR_DEC_FFN_UP,
+    LLM_TENSOR_DEC_OUTPUT_NORM,
+    LLM_TENSOR_ENC_ATTN_NORM,
+    LLM_TENSOR_ENC_ATTN_Q,
+    LLM_TENSOR_ENC_ATTN_K,
+    LLM_TENSOR_ENC_ATTN_V,
+    LLM_TENSOR_ENC_ATTN_OUT,
+    LLM_TENSOR_ENC_ATTN_REL_B,
+    LLM_TENSOR_ENC_FFN_NORM,
+    LLM_TENSOR_ENC_FFN_GATE,
+    LLM_TENSOR_ENC_FFN_DOWN,
+    LLM_TENSOR_ENC_FFN_UP,
+    LLM_TENSOR_ENC_OUTPUT_NORM,
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -1135,6 +1173,41 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_SUB_NORM, "blk.%d.ffn_sub_norm" },
         },
     },
+    {
+        LLM_ARCH_T5,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,           "token_embd" },
+            { LLM_TENSOR_OUTPUT,               "output" },
+            { LLM_TENSOR_DEC_OUTPUT_NORM,      "dec.output_norm" },
+            { LLM_TENSOR_DEC_ATTN_NORM,        "dec.blk.%d.attn_norm" },
+            { LLM_TENSOR_DEC_ATTN_Q,           "dec.blk.%d.attn_q" },
+            { LLM_TENSOR_DEC_ATTN_K,           "dec.blk.%d.attn_k" },
+            { LLM_TENSOR_DEC_ATTN_V,           "dec.blk.%d.attn_v" },
+            { LLM_TENSOR_DEC_ATTN_OUT,         "dec.blk.%d.attn_o" },
+            { LLM_TENSOR_DEC_ATTN_REL_B,       "dec.blk.%d.attn_rel_b" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_NORM,  "dec.blk.%d.cross_attn_norm" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_Q,     "dec.blk.%d.cross_attn_q" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_K,     "dec.blk.%d.cross_attn_k" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_V,     "dec.blk.%d.cross_attn_v" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_OUT,   "dec.blk.%d.cross_attn_o" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "dec.blk.%d.cross_attn_rel_b" },
+            { LLM_TENSOR_DEC_FFN_NORM,         "dec.blk.%d.ffn_norm" },
+            { LLM_TENSOR_DEC_FFN_GATE,         "dec.blk.%d.ffn_gate" },
+            { LLM_TENSOR_DEC_FFN_DOWN,         "dec.blk.%d.ffn_down" },
+            { LLM_TENSOR_DEC_FFN_UP,           "dec.blk.%d.ffn_up" },
+            { LLM_TENSOR_ENC_OUTPUT_NORM,      "enc.output_norm" },
+            { LLM_TENSOR_ENC_ATTN_NORM,        "enc.blk.%d.attn_norm" },
+            { LLM_TENSOR_ENC_ATTN_Q,           "enc.blk.%d.attn_q" },
+            { LLM_TENSOR_ENC_ATTN_K,           "enc.blk.%d.attn_k" },
+            { LLM_TENSOR_ENC_ATTN_V,           "enc.blk.%d.attn_v" },
+            { LLM_TENSOR_ENC_ATTN_OUT,         "enc.blk.%d.attn_o" },
+            { LLM_TENSOR_ENC_ATTN_REL_B,       "enc.blk.%d.attn_rel_b" },
+            { LLM_TENSOR_ENC_FFN_NORM,         "enc.blk.%d.ffn_norm" },
+            { LLM_TENSOR_ENC_FFN_GATE,         "enc.blk.%d.ffn_gate" },
+            { LLM_TENSOR_ENC_FFN_DOWN,         "enc.blk.%d.ffn_down" },
+            { LLM_TENSOR_ENC_FFN_UP,           "enc.blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@ -2356,6 +2429,11 @@ struct llama_vocab {
    bool tokenizer_add_bos                    = false;
    bool tokenizer_add_eos                    = false;
    bool tokenizer_ignore_merges              = false;
    bool tokenizer_remove_extra_whitespaces   = false;
    bool tokenizer_escape_whitespaces         = true;
    bool tokenizer_treat_whitespace_as_suffix = false;

    std::vector<char> precompiled_charsmap;

    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
        GGML_ASSERT(token_left.find(' ') == std::string::npos);
@ -4191,6 +4269,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
        case LLAMA_VOCAB_TYPE_SPM: return "SPM";
        case LLAMA_VOCAB_TYPE_BPE: return "BPE";
        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
        case LLAMA_VOCAB_TYPE_UGM: return "UGM";
        default:                   return "unknown";
    }
}
@ -4870,6 +4949,45 @@ static void llm_load_vocab(
            vocab.special_pad_id  = -1;
            vocab.special_cls_id  = -1;
            vocab.special_mask_id = -1;
        } else if (tokenizer_model == "t5") {
            vocab.type = LLAMA_VOCAB_TYPE_UGM;

            // default special tokens
            vocab.special_bos_id  = -1;
            vocab.special_eos_id  = 1;
            vocab.special_unk_id  = 2;
            vocab.special_sep_id  = -1;
            vocab.special_pad_id  = 0;
            vocab.special_cls_id  = -1;
            vocab.special_mask_id = -1;

            const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
            if (add_space_prefix_keyidx != -1) {
                vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
            } // The default value of add_space_prefix is true.

            const int remove_extra_whitespaces_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS).c_str());
            if (remove_extra_whitespaces_keyidx != -1) {
                vocab.tokenizer_remove_extra_whitespaces = gguf_get_val_bool(ctx, remove_extra_whitespaces_keyidx);
            } // The default value of remove_extra_whitespaces is false.

            const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
            if (precompiled_charsmap_keyidx != -1) {
                size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
                const char * precompiled_charsmap = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
                vocab.precompiled_charsmap.assign(precompiled_charsmap, precompiled_charsmap + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN
                // correct endianness of data in precompiled_charsmap binary blob
                uint32_t * xcda_blob_size = (uint32_t *) &vocab.precompiled_charsmap[0];
                *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
                assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
                size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
                uint32_t * xcda_array = (uint32_t *) &vocab.precompiled_charsmap[sizeof(uint32_t)];
                for (size_t i = 0; i < xcda_array_size; ++i) {
                    xcda_array[i] = __builtin_bswap32(xcda_array[i]);
                }
#endif
            }
        } else {
            throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
        }
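For orientation, the blob parsed above has a simple three-part layout; a hedged sketch of it as consumed by this code (field names are descriptive, not GGUF keys):

// layout of tokenizer.ggml.precompiled_charsmap as read by the branch above:
// [ uint32 xcda_blob_size ][ xcda_blob_size bytes of packed uint32 XCDA entries ]
// [ NUL-terminated replacement strings, indexed by XCDA leaf values ... ]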
@ -4952,6 +5070,10 @@ static void llm_load_vocab(
        vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        vocab.tokenizer_add_bos = true;
        vocab.tokenizer_add_eos = false;
    } else if (vocab.type == LLAMA_VOCAB_TYPE_UGM) {
        vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        vocab.tokenizer_add_bos = false;
        vocab.tokenizer_add_eos = true;
    } else {
        vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
    }
@ -13213,12 +13335,18 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
}

static bool llama_is_unused_token(const llama_vocab& vocab, llama_token id) {
    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
}

static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
    GGML_ASSERT(llama_is_byte_token(vocab, id));
    const auto & token_data = vocab.id_to_token.at(id);
    switch (llama_vocab_get_type(vocab)) {
        case LLAMA_VOCAB_TYPE_SPM:
        case LLAMA_VOCAB_TYPE_UGM: {
            auto buf = token_data.text.substr(3, 2);
            return strtol(buf.c_str(), NULL, 16);
        }
@ -13238,7 +13366,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
    static const char * hex = "0123456789ABCDEF";
    switch (llama_vocab_get_type(vocab)) {
        case LLAMA_VOCAB_TYPE_SPM:
        case LLAMA_VOCAB_TYPE_UGM: {
            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
            auto token = vocab.token_to_id.find(buf);
            if (token != vocab.token_to_id.end()) {
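Both cases rely on the SentencePiece byte-fallback token text format "<0xXX>"; a small standalone sketch of the round-trip implemented by these two helpers (illustrative byte value):

#include <cstdlib>
#include <cstdio>

int main() {
    // encode byte 0x41 into its token text, as llama_byte_to_token does
    const char * hex = "0123456789ABCDEF";
    unsigned char ch = 0x41;
    const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };  // "<0x41>"
    // decode it back, as llama_token_to_byte does: two hex digits start at offset 3
    unsigned char back = (unsigned char) strtol(buf + 3, NULL, 16);
    printf("%s -> 0x%02X\n", buf, back);  // prints "<0x41> -> 0x41"
    return 0;
}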
@ -13826,6 +13955,383 @@ struct llm_tokenizer_wpm {
    const llama_vocab & vocab;
};

struct naive_trie {
    naive_trie() : has_value(false), value(0) {
    }
    void insert(const char * key, size_t len, int32_t value = 0) {
        if (len == 0) {
            this->has_value = true;
            this->value = value;
            return;
        }
        char c = key[0];
        auto res = children.find(c);
        if (res != children.end()) {
            res->second.insert(key + 1, len - 1, value);
        } else {
            auto res = children.insert(std::make_pair(c, naive_trie()));
            res.first->second.insert(key + 1, len - 1, value);
        }
    }
    std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) {
        if (len == 0 || offset == len) {
            return std::make_pair(key, offset);
        }
        char c = key[offset];
        auto res = children.find(c);
        if (res != children.end()) {
            return res->second.get_longest_prefix(key, len, offset + 1);
        } else {
            return std::make_pair(key, offset);
        }
    }
    struct naive_trie * traverse(const char c) {
        auto res = children.find(c);
        if (res != children.end()) {
            return &res->second;
        } else {
            return NULL;
        }
    }
    std::map<char, struct naive_trie> children;
    bool has_value;
    llama_token value;
};
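A small standalone sketch of how this trie behaves (hypothetical token ids; illustration, not part of the change):

// build a trie over two overlapping token strings and walk it byte by byte
naive_trie t;
t.insert("ab",  2, 10);   // token id 10
t.insert("abc", 3, 11);   // token id 11

struct naive_trie * node = t.traverse('a');   // inner node, no value yet
node = node ? node->traverse('b') : NULL;     // node->has_value == true, node->value == 10
node = node ? node->traverse('c') : NULL;     // node->has_value == true, node->value == 11

// get_longest_prefix stops as soon as no child matches the next byte
auto match = t.get_longest_prefix("abx", 3);  // match.second == 2 (matched "ab")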
struct llm_tokenizer_ugm {
    llm_tokenizer_ugm(const llama_vocab & vocab) : vocab(vocab) {
        if (vocab.precompiled_charsmap.size() > 0) {
            size_t charsmap_offset = 0;

            // First four bytes of precompiled_charsmap contain the length of the binary
            // blob containing XOR-compressed compact double array (XCDA) entries
            uint32_t xcda_blob_size = *(const uint32_t *) &vocab.precompiled_charsmap[0];
            charsmap_offset += sizeof(xcda_blob_size);
            if (xcda_blob_size + charsmap_offset >= vocab.precompiled_charsmap.size()) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
            }

            // Next xcda_blob_size bytes contain entries of XOR-compressed compact
            // double array (XCDA). Each entry is bit-packed into a 32-bit integer.
            xcda_array = (const uint32_t *) &vocab.precompiled_charsmap[charsmap_offset];
            xcda_array_size = xcda_blob_size / sizeof(uint32_t);
            charsmap_offset += xcda_blob_size;

            // Remaining bytes of precompiled charsmap contain null-terminated
            // replacement strings for prefixes matched by the XCDA.
            prefix_replacements = &vocab.precompiled_charsmap[charsmap_offset];
            prefix_replacements_size = vocab.precompiled_charsmap.size() - charsmap_offset;
        }

        for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
            const auto &token_data = vocab.id_to_token[id];

            if (llama_is_normal_token(vocab, id)) {
                min_score = std::min<float>(min_score, token_data.score);
                max_score = std::max<float>(max_score, token_data.score);
            }

            if (llama_is_normal_token(vocab, id) ||
                llama_is_user_defined_token(vocab, id) ||
                llama_is_unused_token(vocab, id)) {
                token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
            }

            if (llama_is_user_defined_token(vocab, id)) {
                user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
            }
        }

        unknown_token_score = min_score - unknown_token_score_penalty;
    }
    /* This implementation is based on SentencePiece optimized Viterbi algorithm for
     * unigram language models. The general idea is to:
     * - move along the input sequence in steps of one UTF code point,
     * - at each step find all possible tokenizations of the prefix by
     *   traversing the tokens trie,
     * - for each tokenization store the best one so far (by higher score)
     * - use the position in sequence after given token as an index to store
     *   results
     * - if there was no valid tokenization of the current UTF code point
     *   then use unknown token with additional score penalty
     * After processing the whole sequence we backtrack from the end to get
     * the best tokenization.
     */
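    /* Hypothetical mini-trace (illustration only, not from this change):
     * with tokens "a" (score -1), "ab" (score -3), "c" (score -1) and input "abc":
     * - offset 0: the trie yields "a" (best sum -1 at offset 1) and "ab" (best sum -3 at offset 2)
     * - offset 1: no token starts with "b", so the unknown-token fallback scores far below -3
     *   and offset 2 keeps the sum -3 coming from "ab"
     * - offset 2: "c" extends -3 to -4 at offset 3
     * Backtracking from offset 3 yields [ "ab", "c" ].
     */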
    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
        // normalize the input first
        std::string normalized;
        normalize(text, &normalized);
        size_t input_len = normalized.size();

        // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {0, 0, -FLT_MAX});
        // at the beginning tokenization score is zero
        tokenization_results[0] = { 0, 0, 0 };

        for (size_t input_offset = 0; input_offset < input_len;) {
            size_t prefix_offset = input_offset;
            // calculate how many code units are in the currently processed UTF code point
            size_t n_utf8_code_units = std::min<size_t>(utf8_len(normalized[input_offset]), input_len - input_offset);

            // traverse the token matcher trie to find a matching token
            bool single_codepoint_token_found = false;
            const struct best_tokenization & current_best = tokenization_results[input_offset];
            struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]);

            while (prefix_offset <= input_len && node != NULL) {
                // check if we found valid token in prefix
                if (node->has_value) {
                    // check if it corresponds to the whole UTF code point
                    if (prefix_offset - input_offset == n_utf8_code_units) {
                        single_codepoint_token_found = true;
                    }
                    llama_token token_id = node->value;
                    const auto &token_data = vocab.id_to_token[token_id];

                    // we set the user-defined token scores to 0 to make them more likely to be selected
                    // (normal token scores are log probabilities, so they are negative)
                    // score type is double here to make tokenization results exactly
                    // the same as in the HF tokenizer using SentencePiece
                    const double token_score = llama_is_user_defined_token(vocab, token_id) ? 0.0 : token_data.score;
                    const double challenger_score = current_best.score_sum + token_score;
                    struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                    if (challenger_score > current_champ.score_sum) {
                        struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
                        current_champ = challenger;
                    }
                }
                node = node->traverse(normalized[prefix_offset++]);
            }

            // if we didn't find a valid token corresponding to the whole UTF code point
            // then use unknown token as the tokenization of this UTF code point
            if (!single_codepoint_token_found) {
                const double challenger_score = current_best.score_sum + unknown_token_score;
                prefix_offset = input_offset + n_utf8_code_units;
                struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                if (challenger_score > current_champ.score_sum) {
                    struct best_tokenization challenger = { vocab.special_unk_id, input_offset, (float) challenger_score };
                    current_champ = challenger;
                }
            }

            // move to the next UTF code point
            input_offset += n_utf8_code_units;
        }

        // now backtrack from the end to gather token ids of the best tokenization
        // merge sequences of consecutive unknown tokens into single unknown tokens
        bool is_prev_unknown = false;
        for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
            bool is_unknown = tokenization.token_id == vocab.special_unk_id;
            if (!(is_prev_unknown && is_unknown)) {
                output.push_back(tokenization.token_id);
            }
            if (tokenization.input_offset == 0) {
                break;
            }
            is_prev_unknown = is_unknown;
        }

        // reverse the output since we added tokens starting from the end of the input
        std::reverse(output.begin(), output.end());
    }
private:
    const llama_vocab & vocab;

    // helper structure for returning normalization results
    struct normalization_result {
        const char * normalized;
        size_t normalized_len;
        size_t consumed_input;
    };

    void normalize(const std::string& input, std::string * normalized) {
        normalized->clear();
        normalized->reserve(input.size() * 3);

        const std::string space = vocab.tokenizer_escape_whitespaces ? escaped_space : " ";

        bool shall_prepend_space = !vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
        bool shall_append_space = vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
        bool shall_merge_spaces = vocab.tokenizer_remove_extra_whitespaces;

        bool is_space_prepended = false;
        bool processing_non_ws = false;

        size_t input_len = input.size();

        for (size_t input_offset = 0; input_offset < input_len; ) {
            auto norm_res = normalize_prefix(input, input_offset);
            for (size_t i = 0; i < norm_res.normalized_len; i++) {
                char c = norm_res.normalized[i];
                if (c != ' ') {
                    if (!processing_non_ws) {
                        processing_non_ws = true;
                        if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) {
                            normalized->append(space);
                            is_space_prepended = true;
                        }
                    }
                    normalized->push_back(c);
                } else {
                    if (processing_non_ws) {
                        processing_non_ws = false;
                    }
                    if (!shall_merge_spaces) {
                        normalized->append(space);
                    }
                }
            }

            input_offset += norm_res.consumed_input;
        }

        if (shall_append_space) {
            normalized->append(space);
        }
    }
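    // Illustration (hypothetical input, not from this change): with whitespace escaping,
    // space prefixing and remove_extra_whitespaces all enabled, "  Hello   world "
    // normalizes to "▁Hello▁world": every run of spaces collapses to a single
    // escaped space (U+2581), including the prepended one.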
    /*
     * This structure is a view wrapper for XOR-compressed double array (XCDA)
     * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
     * Each bit-packed entry contains:
     * - BASE array value in bits 10-30
     * - LCHECK array value in bits 0-7
     * - LEAF array value in bit 9
     * Entries containing indexes of replacement sequences have set bit 31
     */
    struct xcda_array_view {
    public:
        xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) {
        }
        uint32_t get_base(size_t index) {
            uint32_t packed_node = get_node(index);
            return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6);
        }
        uint32_t get_lcheck(size_t index) {
            uint32_t packed_node = get_node(index);
            return packed_node & ((1U << 31) | 0xff);
        }
        bool get_leaf(size_t index) {
            uint32_t packed_node = get_node(index);
            return (packed_node >> 8) & 1;
        }
        uint32_t get_value(size_t index) {
            uint32_t packed_node = get_node(index);
            return packed_node & ((1U << 31) - 1);
        }
    private:
        uint32_t get_node(size_t index) {
            if (index > xcda_array_size) {
                throw std::runtime_error("Index out of array bounds in XCDA array!");
            }
            return xcda_array[index];
        }
        const uint32_t * xcda_array;
        size_t xcda_array_size;
    };
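    // Worked decode of one hypothetical packed entry (illustration only):
    // for packed_node = 0x00000441,
    //   LCHECK = packed_node & 0xff     = 0x41 ('A')
    //   LEAF   = (packed_node >> 8) & 1 = 0
    //   BASE   = packed_node >> 10      = 1  (bit 9 clear, so no extra left shift)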
    struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
        if (input_offset == input.size()) {
            return { &input[input_offset], 0, 0 };
        }

        // if input prefix matches some user-defined token return this token as normalization result
        auto user_defined_token_match = user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
        if (user_defined_token_match.second > 0) {
            return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
        }

        size_t longest_prefix_length = 0;
        size_t longest_prefix_offset = 0;

        if (xcda_array_size > 0) {
            struct xcda_array_view xcda_view(xcda_array, xcda_array_size);

            // Find the longest normalized sequence matching the input prefix by walking
            // the XOR-compressed compact double array (XCDA) starting from the root node
            // We find the index of the next node by calculating BASE[s] ^ c where s is
            // the index of the previous node and c is a numerical character value
            uint32_t node_index = 0;
            // get BASE of the root node
            node_index = xcda_view.get_base(node_index);
            for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) {
                unsigned char c = input[prefix_offset];
                if (c == 0) {
                    break;
                }
                node_index ^= c;
                // if value of LCHECK is not c it means that this is not a child of
                // the previous node, so we stop matching
                if (xcda_view.get_lcheck(node_index) != c) {
                    break;
                }
                bool is_leaf = xcda_view.get_leaf(node_index);
                // get BASE of the current node
                node_index ^= xcda_view.get_base(node_index);
                // if LEAF of the current node is true, it means that its BASE points to the node
                // containing index of replacement sequence for currently matched input prefix
                if (is_leaf)
                {
                    longest_prefix_length = prefix_offset - input_offset + 1;
                    // get index of replacement sequence for currently matched input prefix
                    longest_prefix_offset = xcda_view.get_value(node_index);
                }
            }
        }

        if (longest_prefix_length > 0) {
            // we have a match, so return the replacement sequence
            if (longest_prefix_offset >= prefix_replacements_size) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
            }
            const char * prefix_replacement = &prefix_replacements[longest_prefix_offset];
            return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
        } else {
            // check if the input prefix contains a valid sequence of UTF-8 code units
            try {
                // if yes, return this sequence unmodified
                size_t prefix_offset = input_offset;
                unicode_cpt_from_utf8(input, prefix_offset);
                return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
            } catch(std::invalid_argument & ex) {
                // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
                return { "\xEF\xBF\xBD", 3, 1 };
            }
        }
    }
    // escaped space symbol - U+2581 (Lower One Eighth Block)
    const std::string escaped_space = "\xE2\x96\x81";

    const char * prefix_replacements = NULL;
    size_t prefix_replacements_size = 0;

    const uint32_t * xcda_array = NULL;
    size_t xcda_array_size = 0;

    struct naive_trie user_defined_token_matcher;

    // this structure stores the best tokenization so far at input_offset
    struct best_tokenization {
        llama_token token_id;
        size_t input_offset;
        float score_sum;
    };

    float min_score = FLT_MAX;
    float max_score = -FLT_MAX;

    float unknown_token_score_penalty = 10.0;
    float unknown_token_score;

    struct naive_trie token_matcher;
};
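Taken together, the pieces above form the whole UGM path; a minimal usage sketch (assumes a loaded llama_vocab with type LLAMA_VOCAB_TYPE_UGM; illustration, not part of the change):

// tokenize one normalized fragment with the UGM tokenizer
std::vector<llama_vocab::id> ids;
llm_tokenizer_ugm tokenizer(vocab);       // builds the trie and XCDA views once
tokenizer.tokenize("Hello world", ids);   // ids holds the best-scoring Viterbi segmentation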
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
@ -14086,6 +14592,39 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                    output.push_back(vocab.special_sep_id);
                }
            } break;
        case LLAMA_VOCAB_TYPE_UGM:
            {
                llm_tokenizer_ugm tokenizer(vocab);

                if (add_special && vocab.tokenizer_add_bos != 0) {
                    GGML_ASSERT(vocab.special_bos_id != -1);
                    output.push_back(vocab.special_bos_id);
                }

                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
#endif
                        tokenizer.tokenize(raw_text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                    }
                }

                if (add_special && vocab.tokenizer_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
                    LLAMA_LOG_WARN(
                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                        "Are you sure this is what you want?\n", __FUNCTION__);
                }

                if (add_special && vocab.tokenizer_add_eos == 1) {
                    GGML_ASSERT(vocab.special_eos_id != -1);
                    output.push_back(vocab.special_eos_id);
                }
            } break;
        case LLAMA_VOCAB_TYPE_NONE:
            GGML_ASSERT(false);
    }
@ -14500,7 +15039,8 @@ struct llama_grammar * llama_grammar_init(
            continue;
        }
        if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
            LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
            return nullptr;
        }
    }
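Note the behavior change: initialization failures are now reported with a null return instead of an exception, so callers must check the result; a minimal sketch using the identifiers from the tests later in this commit:

// the error path is now a null return instead of a thrown runtime_error
struct llama_grammar * grammar = llama_grammar_init(
    grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
if (grammar == nullptr) {
    // handle the failed initialization (e.g. reject the request)
}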
@ -16963,6 +17503,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
        case LLM_ARCH_BLOOM:
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_T5:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
@ -18658,6 +19199,10 @@ llama_token llama_token_eot(const struct llama_model * model) {
    return model->vocab.special_eot_id;
}

llama_token llama_token_pad(const struct llama_model * model) {
    return model->vocab.special_pad_id;
}

int32_t llama_tokenize(
        const struct llama_model * model,
                      const char * text,
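A one-line usage sketch for the new accessor (assumes a loaded model; not part of the change):

// query the model's padding token, e.g. when padding batches
llama_token pad_id = llama_token_pad(model);  // -1 when the vocab defines no PAD token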
@ -18724,7 +19269,8 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
    if (0 <= token && token < llama_n_vocab(model)) {
        switch (llama_vocab_get_type(model->vocab)) {
            case LLAMA_VOCAB_TYPE_WPM:
            case LLAMA_VOCAB_TYPE_SPM:
            case LLAMA_VOCAB_TYPE_UGM: {
                // NOTE: we accept all unsupported token types,
                // suppressing them like CONTROL tokens.
                if (llama_is_normal_token(model->vocab, token)) {
@ -18818,10 +19364,10 @@ static int32_t llama_chat_apply_template_internal(
        if (add_ass) {
            ss << "<|im_start|>assistant\n";
        }
    } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl.find("[INST]") != std::string::npos) {
        // llama2 template and its variants
        // [variant] support system message
        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos || tmpl == "mistral";
        // [variant] space before + after response
        bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
        // [variant] add BOS inside history
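A short usage sketch for the newly accepted template alias (public llama_chat_apply_template API; buffer size illustrative):

// format a conversation with the "mistral" alias added above
llama_chat_message msgs[] = {
    { "system", "You are a helpful assistant" },
    { "user",   "Hello" },
};
char buf[1024];
int32_t n = llama_chat_apply_template(nullptr, "mistral", msgs, 2, true, buf, sizeof(buf));
// n is the number of bytes needed for the formatted chat (negative on failure)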
llama.h

@ -67,6 +67,7 @@ extern "C" {
        LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
        LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
        LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
        LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
    };

    // pre-tokenization types

@ -857,6 +858,7 @@ extern "C" {
    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
    LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
    LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding

    // Returns -1 if unknown, 1 for true or 0 for false.
    LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
@ -924,6 +926,12 @@ extern "C" {
    // Grammar
    //

    /// Initialize a llama_grammar.
    ///
    /// @param rules The rule elements of the grammar to initialize.
    /// @param n_rules The number of rules.
    /// @param start_rule_index The index of the root rule (the starting point of the grammar).
    /// @return The initialized llama_grammar or nullptr if initialization failed.
    LLAMA_API struct llama_grammar * llama_grammar_init(
            const llama_grammar_element ** rules,
                                 size_t    n_rules,
@ -7,6 +7,7 @@
#include <cassert>

#include "llama.h"
#include "common.h"

int main(void) {
    llama_chat_message conversation[] = {

@ -119,5 +120,24 @@ int main(void) {
        std::cout << output << "\n-------------------------\n";
        assert(output == expected);
    }

    // test llama_chat_format_single
    std::cout << "\n\n=== llama_chat_format_single ===\n\n";
    std::vector<llama_chat_msg> chat2;
    chat2.push_back({"system", "You are a helpful assistant"});
    chat2.push_back({"user", "Hello"});
    chat2.push_back({"assistant", "I am assistant"});
    llama_chat_msg new_msg{"user", "How are you"};

    auto fmt_single = [&](std::string tmpl) {
        auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
        std::cout << "fmt_single(" << tmpl << ")\n" << output << "\n-------------------------\n";
        return output;
    };
    assert(fmt_single("chatml") == "<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
    assert(fmt_single("llama2") == "[INST] How are you [/INST]");
    assert(fmt_single("gemma") == "<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
    assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");

    return 0;
}
@ -36,10 +36,10 @@ static llama_grammar* build_grammar(const std::string & grammar_str) {
static bool test_build_grammar_fails(const std::string & grammar_str) {
    fprintf(stderr, "⚫ Testing failure for grammar: %s\n", grammar_str.c_str());
    bool grammar_fails = false;
    llama_grammar * grammar = build_grammar(grammar_str);
    if (grammar != nullptr) {
        fprintf(stderr, "  ❌ Expected build failure, but succeeded\n");
    } else {
        grammar_fails = true;
        fprintf(stdout, "  ✅︎\n");
    }
@ -148,6 +148,250 @@ static void test_schema(const std::string & test_desc, const std::string & schem
}

static void test_simple_grammar() {
    test_schema(
        "min 0",
        R"""({
            "type": "integer",
            "minimum": 0
        })""",
        // Passing strings
        {
            "0",
            "10",
            "12",
            "10000",
        },
        // Failing strings
        {
            "-1",
            "-10",
            "-10000",
            "-100000000000000000000000000000000",
            "100000000000000000000000000000000",
            "00",
            "01",
            "-0",
        }
    );
    test_schema(
        "min 2",
        // Schema
        R"""({
            "type": "integer",
            "minimum": 2
        })""",
        // Passing strings
        {
            "2",
            "3",
            "4",
            "10",
            "20",
            "1234567890000000",
        },
        // Failing strings
        {
            "0",
            "1",
            "-1",
            "-100",
            "0",
            "1",
            "01",
            "02",
            "12345678900000000",
        }
    );
    test_schema(
        "min 456",
        R"""({
            "type": "integer",
            "minimum": 456
        })""",
        // Passing strings
        {
            "456",
            "4560",
            "457",
            "460",
            "500",
        },
        // Failing strings
        {
            "455",
            "356",
            "50",
            "050",
            "-1",
            "-456",
        }
    );
    test_schema(
        "min -123",
        R"""({
            "type": "integer",
            "minimum": -123
        })""",
        // Passing strings
        {
            "-123",
            "-122",
            "-11",
            "-1",
            "0",
            "1",
            "123",
            "1234",
            "2345",
        },
        // Failing strings
        {
            "-1234",
            "-124",
        }
    );

    test_schema(
        "max 9999",
        // Schema
        R"""({
            "type": "integer",
            "maximum": 9999
        })""",
        // Passing strings
        {
            "-99999",
            "0",
            "9999",
        },
        // Failing strings
        {
            "10000",
            "99991",
        }
    );
    test_schema(
        "max -9999",
        // Schema
        R"""({
            "type": "integer",
            "maximum": -9999
        })""",
        // Passing strings
        {
            "-10000",
            "-9999",
        },
        // Failing strings
        {
            "-9998",
            "0",
            "9999",
        }
    );
    test_schema(
        "min 5 max 30",
        // Schema
        R"""({
            "type": "integer",
            "minimum": 5,
            "maximum": 30
        })""",
        // Passing strings
        {
            "5",
            "10",
            "30",
        },
        // Failing strings
        {
            "05",
            "4",
            "-1",
            "31",
            "123",
            "0123",
        }
    );
    test_schema(
        "min -1 max 1",
        R"""({
            "type": "integer",
            "minimum": -1,
            "maximum": 1
        })""",
        // Passing strings
        {
            "-1",
            "0",
            "1",
        },
        // Failing strings
        {
            "-11",
            "-10",
            "-2",
            "2",
            "10",
            "11",
        }
    );
    test_schema(
        "min -123 max 42",
        R"""({
            "type": "integer",
            "minimum": -123,
            "maximum": 42
        })""",
        // Passing strings
        {
            "-123",
            "-122",
            "-13",
            "-11",
            "-2",
            "-1",
            "0",
            "1",
            "5",
            "10",
            "39",
            "40",
            "42",
        },
        // Failing strings
        {
            "-0123",
            "-124",
            "-1123",
            "-200",
            "43",
            "123",
            "0123",
        }
    );
    test_schema(
        "exclusive min / max",
        // Schema
        R"""({
            "type": "integer",
            "exclusiveMinimum": 0,
            "exclusiveMaximum": 10000
        })""",
        // Passing strings
        {
            "1",
            "9999",
        },
        // Failing strings
        {
            "0",
            "01",
            "10000",
            "99999",
        }
    );

    // Test case for a simple grammar
    test_grammar(
        "simple grammar",
@ -80,6 +80,232 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        runner(tc);
    };

    test({
        SUCCESS,
        "min 0",
        R"""({
            "type": "integer",
            "minimum": 0
        })""",
        R"""(
            root ::= ([0] | [1-9] [0-9]{0,15}) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min 1",
        R"""({
            "type": "integer",
            "minimum": 1
        })""",
        R"""(
            root ::= ([1-9] [0-9]{0,15}) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min 3",
        R"""({
            "type": "integer",
            "minimum": 3
        })""",
        R"""(
            root ::= ([1-2] [0-9]{1,15} | [3-9] [0-9]{0,15}) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min 9",
        R"""({
            "type": "integer",
            "minimum": 9
        })""",
        R"""(
            root ::= ([1-8] [0-9]{1,15} | [9] [0-9]{0,15}) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min 10",
        R"""({
            "type": "integer",
            "minimum": 10
        })""",
        R"""(
            root ::= ([1] ([0-9]{1,15}) | [2-9] [0-9]{1,15}) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min 25",
        R"""({
            "type": "integer",
            "minimum": 25
        })""",
        R"""(
            root ::= ([1] [0-9]{2,15} | [2] ([0-4] [0-9]{1,14} | [5-9] [0-9]{0,14}) | [3-9] [0-9]{1,15}) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "max 30",
        R"""({
            "type": "integer",
            "maximum": 30
        })""",
        R"""(
            root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-2] [0-9] | [3] "0")) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min -5",
        R"""({
            "type": "integer",
            "minimum": -5
        })""",
        R"""(
            root ::= ("-" ([0-5]) | [0] | [1-9] [0-9]{0,15}) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min -123",
        R"""({
            "type": "integer",
            "minimum": -123
        })""",
        R"""(
            root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0] | [1-9] [0-9]{0,15}) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "max -5",
        R"""({
            "type": "integer",
            "maximum": -5
        })""",
        R"""(
            root ::= ("-" ([0-4] [0-9]{1,15} | [5-9] [0-9]{0,15})) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "max 1",
        R"""({
            "type": "integer",
            "maximum": 1
        })""",
        R"""(
            root ::= ("-" [1-9] [0-9]{0,15} | [0-1]) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "max 100",
        R"""({
            "type": "integer",
            "maximum": 100
        })""",
        R"""(
            root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-8] [0-9] | [9] [0-9]) | "100") space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min 0 max 23",
        R"""({
            "type": "integer",
            "minimum": 0,
            "maximum": 23
        })""",
        R"""(
            root ::= ([0-9] | ([1] [0-9] | [2] [0-3])) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min 15 max 300",
        R"""({
            "type": "integer",
            "minimum": 15,
            "maximum": 300
        })""",
        R"""(
            root ::= (([1] ([5-9]) | [2-9] [0-9]) | ([1-2] [0-9]{2} | [3] "00")) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min 5 max 30",
        R"""({
            "type": "integer",
            "minimum": 5,
            "maximum": 30
        })""",
        R"""(
            root ::= ([5-9] | ([1-2] [0-9] | [3] "0")) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min -123 max 42",
        R"""({
            "type": "integer",
            "minimum": -123,
            "maximum": 42
        })""",
        R"""(
            root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0-9] | ([1-3] [0-9] | [4] [0-2])) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min -10 max 10",
        R"""({
            "type": "integer",
            "minimum": -10,
            "maximum": 10
        })""",
        R"""(
            root ::= ("-" ([0-9] | "10") | [0-9] | "10") space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        FAILURE,
        "unknown type",

@ -422,6 +648,44 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        )"""
    });

    test({
        SUCCESS,
        "min + max items with min + max values across zero",
        R"""({
            "items": {
                "type": "integer",
                "minimum": -12,
                "maximum": 207
            },
            "minItems": 3,
            "maxItems": 5
        })""",
        R"""(
            item ::= ("-" ([0-9] | "1" [0-2]) | [0-9] | ([1-8] [0-9] | [9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7])) space
            root ::= "[" space item ("," space item){2,4} "]" space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min + max items with min + max values",
        R"""({
            "items": {
                "type": "integer",
                "minimum": 12,
                "maximum": 207
            },
            "minItems": 3,
            "maxItems": 5
        })""",
        R"""(
            item ::= (([1] ([2-9]) | [2-9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7])) space
            root ::= "[" space item ("," space item){2,4} "]" space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "simple regexp",
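A hedged driver sketch for the schemas exercised above (assumes the json_schema_to_grammar helper from common/json-schema-to-grammar.h and nlohmann::ordered_json; the exact signature is an assumption here):

#include "json-schema-to-grammar.h"
#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    // convert the "min 25" schema from the tests above into GBNF
    auto schema = nlohmann::ordered_json::parse(R"""({
        "type": "integer",
        "minimum": 25
    })""");
    std::cout << json_schema_to_grammar(schema) << "\n";
    return 0;
}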
@ -116,6 +116,10 @@ int main()
    std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
    grammar = llama_grammar_init(
        grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
    if (grammar == nullptr)
    {
        throw std::runtime_error("Failed to initialize llama_grammar");
    }

    std::vector<std::vector<llama_grammar_element>> expected_stacks = {
        {
@ -23,7 +23,7 @@ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
    return result;
}

uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
    assert(offset < utf8.size());
    if (!(utf8[offset + 0] & 0x80)) {
        auto result = utf8[offset + 0];

@ -48,6 +48,7 @@ struct codepoint_flags {

std::string unicode_cpt_to_utf8(uint32_t cp);
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);

std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
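With unicode_cpt_from_utf8 now exported from unicode.h (the static qualifier was dropped above), callers can walk a UTF-8 string one code point at a time; a minimal sketch (illustration only):

#include "unicode.h"
#include <cstdio>
#include <string>

int main() {
    std::string s = "a\xE2\x96\x81z";   // 'a', U+2581, 'z'
    size_t offset = 0;
    while (offset < s.size()) {
        uint32_t cpt = unicode_cpt_from_utf8(s, offset);  // advances offset past the code point
        printf("U+%04X\n", cpt);
    }
    return 0;
}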