Merge branch 'master' into fix-logging-main.cpp

commit fc06b628f8
Author: Kurt Manucredo, 2024-10-11 15:20:06 +02:00 (committed by GitHub)
GPG signature: no known key found for this signature in database (key ID: B5690EEEBB952194)
63 changed files with 1844 additions and 1643 deletions

.devops/full-musa.Dockerfile (new file)

@@ -0,0 +1,26 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc) && \
cp build/bin/* .
ENTRYPOINT ["/app/.devops/tools.sh"]

.devops/llama-cli-musa.Dockerfile (new file)

@@ -0,0 +1,30 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the MUSA runtime image
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
RUN apt-get update && \
apt-get install -y build-essential git cmake
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-cli -j$(nproc)
FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libgomp1
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
COPY --from=build /app/build/src/libllama.so /libllama.so
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENTRYPOINT [ "/llama-cli" ]

.devops/llama-server-musa.Dockerfile (new file)

@@ -0,0 +1,35 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the MUSA runtime image
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-server -j$(nproc)
FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
COPY --from=build /app/build/src/libllama.so /libllama.so
COPY --from=build /app/build/bin/llama-server /llama-server
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

.github/workflows/docker.yml

@@ -43,6 +43,9 @@ jobs:
- { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+ - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
+ - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
+ - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
#- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
#- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }

CMakeLists.txt

@@ -63,7 +63,7 @@ option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
# utils
-option(LLAMA_BUILD_COMMON "llama: build common utils library" ON)
+option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
# extra artifacts
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
@@ -201,12 +201,12 @@ if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
endif()
-if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)
endif()
-if (LLAMA_BUILD_EXAMPLES)
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(pocs)
endif()

common/arg.cpp (file diff suppressed because it is too large)

common/arg.h

@@ -10,7 +10,7 @@
// CLI argument parsing
//
-struct llama_arg {
+struct common_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::vector<const char *> args;
const char * value_hint = nullptr; // help text or example for arg value
@@ -18,60 +18,60 @@ struct llama_arg {
const char * env = nullptr;
std::string help;
bool is_sparam = false; // is current arg a sampling param?
-void (*handler_void) (gpt_params & params) = nullptr;
+void (*handler_void) (common_params & params) = nullptr;
-void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
+void (*handler_string) (common_params & params, const std::string &) = nullptr;
-void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
+void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
-void (*handler_int) (gpt_params & params, int) = nullptr;
+void (*handler_int) (common_params & params, int) = nullptr;
-llama_arg(
+common_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
-void (*handler)(gpt_params & params, const std::string &)
+void (*handler)(common_params & params, const std::string &)
) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
-llama_arg(
+common_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
-void (*handler)(gpt_params & params, int)
+void (*handler)(common_params & params, int)
) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
-llama_arg(
+common_arg(
const std::initializer_list<const char *> & args,
const std::string & help,
-void (*handler)(gpt_params & params)
+void (*handler)(common_params & params)
) : args(args), help(help), handler_void(handler) {}
// support 2 values for arg
-llama_arg(
+common_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const char * value_hint_2,
const std::string & help,
-void (*handler)(gpt_params & params, const std::string &, const std::string &)
+void (*handler)(common_params & params, const std::string &, const std::string &)
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
-llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
+common_arg & set_examples(std::initializer_list<enum llama_example> examples);
-llama_arg & set_env(const char * env);
+common_arg & set_env(const char * env);
-llama_arg & set_sparam();
+common_arg & set_sparam();
bool in_example(enum llama_example ex);
bool get_value_from_env(std::string & output);
bool has_value_from_env();
std::string to_string();
};
-struct gpt_params_context {
+struct common_params_context {
enum llama_example ex = LLAMA_EXAMPLE_COMMON;
-gpt_params & params;
+common_params & params;
-std::vector<llama_arg> options;
+std::vector<common_arg> options;
void(*print_usage)(int, char **) = nullptr;
-gpt_params_context(gpt_params & params) : params(params) {}
+common_params_context(common_params & params) : params(params) {}
};
// parse input arguments from CLI
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
// function to be used by test-arg-parser
-gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
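Note: the constructors above take plain function pointers, so option handlers are typically non-capturing lambdas. A minimal sketch of registering one option against the renamed API follows; the specific option, environment variable, and helper function are illustrative, not part of this diff.

// Illustrative only: register a hypothetical "-n/--n-predict" option with the renamed types.
#include "arg.h"
#include "common.h"
#include <vector>

static void register_example_option(std::vector<common_arg> & options) {
    options.push_back(common_arg(
        {"-n", "--n-predict"}, "N",
        "number of tokens to predict",
        [](common_params & params, int value) {
            params.n_predict = value; // handler must be a non-capturing lambda (plain function pointer)
        }
    ).set_env("LLAMA_ARG_N_PREDICT"));
}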

common/common.cpp

@@ -362,10 +362,10 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
return true;
}
-void gpt_init() {
+void common_init() {
llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
-if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
+if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
-gpt_log_add(gpt_log_main(), level, "%s", text);
+common_log_add(common_log_main(), level, "%s", text);
}
}, NULL);
@@ -378,7 +378,7 @@ void gpt_init() {
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
}
-std::string gpt_params_get_system_info(const gpt_params & params) {
+std::string common_params_get_system_info(const common_params & params) {
std::ostringstream os;
os << "system_info: n_threads = " << params.cpuparams.n_threads;
@@ -493,7 +493,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
first = false;
}
-auto detokenized = llama_token_to_piece(ctx, token);
+auto detokenized = common_token_to_piece(ctx, token);
detokenized.erase(
std::remove_if(
@@ -524,7 +524,7 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
first = false;
}
-auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
+auto detokenized = common_token_to_piece(ctx, batch.token[i]);
detokenized.erase(
std::remove_if(
@@ -819,16 +819,16 @@ std::string fs_get_cache_file(const std::string & filename) {
//
// Model utils
//
-struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+struct common_init_result common_init_from_params(common_params & params) {
-llama_init_result iparams;
+common_init_result iparams;
-auto mparams = llama_model_params_from_gpt_params(params);
+auto mparams = common_model_params_to_llama(params);
llama_model * model = nullptr;
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else if (!params.model_url.empty()) {
-model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}
@@ -863,7 +863,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
}
}
-auto cparams = llama_context_params_from_gpt_params(params);
+auto cparams = common_context_params_to_llama(params);
llama_context * lctx = llama_new_context_with_model(model, cparams);
if (lctx == NULL) {
@@ -876,7 +876,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
-const auto cvec = llama_control_vector_load(params.control_vectors);
+const auto cvec = common_control_vector_load(params.control_vectors);
if (cvec.n_embd == -1) {
llama_free(lctx);
llama_free_model(model);
@@ -900,7 +900,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
-llama_lora_adapter_container loaded_la;
+common_lora_adapter_container loaded_la;
loaded_la.path = la.path;
loaded_la.scale = la.scale;
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
@@ -913,7 +913,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
}
if (!params.lora_init_without_apply) {
-llama_lora_adapters_apply(lctx, iparams.lora_adapters);
+common_lora_adapters_apply(lctx, iparams.lora_adapters);
}
if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -961,7 +961,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
return iparams;
}
-void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
llama_lora_adapter_clear(ctx);
for (auto & la : lora_adapters) {
if (la.scale != 0.0f) {
@@ -970,7 +970,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor
}
}
-struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
+struct llama_model_params common_model_params_to_llama(const common_params & params) {
auto mparams = llama_model_default_params();
if (params.n_gpu_layers != -1) {
@@ -1022,7 +1022,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
throw std::runtime_error("Invalid cache type: " + s);
}
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
+struct llama_context_params common_context_params_to_llama(const common_params & params) {
auto cparams = llama_context_default_params();
cparams.n_ctx = params.n_ctx;
@@ -1112,7 +1112,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
return false;
}
-static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
+static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
@@ -1182,15 +1182,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
}
// Send a HEAD request to retrieve the etag and last-modified headers
-struct llama_load_model_from_url_headers {
+struct common_load_model_from_url_headers {
std::string etag;
std::string last_modified;
};
-llama_load_model_from_url_headers headers;
+common_load_model_from_url_headers headers;
{
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
+common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
static std::regex header_regex("([^:]+): (.*)\r\n");
static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1326,7 +1326,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
return true;
}
-struct llama_model * llama_load_model_from_url(
+struct llama_model * common_load_model_from_url(
const char * model_url,
const char * path_model,
const char * hf_token,
@@ -1337,7 +1337,7 @@ struct llama_model * llama_load_model_from_url(
return NULL;
}
-if (!llama_download_file(model_url, path_model, hf_token)) {
+if (!common_download_file(model_url, path_model, hf_token)) {
return NULL;
}
@@ -1390,7 +1390,7 @@ struct llama_model * llama_load_model_from_url(
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
-return llama_download_file(split_url, split_path, hf_token);
+return common_download_file(split_url, split_path, hf_token);
}, idx));
}
@@ -1405,7 +1405,7 @@ struct llama_model * llama_load_model_from_url(
return llama_load_model_from_file(path_model, params);
}
-struct llama_model * llama_load_model_from_hf(
+struct llama_model * common_load_model_from_hf(
const char * repo,
const char * model,
const char * path_model,
@@ -1425,12 +1425,12 @@ struct llama_model * llama_load_model_from_hf(
model_url += "/resolve/main/";
model_url += model;
-return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
+return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
}
#else
-struct llama_model * llama_load_model_from_url(
+struct llama_model * common_load_model_from_url(
const char * /*model_url*/,
const char * /*path_model*/,
const char * /*hf_token*/,
@@ -1439,7 +1439,7 @@ struct llama_model * llama_load_model_from_url(
return nullptr;
}
-struct llama_model * llama_load_model_from_hf(
+struct llama_model * common_load_model_from_hf(
const char * /*repo*/,
const char * /*model*/,
const char * /*path_model*/,
@@ -1455,11 +1455,11 @@ struct llama_model * llama_load_model_from_hf(
// Batch utils
//
-void llama_batch_clear(struct llama_batch & batch) {
+void common_batch_clear(struct llama_batch & batch) {
batch.n_tokens = 0;
}
-void llama_batch_add(
+void common_batch_add(
struct llama_batch & batch,
llama_token id,
llama_pos pos,
@@ -1482,15 +1482,15 @@ void llama_batch_add(
// Vocab utils
//
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
const struct llama_context * ctx,
const std::string & text,
bool add_special,
bool parse_special) {
-return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
}
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_special,
@@ -1509,7 +1509,7 @@ std::vector<llama_token> llama_tokenize(
return result;
}
-std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
std::string piece;
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
@@ -1525,7 +1525,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
return piece;
}
-std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
std::string text;
text.resize(std::max(text.capacity(), tokens.size()));
int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
@@ -1545,15 +1545,15 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
// Chat template utils
//
-bool llama_chat_verify_template(const std::string & tmpl) {
+bool common_chat_verify_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
return res >= 0;
}
-std::string llama_chat_apply_template(const struct llama_model * model,
+std::string common_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
-const std::vector<llama_chat_msg> & msgs,
+const std::vector<common_chat_msg> & msgs,
bool add_ass) {
int alloc_size = 0;
bool fallback = false; // indicate if we must fallback to default chatml
@@ -1595,42 +1595,42 @@ std::string llama_chat_apply_template(const struct llama_model * model,
return formatted_chat;
}
-std::string llama_chat_format_single(const struct llama_model * model,
+std::string common_chat_format_single(const struct llama_model * model,
const std::string & tmpl,
-const std::vector<llama_chat_msg> & past_msg,
+const std::vector<common_chat_msg> & past_msg,
-const llama_chat_msg & new_msg,
+const common_chat_msg & new_msg,
bool add_ass) {
std::ostringstream ss;
-auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
+auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
-std::vector<llama_chat_msg> chat_new(past_msg);
+std::vector<common_chat_msg> chat_new(past_msg);
// if the past_msg ends with a newline, we must preserve it in the formatted version
if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
ss << "\n";
};
// format chat with new_msg
chat_new.push_back(new_msg);
-auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
+auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
// get the diff part
ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
return ss.str();
}
-std::string llama_chat_format_example(const struct llama_model * model,
+std::string common_chat_format_example(const struct llama_model * model,
const std::string & tmpl) {
-std::vector<llama_chat_msg> msgs = {
+std::vector<common_chat_msg> msgs = {
{"system", "You are a helpful assistant"},
{"user", "Hello"},
{"assistant", "Hi there"},
{"user", "How are you?"},
};
-return llama_chat_apply_template(model, tmpl, msgs, true);
+return common_chat_apply_template(model, tmpl, msgs, true);
}
//
// KV cache utils
//
-void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
@@ -1653,7 +1653,7 @@ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
printf("\n=== Done dumping\n");
}
-void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
@@ -1705,7 +1705,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
// Embedding utils
//
-void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
double sum = 0.0;
switch (embd_norm) {
@@ -1739,7 +1739,7 @@ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm)
}
}
-float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
double sum = 0.0;
double sum1 = 0.0;
double sum2 = 0.0;
@@ -1765,8 +1765,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
// Control vector utils
//
-static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
+static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
-llama_control_vector_data result = { -1, {} };
+common_control_vector_data result = { -1, {} };
ggml_context * ctx = nullptr;
struct gguf_init_params meta_gguf_params = {
@@ -1850,11 +1850,11 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
return result;
}
-llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
-llama_control_vector_data result = { -1, {} };
+common_control_vector_data result = { -1, {} };
for (const auto & info : load_infos) {
-auto cur = llama_control_vector_load_one(info);
+auto cur = common_control_vector_load_one(info);
if (cur.n_embd == -1) {
result.n_embd = -1;
@@ -1946,7 +1946,7 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
}
}
-void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
+void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
const auto & sparams = params.sparams;
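For orientation, here is a minimal sketch of a caller updated for the rename; the `params.prompt` field and the overall flow are assumptions for illustration, not part of this diff.

// Illustrative only: model/context setup and tokenization with the renamed helpers.
#include "common.h"
#include "log.h"
#include "llama.h"
#include <vector>

int run_example(common_params & params) {
    common_init(); // was gpt_init()

    common_init_result llama_init = common_init_from_params(params); // was llama_init_from_gpt_params()
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;
    if (model == nullptr || ctx == nullptr) {
        return 1;
    }

    // was llama_tokenize() / llama_token_to_piece()
    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
    for (llama_token tok : tokens) {
        LOG_INF("%s", common_token_to_piece(ctx, tok).c_str());
    }

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}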

common/common.h

@@ -24,12 +24,12 @@
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
-struct llama_lora_adapter_info {
+struct common_lora_adapter_info {
std::string path;
float scale;
};
-struct llama_lora_adapter_container : llama_lora_adapter_info {
+struct common_lora_adapter_container : common_lora_adapter_info {
struct llama_lora_adapter * adapter;
};
@@ -39,7 +39,7 @@ extern char const * LLAMA_COMMIT;
extern char const * LLAMA_COMPILER;
extern char const * LLAMA_BUILD_TARGET;
-struct llama_control_vector_load_info;
+struct common_control_vector_load_info;
//
// CPU utils
@@ -82,14 +82,14 @@ enum llama_example {
LLAMA_EXAMPLE_COUNT,
};
-enum gpt_sampler_type {
+enum common_sampler_type {
-GPT_SAMPLER_TYPE_NONE = 0,
+COMMON_SAMPLER_TYPE_NONE = 0,
-GPT_SAMPLER_TYPE_TOP_K = 1,
+COMMON_SAMPLER_TYPE_TOP_K = 1,
-GPT_SAMPLER_TYPE_TOP_P = 2,
+COMMON_SAMPLER_TYPE_TOP_P = 2,
-GPT_SAMPLER_TYPE_MIN_P = 3,
+COMMON_SAMPLER_TYPE_MIN_P = 3,
-GPT_SAMPLER_TYPE_TFS_Z = 4,
+COMMON_SAMPLER_TYPE_TFS_Z = 4,
-GPT_SAMPLER_TYPE_TYPICAL_P = 5,
+COMMON_SAMPLER_TYPE_TYPICAL_P = 5,
-GPT_SAMPLER_TYPE_TEMPERATURE = 6,
+COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
};
// dimensionality reduction methods, used by cvector-generator
@@ -99,7 +99,7 @@ enum dimre_method {
};
// sampler parameters
-struct gpt_sampler_params {
+struct common_sampler_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
int32_t n_prev = 64; // number of previous tokens to remember
@@ -124,13 +124,13 @@ struct gpt_sampler_params {
bool ignore_eos = false;
bool no_perf = false; // disable performance metrics
-std::vector<enum gpt_sampler_type> samplers = {
+std::vector<enum common_sampler_type> samplers = {
-GPT_SAMPLER_TYPE_TOP_K,
+COMMON_SAMPLER_TYPE_TOP_K,
-GPT_SAMPLER_TYPE_TFS_Z,
+COMMON_SAMPLER_TYPE_TFS_Z,
-GPT_SAMPLER_TYPE_TYPICAL_P,
+COMMON_SAMPLER_TYPE_TYPICAL_P,
-GPT_SAMPLER_TYPE_TOP_P,
+COMMON_SAMPLER_TYPE_TOP_P,
-GPT_SAMPLER_TYPE_MIN_P,
+COMMON_SAMPLER_TYPE_MIN_P,
-GPT_SAMPLER_TYPE_TEMPERATURE
+COMMON_SAMPLER_TYPE_TEMPERATURE
};
std::string grammar; // optional BNF-like grammar to constrain sampling
@@ -141,7 +141,7 @@ struct gpt_sampler_params {
std::string print() const;
};
-struct gpt_params {
+struct common_params {
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -183,7 +183,7 @@ struct gpt_params {
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
-struct gpt_sampler_params sparams;
+struct common_sampler_params sparams;
std::string model = ""; // model path // NOLINT
std::string model_draft = ""; // draft model for speculative decoding // NOLINT
@@ -208,9 +208,9 @@ struct gpt_params {
std::vector<llama_model_kv_override> kv_overrides;
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
-std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
int32_t verbosity = 0;
int32_t control_vector_layer_start = -1; // layer range for control vector
@@ -348,9 +348,9 @@ struct gpt_params {
// call once at the start of a program if it uses libcommon
// initializes the logging system and prints info about the build
-void gpt_init();
+void common_init();
-std::string gpt_params_get_system_info(const gpt_params & params);
+std::string common_params_get_system_info(const common_params & params);
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
@@ -404,29 +404,29 @@ std::string fs_get_cache_file(const std::string & filename);
// Model utils
//
-struct llama_init_result {
+struct common_init_result {
struct llama_model * model = nullptr;
struct llama_context * context = nullptr;
-std::vector<llama_lora_adapter_container> lora_adapters;
+std::vector<common_lora_adapter_container> lora_adapters;
};
-struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
+struct common_init_result common_init_from_params(common_params & params);
-struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
+struct llama_model_params common_model_params_to_llama (const common_params & params);
-struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
+struct llama_context_params common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
// clear LoRA adapters from context, then apply new list of adapters
-void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
// Batch utils
-void llama_batch_clear(struct llama_batch & batch);
+void common_batch_clear(struct llama_batch & batch);
-void llama_batch_add(
+void common_batch_add(
struct llama_batch & batch,
llama_token id,
llama_pos pos,
@@ -439,13 +439,13 @@ void llama_batch_add(
// tokenizes a string into a vector of tokens
// should work similar to Python's `tokenizer.encode`
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
const struct llama_context * ctx,
const std::string & text,
bool add_special,
bool parse_special = false);
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_special,
@@ -453,7 +453,7 @@ std::vector<llama_token> llama_tokenize(
// tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
-std::string llama_token_to_piece(
+std::string common_token_to_piece(
const struct llama_context * ctx,
llama_token token,
bool special = true);
@@ -461,7 +461,7 @@ std::string llama_token_to_piece(
// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
// optionally renders special/control tokens
-std::string llama_detokenize(
+std::string common_detokenize(
llama_context * ctx,
const std::vector<llama_token> & tokens,
bool special = true);
@@ -471,31 +471,31 @@ std::string llama_detokenize(
//
// same with llama_chat_message, but uses std::string
-struct llama_chat_msg {
+struct common_chat_msg {
std::string role;
std::string content;
};
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool llama_chat_verify_template(const std::string & tmpl);
+bool common_chat_verify_template(const std::string & tmpl);
// CPP wrapper for llama_chat_apply_template
// If the built-in template is not supported, we default to chatml
// If the custom "tmpl" is not supported, we throw an error
-std::string llama_chat_apply_template(const struct llama_model * model,
+std::string common_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
-const std::vector<llama_chat_msg> & chat,
+const std::vector<common_chat_msg> & chat,
bool add_ass);
// Format single message, while taking into account the position of that message in chat history
-std::string llama_chat_format_single(const struct llama_model * model,
+std::string common_chat_format_single(const struct llama_model * model,
const std::string & tmpl,
-const std::vector<llama_chat_msg> & past_msg,
+const std::vector<common_chat_msg> & past_msg,
-const llama_chat_msg & new_msg,
+const common_chat_msg & new_msg,
bool add_ass);
// Returns an example of formatted chat
-std::string llama_chat_format_example(const struct llama_model * model,
+std::string common_chat_format_example(const struct llama_model * model,
const std::string & tmpl);
//
@@ -503,31 +503,31 @@ std::string llama_chat_format_example(const struct llama_model * model,
//
// Dump the KV cache view with the number of sequences per cell.
-void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
// Dump the KV cache view showing individual sequences in each cell (long output).
-void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
//
// Embedding utils
//
-void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
-float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
//
// Control vector utils
//
-struct llama_control_vector_data {
+struct common_control_vector_data {
int n_embd;
// stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
std::vector<float> data;
};
-struct llama_control_vector_load_info {
+struct common_control_vector_load_info {
float strength;
std::string fname;
@@ -535,7 +535,7 @@ struct llama_control_vector_load_info {
// Load control vectors, scale each by strength, and add them together.
// On error, returns {-1, empty}
-llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
//
// Split utils
@@ -554,5 +554,5 @@ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
void yaml_dump_non_result_info(
-FILE * stream, const gpt_params & params, const llama_context * lctx,
+FILE * stream, const common_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
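As a usage note for the renamed chat helpers declared above, a minimal hypothetical sketch follows; it assumes `model` was loaded elsewhere and relies on the documented fallback to the model's built-in template (or chatml) when an empty template string is passed.

// Illustrative only: format a short chat with the renamed wrapper.
#include "common.h"
#include <string>
#include <vector>

std::string format_greeting(const llama_model * model) {
    std::vector<common_chat_msg> msgs = {
        {"system", "You are a helpful assistant"},
        {"user",   "Hello"},
    };
    // was llama_chat_apply_template(); add_ass appends the assistant prefix
    return common_chat_apply_template(model, "", msgs, /*add_ass=*/true);
}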

common/log.cpp

@@ -8,10 +8,10 @@
#include <thread>
#include <vector>
-int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
+int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
-void gpt_log_set_verbosity_thold(int verbosity) {
+void common_log_set_verbosity_thold(int verbosity) {
-gpt_log_verbosity_thold = verbosity;
+common_log_verbosity_thold = verbosity;
}
#define LOG_COL_DEFAULT "\033[0m"
@@ -29,16 +29,16 @@ static int64_t t_us() {
}
// colors
-enum gpt_log_col : int {
+enum common_log_col : int {
-GPT_LOG_COL_DEFAULT = 0,
+COMMON_LOG_COL_DEFAULT = 0,
-GPT_LOG_COL_BOLD,
+COMMON_LOG_COL_BOLD,
-GPT_LOG_COL_RED,
+COMMON_LOG_COL_RED,
-GPT_LOG_COL_GREEN,
+COMMON_LOG_COL_GREEN,
-GPT_LOG_COL_YELLOW,
+COMMON_LOG_COL_YELLOW,
-GPT_LOG_COL_BLUE,
+COMMON_LOG_COL_BLUE,
-GPT_LOG_COL_MAGENTA,
+COMMON_LOG_COL_MAGENTA,
-GPT_LOG_COL_CYAN,
+COMMON_LOG_COL_CYAN,
-GPT_LOG_COL_WHITE,
+COMMON_LOG_COL_WHITE,
};
// disable colors by default
@@ -54,7 +54,7 @@ static std::vector<const char *> g_col = {
"",
};
-struct gpt_log_entry {
+struct common_log_entry {
enum ggml_log_level level;
bool prefix;
@@ -71,7 +71,7 @@ struct gpt_log_entry {
if (!fcur) {
// stderr displays DBG messages only when their verbosity level is not higher than the threshold
// these messages will still be logged to a file
-if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
+if (level == GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
return;
}
@@ -86,19 +86,19 @@ struct gpt_log_entry {
if (timestamp) {
// [M.s.ms.us]
fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
-g_col[GPT_LOG_COL_BLUE],
+g_col[COMMON_LOG_COL_BLUE],
(int) (timestamp / 1000000 / 60),
(int) (timestamp / 1000000 % 60),
(int) (timestamp / 1000 % 1000),
(int) (timestamp % 1000),
-g_col[GPT_LOG_COL_DEFAULT]);
+g_col[COMMON_LOG_COL_DEFAULT]);
}
switch (level) {
-case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break;
+case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN], g_col[COMMON_LOG_COL_DEFAULT]); break;
-case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break;
+case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], "" ); break;
-case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break;
+case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED], "" ); break;
-case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break;
+case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW], "" ); break;
default:
break;
}
@@ -107,18 +107,18 @@ struct gpt_log_entry {
fprintf(fcur, "%s", msg.data());
if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
-fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
+fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]);
}
fflush(fcur);
}
};
-struct gpt_log {
+struct common_log {
// default capacity - will be expanded if needed
-gpt_log() : gpt_log(256) {}
+common_log() : common_log(256) {}
-gpt_log(size_t capacity) {
+common_log(size_t capacity) {
file = nullptr;
prefix = false;
timestamps = false;
@@ -137,7 +137,7 @@ struct gpt_log {
resume();
}
-~gpt_log() {
+~common_log() {
pause();
if (file) {
fclose(file);
@@ -158,12 +158,12 @@ private:
int64_t t_start;
// ring buffer of entries
-std::vector<gpt_log_entry> entries;
+std::vector<common_log_entry> entries;
size_t head;
size_t tail;
// worker thread copies into this
-gpt_log_entry cur;
+common_log_entry cur;
public:
void add(enum ggml_log_level level, const char * fmt, va_list args) {
@@ -219,7 +219,7 @@ public:
tail = (tail + 1) % entries.size();
if (tail == head) {
// expand the buffer
-std::vector<gpt_log_entry> new_entries(2*entries.size());
+std::vector<common_log_entry> new_entries(2*entries.size());
size_t new_tail = 0;
@@ -320,15 +320,15 @@ public:
pause();
if (colors) {
-g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
+g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
-g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD;
+g_col[COMMON_LOG_COL_BOLD] = LOG_COL_BOLD;
-g_col[GPT_LOG_COL_RED] = LOG_COL_RED;
+g_col[COMMON_LOG_COL_RED] = LOG_COL_RED;
-g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN;
+g_col[COMMON_LOG_COL_GREEN] = LOG_COL_GREEN;
-g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW;
+g_col[COMMON_LOG_COL_YELLOW] = LOG_COL_YELLOW;
-g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE;
+g_col[COMMON_LOG_COL_BLUE] = LOG_COL_BLUE;
-g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
+g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
-g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN;
+g_col[COMMON_LOG_COL_CYAN] = LOG_COL_CYAN;
-g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE;
+g_col[COMMON_LOG_COL_WHITE] = LOG_COL_WHITE;
} else {
for (size_t i = 0; i < g_col.size(); i++) {
g_col[i] = "";
@@ -355,47 +355,47 @@ public:
// public API
//
-struct gpt_log * gpt_log_init() {
+struct common_log * common_log_init() {
-return new gpt_log;
+return new common_log;
}
-struct gpt_log * gpt_log_main() {
+struct common_log * common_log_main() {
-static struct gpt_log log;
+static struct common_log log;
return &log;
}
-void gpt_log_pause(struct gpt_log * log) {
+void common_log_pause(struct common_log * log) {
log->pause();
}
-void gpt_log_resume(struct gpt_log * log) {
+void common_log_resume(struct common_log * log) {
log->resume();
}
-void gpt_log_free(struct gpt_log * log) {
+void common_log_free(struct common_log * log) {
delete log;
}
-void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
+void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...) {
va_list args;
va_start(args, fmt);
log->add(level, fmt, args);
va_end(args);
}
-void gpt_log_set_file(struct gpt_log * log, const char * file) {
+void common_log_set_file(struct common_log * log, const char * file) {
log->set_file(file);
}
-void gpt_log_set_colors(struct gpt_log * log, bool colors) {
+void common_log_set_colors(struct common_log * log, bool colors) {
log->set_colors(colors);
}
-void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
+void common_log_set_prefix(struct common_log * log, bool prefix) {
log->set_prefix(prefix);
}
-void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
+void common_log_set_timestamps(struct common_log * log, bool timestamps) {
log->set_timestamps(timestamps);
}

common/log.h

@ -14,23 +14,23 @@
#define LOG_DEFAULT_LLAMA 0 #define LOG_DEFAULT_LLAMA 0
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
// set via gpt_log_set_verbosity() // set via common_log_set_verbosity()
extern int gpt_log_verbosity_thold; extern int common_log_verbosity_thold;
void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe void common_log_set_verbosity_thold(int verbosity); // not thread-safe
// the gpt_log uses an internal worker thread to print/write log messages // the common_log uses an internal worker thread to print/write log messages
// when the worker thread is paused, incoming log messages are discarded // when the worker thread is paused, incoming log messages are discarded
struct gpt_log; struct common_log;
struct gpt_log * gpt_log_init(); struct common_log * common_log_init();
struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe
void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
void gpt_log_free (struct gpt_log * log); void common_log_free (struct common_log * log);
LOG_ATTRIBUTE_FORMAT(3, 4) LOG_ATTRIBUTE_FORMAT(3, 4)
void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...); void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...);
// defaults: file = NULL, colors = false, prefix = false, timestamps = false // defaults: file = NULL, colors = false, prefix = false, timestamps = false
// //
@ -54,10 +54,10 @@ void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * f
// D - debug (stderr, V = LOG_DEFAULT_DEBUG) // D - debug (stderr, V = LOG_DEFAULT_DEBUG)
// //
void gpt_log_set_file (struct gpt_log * log, const char * file); // not thread-safe void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
void gpt_log_set_colors (struct gpt_log * log, bool colors); // not thread-safe void common_log_set_colors (struct common_log * log, bool colors); // not thread-safe
void gpt_log_set_prefix (struct gpt_log * log, bool prefix); // whether to output prefix to each log void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
// helper macros for logging // helper macros for logging
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
@ -66,13 +66,13 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w
// //
// LOG_DBG("this is a debug message: %d\n", expensive_function()); // LOG_DBG("this is a debug message: %d\n", expensive_function());
// //
// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold // this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > common_log_verbosity_thold
// //
#define LOG_TMPL(level, verbosity, ...) \ #define LOG_TMPL(level, verbosity, ...) \
do { \ do { \
if ((verbosity) <= gpt_log_verbosity_thold) { \ if ((verbosity) <= common_log_verbosity_thold) { \
gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \ common_log_add(common_log_main(), (level), __VA_ARGS__); \
} \ } \
} while (0) } while (0)
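
To make the gating concrete, here is a small hedged example; it assumes the usual defaults where `LOG_INF` maps to verbosity 0 and `LOG_DBG` to a higher debug verbosity (`LOG_DEFAULT_DEBUG`), so with a threshold of 0 the debug call's arguments are never evaluated:

```cpp
// Sketch only, assuming the common "log.h" defines LOG_INF/LOG_DBG on top of LOG_TMPL
// and that LOG_DBG uses a debug verbosity greater than 0.
#include "log.h"

static int expensive_function() {
    // stands in for any costly computation
    return 42;
}

int main() {
    common_log_set_verbosity_thold(0);   // only verbosity <= 0 passes the LOG_TMPL check
    LOG_INF("info is printed: %d\n", 1);
    LOG_DBG("debug is skipped, argument not evaluated: %d\n", expensive_function());
    return 0;
}
```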

View file

@ -8,7 +8,7 @@
#include <fstream> #include <fstream>
#include <thread> #include <thread>
void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
std::vector<llama_token> & inp, int nnew, bool print_progress) { std::vector<llama_token> & inp, int nnew, bool print_progress) {
const int64_t t_start_ms = ggml_time_ms(); const int64_t t_start_ms = ggml_time_ms();
const int64_t inp_size = inp.size(); const int64_t inp_size = inp.size();
@ -20,16 +20,16 @@ void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, in
const int64_t i_start = std::max(inp_size - nnew, ngram_size); const int64_t i_start = std::max(inp_size - nnew, ngram_size);
for (int64_t i = i_start; i < inp_size; ++i) { for (int64_t i = i_start; i < inp_size; ++i) {
const int64_t ngram_start = i - ngram_size; const int64_t ngram_start = i - ngram_size;
llama_ngram ngram(&inp[ngram_start], ngram_size); common_ngram ngram(&inp[ngram_start], ngram_size);
const llama_token token = inp[i]; const llama_token token = inp[i];
llama_ngram_cache::iterator part_it = ngram_cache.find(ngram); common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
if (part_it == ngram_cache.end()) { if (part_it == ngram_cache.end()) {
llama_ngram_cache_part part; common_ngram_cache_part part;
part.emplace(token, 1); part.emplace(token, 1);
ngram_cache.emplace(ngram, part); ngram_cache.emplace(ngram, part);
} else { } else {
llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token); common_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
if (token_count_it == part_it->second.end()) { if (token_count_it == part_it->second.end()) {
part_it->second.emplace(token, 1); part_it->second.emplace(token, 1);
} else { } else {
@ -62,12 +62,12 @@ constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2};
constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66}; constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
// Helper function that tries to draft a token from only the static ngram cache: // Helper function that tries to draft a token from only the static ngram cache:
static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) { static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
if (part_static_it == nc_static.end()) { if (part_static_it == nc_static.end()) {
return -1; return -1;
} }
const llama_ngram_cache_part part_static = part_static_it->second; const common_ngram_cache_part part_static = part_static_it->second;
int max_count_static = 0; int max_count_static = 0;
int sum_count_static = 0; int sum_count_static = 0;
@ -95,19 +95,19 @@ static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ng
// Try to draft a token from primary cache (context/dynamic), validate with static cache: // Try to draft a token from primary cache (context/dynamic), validate with static cache:
static llama_token try_draft( static llama_token try_draft(
llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static, common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
const int * min_sample_size, const int * min_percent) { const int * min_sample_size, const int * min_percent) {
llama_token drafted_token = -1; llama_token drafted_token = -1;
for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) { for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
const llama_ngram ngram_primary = ngrams_primary[i]; const common_ngram ngram_primary = ngrams_primary[i];
llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary); common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
if (part_primary_it == nc_primary.end()) { if (part_primary_it == nc_primary.end()) {
continue; continue;
} }
const llama_ngram_cache_part part_primary = part_primary_it->second; const common_ngram_cache_part part_primary = part_primary_it->second;
int max_count_primary = 0; int max_count_primary = 0;
int max_count_static = 0; int max_count_static = 0;
@ -117,7 +117,7 @@ static llama_token try_draft(
for (std::pair<llama_token, int> token_count_primary : part_primary) { for (std::pair<llama_token, int> token_count_primary : part_primary) {
const llama_token token = token_count_primary.first; const llama_token token = token_count_primary.first;
llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token); common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
const int32_t count_primary = token_count_primary.second; const int32_t count_primary = token_count_primary.second;
const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1; const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
@ -142,9 +142,9 @@ static llama_token try_draft(
return drafted_token; return drafted_token;
} }
void llama_ngram_cache_draft( void common_ngram_cache_draft(
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max, std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
) { ) {
GGML_ASSERT(draft.size() == 1); GGML_ASSERT(draft.size() == 1);
const int inp_size = inp.size(); const int inp_size = inp.size();
@ -157,21 +157,21 @@ void llama_ngram_cache_draft(
llama_token drafted_token = -1; llama_token drafted_token = -1;
const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1; const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
llama_ngram ngram_static; common_ngram ngram_static;
for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) { for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j); ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
} }
llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
llama_ngram_cache_part part_static; common_ngram_cache_part part_static;
if (part_static_it != nc_static.end()) { if (part_static_it != nc_static.end()) {
part_static = part_static_it->second; part_static = part_static_it->second;
} }
// cd = context + dynamic // cd = context + dynamic
std::vector<llama_ngram> ngrams_cd; std::vector<common_ngram> ngrams_cd;
for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) { for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1; const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
llama_ngram ngram_cd; common_ngram ngram_cd;
for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) { for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j); ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
} }
@ -196,16 +196,16 @@ void llama_ngram_cache_draft(
} }
} }
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) { void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) {
std::ofstream file_out(filename, std::ios::binary); std::ofstream file_out(filename, std::ios::binary);
for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) { for (std::pair<common_ngram, common_ngram_cache_part> item : ngram_cache) {
const llama_ngram ngram = item.first; const common_ngram ngram = item.first;
llama_ngram_cache_part token_counts = item.second; common_ngram_cache_part token_counts = item.second;
GGML_ASSERT(!token_counts.empty()); GGML_ASSERT(!token_counts.empty());
const int32_t ntokens = token_counts.size(); const int32_t ntokens = token_counts.size();
GGML_ASSERT(ntokens > 0); GGML_ASSERT(ntokens > 0);
file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(llama_ngram)); file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(common_ngram));
file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t)); file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
for (std::pair<llama_token, int32_t> item2 : token_counts) { for (std::pair<llama_token, int32_t> item2 : token_counts) {
const llama_token token = item2.first; const llama_token token = item2.first;
@ -219,14 +219,14 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen
} }
llama_ngram_cache llama_ngram_cache_load(std::string & filename) { common_ngram_cache common_ngram_cache_load(std::string & filename) {
std::ifstream hashmap_file(filename, std::ios::binary); std::ifstream hashmap_file(filename, std::ios::binary);
if (!hashmap_file) { if (!hashmap_file) {
throw std::ifstream::failure("Unable to open file " + filename); throw std::ifstream::failure("Unable to open file " + filename);
} }
llama_ngram_cache ngram_cache; common_ngram_cache ngram_cache;
llama_ngram ngram; common_ngram ngram;
int32_t ntokens; int32_t ntokens;
llama_token token; llama_token token;
int32_t count; int32_t count;
@ -235,11 +235,11 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
char * ntokensc = reinterpret_cast<char*>(&ntokens); char * ntokensc = reinterpret_cast<char*>(&ntokens);
char * tokenc = reinterpret_cast<char*>(&token); char * tokenc = reinterpret_cast<char*>(&token);
char * countc = reinterpret_cast<char*>(&count); char * countc = reinterpret_cast<char*>(&count);
while(hashmap_file.read(ngramc, sizeof(llama_ngram))) { while(hashmap_file.read(ngramc, sizeof(common_ngram))) {
GGML_ASSERT(!hashmap_file.eof()); GGML_ASSERT(!hashmap_file.eof());
GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t))); GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
GGML_ASSERT(ntokens > 0); GGML_ASSERT(ntokens > 0);
llama_ngram_cache_part token_counts; common_ngram_cache_part token_counts;
for (int i = 0; i < ntokens; ++i) { for (int i = 0; i < ntokens; ++i) {
GGML_ASSERT(!hashmap_file.eof()); GGML_ASSERT(!hashmap_file.eof());
@ -257,12 +257,12 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
return ngram_cache; return ngram_cache;
} }
void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) { void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add) {
for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) { for (std::pair<common_ngram, common_ngram_cache_part> ngram_part : ngram_cache_add) {
const llama_ngram ngram = ngram_part.first; const common_ngram ngram = ngram_part.first;
llama_ngram_cache_part part = ngram_part.second; common_ngram_cache_part part = ngram_part.second;
llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram); common_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
if (part_merged_it == ngram_cache_target.end()) { if (part_merged_it == ngram_cache_target.end()) {
ngram_cache_target.emplace(ngram, part); ngram_cache_target.emplace(ngram, part);
continue; continue;
@ -273,7 +273,7 @@ void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram
const int32_t count = token_count.second; const int32_t count = token_count.second;
GGML_ASSERT(count > 0); GGML_ASSERT(count > 0);
llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token); common_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
if (token_count_merged_it == part_merged_it->second.end()) { if (token_count_merged_it == part_merged_it->second.end()) {
part_merged_it->second.emplace(token, count); part_merged_it->second.emplace(token, count);
continue; continue;

View file

@ -12,22 +12,22 @@
// Data structures to map n-grams to empirical token probabilities: // Data structures to map n-grams to empirical token probabilities:
struct llama_ngram { struct common_ngram {
llama_token tokens[LLAMA_NGRAM_MAX]; llama_token tokens[LLAMA_NGRAM_MAX];
llama_ngram() { common_ngram() {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
tokens[i] = -1; tokens[i] = -1;
} }
} }
llama_ngram(const llama_token * input, const int ngram_size) { common_ngram(const llama_token * input, const int ngram_size) {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
tokens[i] = i < ngram_size ? input[i] : -1; tokens[i] = i < ngram_size ? input[i] : -1;
} }
} }
bool operator==(const llama_ngram & other) const { bool operator==(const common_ngram & other) const {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
if (tokens[i] != other.tokens[i]) { if (tokens[i] != other.tokens[i]) {
return false; return false;
@ -37,28 +37,28 @@ struct llama_ngram {
} }
}; };
struct llama_token_hash_function { struct common_token_hash_function {
size_t operator()(const llama_token token) const { size_t operator()(const llama_token token) const {
// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
return token * 11400714819323198485llu; return token * 11400714819323198485llu;
} }
}; };
struct llama_ngram_hash_function { struct common_ngram_hash_function {
size_t operator()(const llama_ngram & ngram) const { size_t operator()(const common_ngram & ngram) const {
size_t hash = llama_token_hash_function{}(ngram.tokens[0]); size_t hash = common_token_hash_function{}(ngram.tokens[0]);
for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) { for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
hash ^= llama_token_hash_function{}(ngram.tokens[i]); hash ^= common_token_hash_function{}(ngram.tokens[i]);
} }
return hash; return hash;
} }
}; };
// token -> number of times token has been seen // token -> number of times token has been seen
typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part; typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;
// n-gram -> empirical distribution of following tokens // n-gram -> empirical distribution of following tokens
typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache; typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;
// Update an ngram cache with tokens. // Update an ngram cache with tokens.
@ -70,8 +70,8 @@ typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash
// //
// In order to get correct results inp_data can ONLY BE APPENDED TO. // In order to get correct results inp_data can ONLY BE APPENDED TO.
// Changes in the middle need a complete rebuild. // Changes in the middle need a complete rebuild.
void llama_ngram_cache_update( void common_ngram_cache_update(
llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress); common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
// Try to draft tokens from ngram caches. // Try to draft tokens from ngram caches.
// inp: the tokens generated so far. // inp: the tokens generated so far.
@ -81,21 +81,21 @@ void llama_ngram_cache_update(
// nc_context: ngram cache based on current context. // nc_context: ngram cache based on current context.
// nc_dynamic: ngram cache based on previous user generations. // nc_dynamic: ngram cache based on previous user generations.
// nc_static: ngram cache generated from a large text corpus, used for validation. // nc_static: ngram cache generated from a large text corpus, used for validation.
void llama_ngram_cache_draft( void common_ngram_cache_draft(
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max, std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static); common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);
// Save an ngram cache to a file. // Save an ngram cache to a file.
// ngram_cache: the ngram cache to save. // ngram_cache: the ngram cache to save.
// filename: the path under which to save the ngram cache. // filename: the path under which to save the ngram cache.
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename); void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename);
// Load an ngram cache saved with llama_ngram_cache_save. // Load an ngram cache saved with common_ngram_cache_save.
// filename: the path from which to load the ngram cache. // filename: the path from which to load the ngram cache.
// returns: an ngram cache containing the information saved to filename. // returns: an ngram cache containing the information saved to filename.
llama_ngram_cache llama_ngram_cache_load(std::string & filename); common_ngram_cache common_ngram_cache_load(std::string & filename);
// Merge two ngram caches. // Merge two ngram caches.
// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add. // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
// ngram_cache_add: the ngram cache to add to ngram_cache_target. // ngram_cache_add: the ngram cache to add to ngram_cache_target.
void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add); void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add);
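
A rough usage sketch of this API, following the lookup-decoding flow the comments describe; the `"ngram-cache.h"` include name and `LLAMA_NGRAM_MIN` are assumptions (only `LLAMA_NGRAM_MAX` and `LLAMA_NGRAM_STATIC` appear in this diff), and model/tokenization setup is omitted:

```cpp
// Sketch only: drafting tokens with the renamed ngram-cache API declared above.
#include "ngram-cache.h"

#include <vector>

static std::vector<llama_token> draft_from_context(std::vector<llama_token> & inp /* tokens so far */) {
    // inp is assumed to hold at least LLAMA_NGRAM_STATIC tokens
    common_ngram_cache nc_context;               // rebuilt from the current context
    common_ngram_cache nc_dynamic;               // could be filled via common_ngram_cache_load()
    common_ngram_cache nc_static;                // large-corpus cache used for validation

    // index the whole context; on the first call nnew is the full size
    common_ngram_cache_update(nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, (int) inp.size(), /*print_progress=*/false);

    // the draft starts with the last sampled token (size must be 1) and is extended in place
    std::vector<llama_token> draft = { inp.back() };
    common_ngram_cache_draft(inp, draft, /*n_draft=*/8, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, nc_context, nc_dynamic, nc_static);

    return draft;
}
```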

View file

@ -98,8 +98,8 @@ struct ring_buffer {
std::vector<T> data; std::vector<T> data;
}; };
struct gpt_sampler { struct common_sampler {
gpt_sampler_params params; common_sampler_params params;
struct llama_sampler * grmr; struct llama_sampler * grmr;
struct llama_sampler * chain; struct llama_sampler * chain;
@ -125,7 +125,7 @@ struct gpt_sampler {
} }
}; };
std::string gpt_sampler_params::print() const { std::string common_sampler_params::print() const {
char result[1024]; char result[1024];
snprintf(result, sizeof(result), snprintf(result, sizeof(result),
@ -139,12 +139,12 @@ std::string gpt_sampler_params::print() const {
return std::string(result); return std::string(result);
} }
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) { struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
lparams.no_perf = params.no_perf; lparams.no_perf = params.no_perf;
auto * result = new gpt_sampler { auto * result = new common_sampler {
/* .params = */ params, /* .params = */ params,
/* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"), /* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
/* .chain = */ llama_sampler_chain_init(lparams), /* .chain = */ llama_sampler_chain_init(lparams),
@ -175,22 +175,22 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
if (params.mirostat == 0) { if (params.mirostat == 0) {
for (const auto & cnstr : params.samplers) { for (const auto & cnstr : params.samplers) {
switch (cnstr) { switch (cnstr) {
case GPT_SAMPLER_TYPE_TOP_K: case COMMON_SAMPLER_TYPE_TOP_K:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
break; break;
case GPT_SAMPLER_TYPE_TOP_P: case COMMON_SAMPLER_TYPE_TOP_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
break; break;
case GPT_SAMPLER_TYPE_MIN_P: case COMMON_SAMPLER_TYPE_MIN_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
break; break;
case GPT_SAMPLER_TYPE_TFS_Z: case COMMON_SAMPLER_TYPE_TFS_Z:
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
break; break;
case GPT_SAMPLER_TYPE_TYPICAL_P: case COMMON_SAMPLER_TYPE_TYPICAL_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
break; break;
case GPT_SAMPLER_TYPE_TEMPERATURE: case COMMON_SAMPLER_TYPE_TEMPERATURE:
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break; break;
default: default:
@ -224,7 +224,7 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
return result; return result;
} }
void gpt_sampler_free(struct gpt_sampler * gsmpl) { void common_sampler_free(struct common_sampler * gsmpl) {
if (gsmpl) { if (gsmpl) {
llama_sampler_free(gsmpl->grmr); llama_sampler_free(gsmpl->grmr);
@ -234,7 +234,7 @@ void gpt_sampler_free(struct gpt_sampler * gsmpl) {
} }
} }
void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) { void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
if (accept_grammar) { if (accept_grammar) {
llama_sampler_accept(gsmpl->grmr, token); llama_sampler_accept(gsmpl->grmr, token);
} }
@ -244,14 +244,14 @@ void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool acce
gsmpl->prev.push_back(token); gsmpl->prev.push_back(token);
} }
void gpt_sampler_reset(struct gpt_sampler * gsmpl) { void common_sampler_reset(struct common_sampler * gsmpl) {
llama_sampler_reset(gsmpl->grmr); llama_sampler_reset(gsmpl->grmr);
llama_sampler_reset(gsmpl->chain); llama_sampler_reset(gsmpl->chain);
} }
struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) { struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
return new gpt_sampler { return new common_sampler {
/* .params = */ gsmpl->params, /* .params = */ gsmpl->params,
/* .grmr = */ llama_sampler_clone(gsmpl->grmr), /* .grmr = */ llama_sampler_clone(gsmpl->grmr),
/* .chain = */ llama_sampler_clone(gsmpl->chain), /* .chain = */ llama_sampler_clone(gsmpl->chain),
@ -261,7 +261,7 @@ struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
}; };
} }
void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) { void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
// TODO: measure grammar performance // TODO: measure grammar performance
if (gsmpl) { if (gsmpl) {
@ -272,7 +272,7 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
} }
} }
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) { llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
gsmpl->set_logits(ctx, idx); gsmpl->set_logits(ctx, idx);
auto & grmr = gsmpl->grmr; auto & grmr = gsmpl->grmr;
@ -318,21 +318,21 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
return cur_p.data[cur_p.selected].id; return cur_p.data[cur_p.selected].id;
} }
uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) { uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
return llama_sampler_get_seed(gsmpl->chain); return llama_sampler_get_seed(gsmpl->chain);
} }
// helpers // helpers
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) { llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
return &gsmpl->cur_p; return &gsmpl->cur_p;
} }
llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) { llama_token common_sampler_last(const struct common_sampler * gsmpl) {
return gsmpl->prev.rat(0); return gsmpl->prev.rat(0);
} }
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) { std::string common_sampler_print(const struct common_sampler * gsmpl) {
std::string result = "logits "; std::string result = "logits ";
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) { for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
@ -343,7 +343,7 @@ std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
return result; return result;
} }
std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) { std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
n = std::min(n, (int) gsmpl->prev.size()); n = std::min(n, (int) gsmpl->prev.size());
if (n <= 0) { if (n <= 0) {
@ -358,63 +358,63 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen"); GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
result += llama_token_to_piece(ctx_main, id); result += common_token_to_piece(ctx_main, id);
} }
return result; return result;
} }
char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) { char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
switch (cnstr) { switch (cnstr) {
case GPT_SAMPLER_TYPE_TOP_K: return 'k'; case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
case GPT_SAMPLER_TYPE_TFS_Z: return 'f'; case COMMON_SAMPLER_TYPE_TFS_Z: return 'f';
case GPT_SAMPLER_TYPE_TYPICAL_P: return 'y'; case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
case GPT_SAMPLER_TYPE_TOP_P: return 'p'; case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
case GPT_SAMPLER_TYPE_MIN_P: return 'm'; case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
case GPT_SAMPLER_TYPE_TEMPERATURE: return 't'; case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
default : return '?'; default : return '?';
} }
} }
std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) { std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
switch (cnstr) { switch (cnstr) {
case GPT_SAMPLER_TYPE_TOP_K: return "top_k"; case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
case GPT_SAMPLER_TYPE_TFS_Z: return "tfs_z"; case COMMON_SAMPLER_TYPE_TFS_Z: return "tfs_z";
case GPT_SAMPLER_TYPE_TYPICAL_P: return "typ_p"; case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
case GPT_SAMPLER_TYPE_TOP_P: return "top_p"; case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
case GPT_SAMPLER_TYPE_MIN_P: return "min_p"; case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature"; case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
default : return ""; default : return "";
} }
} }
std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) { std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map { std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
{ "top_k", GPT_SAMPLER_TYPE_TOP_K }, { "top_k", COMMON_SAMPLER_TYPE_TOP_K },
{ "top_p", GPT_SAMPLER_TYPE_TOP_P }, { "top_p", COMMON_SAMPLER_TYPE_TOP_P },
{ "typ_p", GPT_SAMPLER_TYPE_TYPICAL_P }, { "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "min_p", GPT_SAMPLER_TYPE_MIN_P }, { "min_p", COMMON_SAMPLER_TYPE_MIN_P },
{ "tfs_z", GPT_SAMPLER_TYPE_TFS_Z }, { "tfs_z", COMMON_SAMPLER_TYPE_TFS_Z },
{ "temperature", GPT_SAMPLER_TYPE_TEMPERATURE }, { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
}; };
// since sampler names are written multiple ways // since sampler names are written multiple ways
// make it ready for both system names and input names // make it ready for both system names and input names
std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map { std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
{ "top-k", GPT_SAMPLER_TYPE_TOP_K }, { "top-k", COMMON_SAMPLER_TYPE_TOP_K },
{ "top-p", GPT_SAMPLER_TYPE_TOP_P }, { "top-p", COMMON_SAMPLER_TYPE_TOP_P },
{ "nucleus", GPT_SAMPLER_TYPE_TOP_P }, { "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
{ "typical-p", GPT_SAMPLER_TYPE_TYPICAL_P }, { "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "typical", GPT_SAMPLER_TYPE_TYPICAL_P }, { "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "typ-p", GPT_SAMPLER_TYPE_TYPICAL_P }, { "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "typ", GPT_SAMPLER_TYPE_TYPICAL_P }, { "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "min-p", GPT_SAMPLER_TYPE_MIN_P }, { "min-p", COMMON_SAMPLER_TYPE_MIN_P },
{ "tfs-z", GPT_SAMPLER_TYPE_TFS_Z }, { "tfs-z", COMMON_SAMPLER_TYPE_TFS_Z },
{ "tfs", GPT_SAMPLER_TYPE_TFS_Z }, { "tfs", COMMON_SAMPLER_TYPE_TFS_Z },
{ "temp", GPT_SAMPLER_TYPE_TEMPERATURE }, { "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
}; };
std::vector<gpt_sampler_type> samplers; std::vector<common_sampler_type> samplers;
samplers.reserve(names.size()); samplers.reserve(names.size());
for (const auto & name : names) { for (const auto & name : names) {
@ -434,17 +434,17 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
return samplers; return samplers;
} }
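
A small hedged check of the name handling above, together with `common_sampler_types_from_chars` defined just below; it assumes the common `sampling.h` header declares these helpers and that both functions preserve input order:

```cpp
// Sketch only: canonical names, alternative names and single-char codes should resolve to the same types.
#include "sampling.h"

#include <cassert>

int main() {
    const auto canonical = common_sampler_types_from_names({ "top_k", "typ_p", "temperature" }, /*allow_alt_names=*/false);
    const auto alt       = common_sampler_types_from_names({ "top-k", "typical", "temp" },      /*allow_alt_names=*/true);
    assert(canonical == alt);                                     // TOP_K, TYPICAL_P, TEMPERATURE

    const auto by_chars = common_sampler_types_from_chars("kyt"); // codes from common_sampler_type_to_chr()
    assert(by_chars == canonical);
    return 0;
}
```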
std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) { std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
std::unordered_map<char, gpt_sampler_type> sampler_name_map = { std::unordered_map<char, common_sampler_type> sampler_name_map = {
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TFS_Z), COMMON_SAMPLER_TYPE_TFS_Z },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P), GPT_SAMPLER_TYPE_TOP_P }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P), GPT_SAMPLER_TYPE_MIN_P }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE } { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }
}; };
std::vector<gpt_sampler_type> samplers; std::vector<common_sampler_type> samplers;
samplers.reserve(chars.size()); samplers.reserve(chars.size());
for (const auto & c : chars) { for (const auto & c : chars) {

View file

@ -7,7 +7,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
// gpt_sampler extends llama_sampler with additional functionality: // common_sampler extends llama_sampler with additional functionality:
// //
// - grammar support // - grammar support
// - custom sampler logic based on the parameters // - custom sampler logic based on the parameters
@ -23,30 +23,30 @@
// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the // token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
// grammar constraints are applied to the full vocabulary and the token is resampled. // grammar constraints are applied to the full vocabulary and the token is resampled.
// //
// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can // The common_sampler also maintains a container with the last accepted tokens. In the future, this can
// be moved into the core llama library. // be moved into the core llama library.
// //
// For convenience, the gpt_sampler also maintains a container with the current candidate tokens. // For convenience, the common_sampler also maintains a container with the current candidate tokens.
// This can be used to access the probabilities of the rest of the non-sampled tokens. // This can be used to access the probabilities of the rest of the non-sampled tokens.
// //
// TODO: measure grammar performance // TODO: measure grammar performance
// //
struct gpt_sampler; struct common_sampler;
// llama_sampler API overloads // llama_sampler API overloads
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params); struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
void gpt_sampler_free(struct gpt_sampler * gsmpl); void common_sampler_free(struct common_sampler * gsmpl);
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar // if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar); void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
void gpt_sampler_reset (struct gpt_sampler * gsmpl); void common_sampler_reset (struct common_sampler * gsmpl);
struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl); struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
// arguments can be nullptr to skip printing // arguments can be nullptr to skip printing
void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl); void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
// extended sampling implementation: // extended sampling implementation:
// //
@ -58,26 +58,26 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
// if grammar_first is true, the grammar is applied before the samplers (slower) // if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar // useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
// //
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl); uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
// helpers // helpers
// access the internal list of current candidate tokens // access the internal list of current candidate tokens
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl); llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
// get the last accepted token // get the last accepted token
llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl); llama_token common_sampler_last(const struct common_sampler * gsmpl);
// print the sampler chain into a string // print the sampler chain into a string
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl); std::string common_sampler_print(const struct common_sampler * gsmpl);
// get a string representation of the last accepted tokens // get a string representation of the last accepted tokens
std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n); std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr); char common_sampler_type_to_chr(enum common_sampler_type cnstr);
std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr); std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names); std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars); std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
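
A minimal generation-loop sketch against the declarations above; model/context creation, batching and the stop condition are assumed to exist elsewhere, default-constructed `common_sampler_params` and `idx = -1` (most recent logits) are assumptions about the caller's setup:

```cpp
// Sketch only: the renamed common_sampler lifecycle around a generation loop.
// Assumes "sampling.h" is this header and that model/ctx were created and a prompt already decoded.
#include "sampling.h"

static void sample_n_tokens(llama_model * model, llama_context * ctx, int n_tokens) {
    common_sampler_params sparams;                                // default parameters; sparams.grammar may hold a GBNF grammar
    struct common_sampler * smpl = common_sampler_init(model, sparams);

    for (int i = 0; i < n_tokens; ++i) {
        // sample from the most recent logits; the grammar is applied after the chain unless grammar_first is set
        const llama_token id = common_sampler_sample(smpl, ctx, /*idx=*/-1);
        common_sampler_accept(smpl, id, /*accept_grammar=*/true); // update token history and grammar state

        // ... append `id` to the batch, call llama_decode(), break on end-of-generation ...
    }

    common_perf_print(ctx, smpl);
    common_sampler_free(smpl);
}
```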

View file

@ -19,8 +19,11 @@ Additionally, there are the following images, similar to the above:
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now). The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library), you'll need to build the images locally for now.
## Usage ## Usage
@ -84,3 +87,37 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
``` ```
## Docker With MUSA
Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/native) properly installed on Linux, `muBLAS` should be accessible inside the container.
## Building Docker locally
```bash
docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile .
docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile .
docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile .
```
You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture.
The defaults are:
- `MUSA_VERSION` set to `rc3.1.0`
The resulting images are essentially the same as the non-MUSA images:
1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and quantize them to 4-bit.
2. `local/llama.cpp:light-musa`: This image only includes the main executable file.
3. `local/llama.cpp:server-musa`: This image only includes the server executable file.
## Usage
After building locally, usage is similar to the non-MUSA examples, but you'll need to set `mthreads` as the default Docker runtime. This can be done by executing `(cd /usr/bin/musa && sudo ./docker setup $PWD)` and verifying the changes by executing `docker info | grep mthreads` on the host machine. You will also want to use the `--n-gpu-layers` flag.
```bash
docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
```

View file

@ -15,13 +15,13 @@ static void print_usage(int, char ** argv) {
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; common_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
return 1; return 1;
} }
gpt_init(); common_init();
int is_pp_shared = params.is_pp_shared; int is_pp_shared = params.is_pp_shared;
@ -36,7 +36,7 @@ int main(int argc, char ** argv) {
// initialize the model // initialize the model
llama_model_params model_params = llama_model_params_from_gpt_params(params); llama_model_params model_params = common_model_params_to_llama(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@ -45,7 +45,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
llama_context_params ctx_params = llama_context_params_from_gpt_params(params); llama_context_params ctx_params = common_context_params_to_llama(params);
// ensure enough sequences are available // ensure enough sequences are available
ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end()); ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
// warm up // warm up
{ {
for (int i = 0; i < 16; ++i) { for (int i = 0; i < 16; ++i) {
llama_batch_add(batch, 0, i, { 0 }, false); common_batch_add(batch, 0, i, { 0 }, false);
} }
if (!decode_helper(ctx, batch, ctx_params.n_batch)) { if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
@ -122,11 +122,11 @@ int main(int argc, char ** argv) {
continue; continue;
} }
llama_batch_clear(batch); common_batch_clear(batch);
for (int i = 0; i < pp; ++i) { for (int i = 0; i < pp; ++i) {
for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) { for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
llama_batch_add(batch, 0, i, { j }, false); common_batch_add(batch, 0, i, { j }, false);
} }
} }
batch.logits[batch.n_tokens - 1] = true; batch.logits[batch.n_tokens - 1] = true;
@ -151,10 +151,10 @@ int main(int argc, char ** argv) {
const auto t_tg_start = ggml_time_us(); const auto t_tg_start = ggml_time_us();
for (int i = 0; i < tg; ++i) { for (int i = 0; i < tg; ++i) {
llama_batch_clear(batch); common_batch_clear(batch);
for (int j = 0; j < pl; ++j) { for (int j = 0; j < pl; ++j) {
llama_batch_add(batch, 0, pp + i, { j }, true); common_batch_add(batch, 0, pp + i, { j }, true);
} }
if (!decode_helper(ctx, batch, ctx_params.n_batch)) { if (!decode_helper(ctx, batch, ctx_params.n_batch)) {

View file

@ -15,16 +15,16 @@ static void print_usage(int, char ** argv) {
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; common_params params;
params.prompt = "Hello my name is"; params.prompt = "Hello my name is";
params.n_predict = 32; params.n_predict = 32;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
return 1; return 1;
} }
gpt_init(); common_init();
// number of parallel batches // number of parallel batches
int n_parallel = params.n_parallel; int n_parallel = params.n_parallel;
@ -39,7 +39,7 @@ int main(int argc, char ** argv) {
// initialize the model // initialize the model
llama_model_params model_params = llama_model_params_from_gpt_params(params); llama_model_params model_params = common_model_params_to_llama(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@ -51,13 +51,13 @@ int main(int argc, char ** argv) {
// tokenize the prompt // tokenize the prompt
std::vector<llama_token> tokens_list; std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(model, params.prompt, true); tokens_list = common_tokenize(model, params.prompt, true);
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel; const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
// initialize the context // initialize the context
llama_context_params ctx_params = llama_context_params_from_gpt_params(params); llama_context_params ctx_params = common_context_params_to_llama(params);
ctx_params.n_ctx = n_kv_req; ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_predict, n_parallel); ctx_params.n_batch = std::max(n_predict, n_parallel);
@ -94,7 +94,7 @@ int main(int argc, char ** argv) {
LOG("\n"); LOG("\n");
for (auto id : tokens_list) { for (auto id : tokens_list) {
LOG("%s", llama_token_to_piece(ctx, id).c_str()); LOG("%s", common_token_to_piece(ctx, id).c_str());
} }
// create a llama_batch // create a llama_batch
@ -108,7 +108,7 @@ int main(int argc, char ** argv) {
// evaluate the initial prompt // evaluate the initial prompt
for (size_t i = 0; i < tokens_list.size(); ++i) { for (size_t i = 0; i < tokens_list.size(); ++i) {
llama_batch_add(batch, tokens_list[i], i, seq_ids, false); common_batch_add(batch, tokens_list[i], i, seq_ids, false);
} }
GGML_ASSERT(batch.n_tokens == (int) tokens_list.size()); GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
@ -123,8 +123,8 @@ int main(int argc, char ** argv) {
decoder_start_token_id = llama_token_bos(model); decoder_start_token_id = llama_token_bos(model);
} }
llama_batch_clear(batch); common_batch_clear(batch);
llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false); common_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
} }
// llama_decode will output logits only for the last token of the prompt // llama_decode will output logits only for the last token of the prompt
@ -161,7 +161,7 @@ int main(int argc, char ** argv) {
while (n_cur <= n_predict) { while (n_cur <= n_predict) {
// prepare the next batch // prepare the next batch
llama_batch_clear(batch); common_batch_clear(batch);
// sample the next token for each parallel sequence / stream // sample the next token for each parallel sequence / stream
for (int32_t i = 0; i < n_parallel; ++i) { for (int32_t i = 0; i < n_parallel; ++i) {
@ -185,15 +185,15 @@ int main(int argc, char ** argv) {
// if there is only one stream, we print immediately to stdout // if there is only one stream, we print immediately to stdout
if (n_parallel == 1) { if (n_parallel == 1) {
LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
} }
streams[i] += llama_token_to_piece(ctx, new_token_id); streams[i] += common_token_to_piece(ctx, new_token_id);
i_batch[i] = batch.n_tokens; i_batch[i] = batch.n_tokens;
// push this new token for next evaluation // push this new token for next evaluation
llama_batch_add(batch, new_token_id, n_cur, { i }, true); common_batch_add(batch, new_token_id, n_cur, { i }, true);
n_decode += 1; n_decode += 1;
} }

View file

@ -872,7 +872,7 @@ static std::string basename(const std::string &path) {
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_init(); common_init();
struct train_params params = get_default_train_params(); struct train_params params = get_default_train_params();
if (!params_parse(argc, argv, &params)) { if (!params_parse(argc, argv, &params)) {

View file

@ -31,7 +31,7 @@ template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
std::string ret; std::string ret;
for (; begin != end; ++begin) { for (; begin != end; ++begin) {
ret += llama_token_to_piece(ctx, *begin); ret += common_token_to_piece(ctx, *begin);
} }
return ret; return ret;
@ -272,8 +272,8 @@ struct tokenized_prompt {
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true); tokens_pos = common_tokenize(ctx, pos, add_bos, true);
tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true); tokens_neg = common_tokenize(ctx, neg, add_bos, true);
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
padding_seq(ctx, tokens_pos, max_seq_len); padding_seq(ctx, tokens_pos, max_seq_len);
padding_seq(ctx, tokens_neg, max_seq_len); padding_seq(ctx, tokens_neg, max_seq_len);
@ -281,7 +281,7 @@ struct tokenized_prompt {
void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) { void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
// TODO: customize padding token // TODO: customize padding token
std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false); std::vector<llama_token> pad_tokens = common_tokenize(ctx, " ", false);
llama_token pad_tok = pad_tokens.back(); llama_token pad_tok = pad_tokens.back();
while (tokens.size() < len) { while (tokens.size() < len) {
tokens.push_back(pad_tok); tokens.push_back(pad_tok);
@ -370,7 +370,7 @@ static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const
* Load prompt files and completion file. * Load prompt files and completion file.
* Then format each pair of prompt + completion to make an entry. * Then format each pair of prompt + completion to make an entry.
*/ */
static int prepare_entries(gpt_params & params, train_context & ctx_train) { static int prepare_entries(common_params & params, train_context & ctx_train) {
// load prompts // load prompts
std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true); std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true); std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
@ -388,9 +388,9 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; common_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
return 1; return 1;
} }
@@ -413,7 +413,7 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa); llama_numa_init(params.numa);
// load the model to get hparams // load the model to get hparams
llama_init_result llama_init = llama_init_from_gpt_params(params); common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model; llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;

View file

@@ -28,7 +28,7 @@ static std::vector<std::string> split_lines(const std::string & s, const std::st
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) { static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
size_t n_tokens = tokens.size(); size_t n_tokens = tokens.size();
for (size_t i = 0; i < n_tokens; i++) { for (size_t i = 0; i < n_tokens; i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, true); common_batch_add(batch, tokens[i], i, { seq_id }, true);
} }
} }
@@ -74,18 +74,18 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
} }
float * out = output + embd_pos * n_embd; float * out = output + embd_pos * n_embd;
llama_embd_normalize(embd, out, n_embd, embd_norm); common_embd_normalize(embd, out, n_embd, embd_norm);
} }
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; common_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
return 1; return 1;
} }
gpt_init(); common_init();
params.embedding = true; params.embedding = true;
// For non-causal models, batch size must be equal to ubatch size // For non-causal models, batch size must be equal to ubatch size
@@ -95,7 +95,7 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa); llama_numa_init(params.numa);
// load the model // load the model
llama_init_result llama_init = llama_init_from_gpt_params(params); common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model; llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;
@@ -122,7 +122,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
LOG_INF("\n"); LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); LOG_INF("%s\n", common_params_get_system_info(params).c_str());
} }
// split the prompt into lines // split the prompt into lines
@@ -135,7 +135,7 @@ int main(int argc, char ** argv) {
// tokenize the prompts and trim // tokenize the prompts and trim
std::vector<std::vector<int32_t>> inputs; std::vector<std::vector<int32_t>> inputs;
for (const auto & prompt : prompts) { for (const auto & prompt : prompts) {
auto inp = ::llama_tokenize(ctx, prompt, true, true); auto inp = common_tokenize(ctx, prompt, true, true);
if (inp.size() > n_batch) { if (inp.size() > n_batch) {
LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch); __func__, (long long int) inp.size(), (long long int) n_batch);
@@ -159,7 +159,7 @@ int main(int argc, char ** argv) {
LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
for (int j = 0; j < (int) inputs[i].size(); j++) { for (int j = 0; j < (int) inputs[i].size(); j++) {
LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); LOG("%6d -> '%s'\n", inputs[i][j], common_token_to_piece(ctx, inputs[i][j]).c_str());
} }
LOG("\n\n"); LOG("\n\n");
} }
@@ -199,7 +199,7 @@ int main(int argc, char ** argv) {
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s; e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
s = 0; s = 0;
llama_batch_clear(batch); common_batch_clear(batch);
} }
// add to batch // add to batch
@@ -263,7 +263,7 @@ int main(int argc, char ** argv) {
LOG("\n"); LOG("\n");
for (int i = 0; i < n_prompts; i++) { for (int i = 0; i < n_prompts; i++) {
for (int j = 0; j < n_prompts; j++) { for (int j = 0; j < n_prompts; j++) {
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
LOG("%6.2f ", sim); LOG("%6.2f ", sim);
} }
LOG("%1.10s", prompts[i].c_str()); LOG("%1.10s", prompts[i].c_str());
@@ -296,7 +296,7 @@ int main(int argc, char ** argv) {
for (int i = 0;;) { // at least two iteration (n_embd_count > 1) for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
LOG(" ["); LOG(" [");
for (int j = 0;;) { // at least two iteration (n_embd_count > 1) for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
LOG("%6.2f", sim); LOG("%6.2f", sim);
j++; j++;
if (j < n_embd_count) LOG(", "); else break; if (j < n_embd_count) LOG(", "); else break;

View file

@@ -126,10 +126,10 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
return true; return true;
} }
static bool run(llama_context * ctx, const gpt_params & params) { static bool run(llama_context * ctx, const common_params & params) {
const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos); std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
LOG_ERR("%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
@@ -142,13 +142,13 @@ static bool run(llama_context * ctx, const gpt_params & params) {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
callback_data cb_data; callback_data cb_data;
gpt_params params; common_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1; return 1;
} }
gpt_init(); common_init();
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
@@ -160,7 +160,7 @@ int main(int argc, char ** argv) {
params.warmup = false; params.warmup = false;
// init // init
llama_init_result llama_init = llama_init_from_gpt_params(params); common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model; llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;
@@ -172,7 +172,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
LOG_INF("\n"); LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); LOG_INF("%s\n", common_params_get_system_info(params).c_str());
LOG_INF("\n"); LOG_INF("\n");
} }

View file

@@ -128,7 +128,7 @@ struct lora_merge_ctx {
lora_merge_ctx( lora_merge_ctx(
std::string & base_fname, std::string & base_fname,
std::vector<llama_lora_adapter_info> & lora_files, std::vector<common_lora_adapter_info> & lora_files,
std::string & outfile, std::string & outfile,
int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) { int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
fout.exceptions(std::ofstream::failbit); // fail fast on write errors fout.exceptions(std::ofstream::failbit); // fail fast on write errors
@@ -400,9 +400,9 @@ static void print_usage(int, char ** argv) {
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; common_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
return 1; return 1;
} }

View file

@@ -11,7 +11,7 @@ static void write_table_header(std::ofstream & file) {
file << "| -------- | ----------- |\n"; file << "| -------- | ----------- |\n";
} }
static void write_table_entry(std::ofstream & file, const llama_arg & opt) { static void write_table_entry(std::ofstream & file, const common_arg & opt) {
file << "| `"; file << "| `";
// args // args
for (const auto & arg : opt.args) { for (const auto & arg : opt.args) {
@@ -40,7 +40,7 @@ static void write_table_entry(std::ofstream & file, const llama_arg & opt) {
file << "` | " << md_help << " |\n"; file << "` | " << md_help << " |\n";
} }
static void write_table(std::ofstream & file, std::vector<llama_arg *> & opts) { static void write_table(std::ofstream & file, std::vector<common_arg *> & opts) {
write_table_header(file); write_table_header(file);
for (const auto & opt : opts) { for (const auto & opt : opts) {
write_table_entry(file, *opt); write_table_entry(file, *opt);
@@ -50,12 +50,12 @@ static void write_table(std::ofstream & file, std::vector<llama_arg *> & opts) {
static void export_md(std::string fname, llama_example ex) { static void export_md(std::string fname, llama_example ex) {
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc); std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
gpt_params params; common_params params;
auto ctx_arg = gpt_params_parser_init(params, ex); auto ctx_arg = common_params_parser_init(params, ex);
std::vector<llama_arg *> common_options; std::vector<common_arg *> common_options;
std::vector<llama_arg *> sparam_options; std::vector<common_arg *> sparam_options;
std::vector<llama_arg *> specific_options; std::vector<common_arg *> specific_options;
for (auto & opt : ctx_arg.options) { for (auto & opt : ctx_arg.options) {
// in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
if (opt.is_sparam) { if (opt.is_sparam) {

View file

@@ -15,11 +15,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
for (uint64_t i = 0; i < sentences.size(); i++) { for (uint64_t i = 0; i < sentences.size(); i++) {
llama_batch_clear(batch); common_batch_clear(batch);
const std::string input_string = instruction + sentences[i]; const std::string input_string = instruction + sentences[i];
std::vector<llama_token> inputs = llama_tokenize(model, input_string, true, false); std::vector<llama_token> inputs = common_tokenize(model, input_string, true, false);
const int32_t n_toks = inputs.size(); const int32_t n_toks = inputs.size();
@@ -28,7 +28,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
// inputs.push_back(llama_token_eos(model)); // inputs.push_back(llama_token_eos(model));
// we want to ignore instruction tokens for mean pooling // we want to ignore instruction tokens for mean pooling
const int32_t n_inst = llama_tokenize(model, instruction, true, false).size(); const int32_t n_inst = common_tokenize(model, instruction, true, false).size();
#ifdef GRIT_DEBUG #ifdef GRIT_DEBUG
// debug tokens - should be matching as referenced in the GritLM sample // debug tokens - should be matching as referenced in the GritLM sample
@@ -40,7 +40,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
// add input to batch (this increments n_tokens) // add input to batch (this increments n_tokens)
for (int32_t j = 0; j < n_toks; j++) { for (int32_t j = 0; j < n_toks; j++) {
llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst); common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
} }
// clear previous kv_cache values (irrelevant for embeddings) // clear previous kv_cache values (irrelevant for embeddings)
@@ -75,7 +75,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
} }
std::vector<float> emb_norm(emb_unorm.size()); std::vector<float> emb_norm(emb_unorm.size());
llama_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd); common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
result.push_back(emb_norm); result.push_back(emb_norm);
#ifdef GRIT_DEBUG #ifdef GRIT_DEBUG
@@ -105,16 +105,16 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
std::vector<llama_token> inputs = llama_tokenize(model, prompt, false, true); std::vector<llama_token> inputs = common_tokenize(model, prompt, false, true);
int32_t i_current_token = 0; int32_t i_current_token = 0;
while (true) { while (true) {
llama_batch_clear(bat); common_batch_clear(bat);
{ {
const int32_t n_inputs = inputs.size(); const int32_t n_inputs = inputs.size();
for (int32_t i = 0; i < n_inputs; i++) { for (int32_t i = 0; i < n_inputs; i++) {
llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1); common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
} }
} }
inputs.clear(); inputs.clear();
@@ -127,7 +127,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
break; break;
} }
std::string piece = llama_token_to_piece(ctx, token); std::string piece = common_token_to_piece(ctx, token);
if (stream) { if (stream) {
std::printf("%s", piece.c_str()); std::printf("%s", piece.c_str());
std::fflush(stdout); std::fflush(stdout);
@@ -152,16 +152,16 @@ static std::string gritlm_instruction(const std::string & instruction) {
} }
int main(int argc, char * argv[]) { int main(int argc, char * argv[]) {
gpt_params params; common_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1; return 1;
} }
gpt_init(); common_init();
llama_model_params mparams = llama_model_params_from_gpt_params(params); llama_model_params mparams = common_model_params_to_llama(params);
llama_context_params cparams = llama_context_params_from_gpt_params(params); llama_context_params cparams = common_context_params_to_llama(params);
llama_backend_init(); llama_backend_init();
@@ -199,10 +199,10 @@ int main(int argc, char * argv[]) {
const int n_embd = llama_n_embd(model); const int n_embd = llama_n_embd(model);
const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd); const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd); const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd); const float cosine_sim_q1_d0 = common_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd); const float cosine_sim_q1_d1 = common_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0); std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1); std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);

View file

@@ -37,13 +37,13 @@ struct Stats {
class IMatrixCollector { class IMatrixCollector {
public: public:
IMatrixCollector() = default; IMatrixCollector() = default;
void set_params(gpt_params params) { m_params = std::move(params); } void set_params(common_params params) { m_params = std::move(params); }
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
void save_imatrix(int ncall = -1) const; void save_imatrix(int ncall = -1) const;
bool load_imatrix(const char * file_name); bool load_imatrix(const char * file_name);
private: private:
std::unordered_map<std::string, Stats> m_stats; std::unordered_map<std::string, Stats> m_stats;
gpt_params m_params; common_params m_params;
std::mutex m_mutex; std::mutex m_mutex;
int m_last_call = 0; int m_last_call = 0;
std::vector<float> m_src1_data; std::vector<float> m_src1_data;
@@ -428,7 +428,7 @@ static void process_logits(
} }
} }
static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { static bool compute_imatrix(llama_context * ctx, const common_params & params) {
const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
@@ -436,7 +436,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
auto tim1 = std::chrono::high_resolution_clock::now(); auto tim1 = std::chrono::high_resolution_clock::now();
LOG_INF("%s: tokenizing the input ..\n", __func__); LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true); std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
auto tim2 = std::chrono::high_resolution_clock::now(); auto tim2 = std::chrono::high_resolution_clock::now();
LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count()); LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -568,17 +568,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; common_params params;
params.n_ctx = 512; params.n_ctx = 512;
params.logits_all = true; params.logits_all = true;
params.escape = false; params.escape = false;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
return 1; return 1;
} }
gpt_init(); common_init();
params.n_batch = std::min(params.n_batch, params.n_ctx); params.n_batch = std::min(params.n_batch, params.n_ctx);
@@ -607,7 +607,7 @@ int main(int argc, char ** argv) {
params.warmup = false; params.warmup = false;
// init // init
llama_init_result llama_init = llama_init_from_gpt_params(params); common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model; llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;
@@ -625,7 +625,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
LOG_INF("\n"); LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); LOG_INF("%s\n", common_params_get_system_info(params).c_str());
} }
if (!compute_imatrix(ctx, params)) { if (!compute_imatrix(ctx, params)) {

View file

@@ -35,8 +35,8 @@
static llama_context ** g_ctx; static llama_context ** g_ctx;
static llama_model ** g_model; static llama_model ** g_model;
static gpt_sampler ** g_smpl; static common_sampler ** g_smpl;
static gpt_params * g_params; static common_params * g_params;
static std::vector<llama_token> * g_input_tokens; static std::vector<llama_token> * g_input_tokens;
static std::ostringstream * g_output_ss; static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens; static std::vector<llama_token> * g_output_tokens;
@@ -44,7 +44,7 @@ static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false; static bool is_interacting = false;
static void write_logfile( static void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model, const llama_context * ctx, const common_params & params, const llama_model * model,
const std::vector<llama_token> & input_tokens, const std::string & output, const std::vector<llama_token> & input_tokens, const std::string & output,
const std::vector<llama_token> & output_tokens const std::vector<llama_token> & output_tokens
) { ) {
@@ -95,12 +95,12 @@ static void sigint_handler(int signo) {
} else { } else {
console::cleanup(); console::cleanup();
LOG("\n"); LOG("\n");
gpt_perf_print(*g_ctx, *g_smpl); common_perf_print(*g_ctx, *g_smpl);
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
// make sure all logs are flushed // make sure all logs are flushed
LOG("Interrupted by user\n"); LOG("Interrupted by user\n");
gpt_log_pause(gpt_log_main()); common_log_pause(common_log_main());
_exit(130); _exit(130);
} }
@@ -109,14 +109,14 @@ static void sigint_handler(int signo) {
#endif #endif
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; common_params params;
g_params = &params; g_params = &params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
return 1; return 1;
} }
gpt_init(); common_init();
auto & sparams = params.sparams; auto & sparams = params.sparams;
@@ -166,7 +166,7 @@ int main(int argc, char ** argv) {
llama_model * model = nullptr; llama_model * model = nullptr;
llama_context * ctx = nullptr; llama_context * ctx = nullptr;
gpt_sampler * smpl = nullptr; common_sampler * smpl = nullptr;
g_model = &model; g_model = &model;
g_ctx = &ctx; g_ctx = &ctx;
@@ -174,7 +174,7 @@ int main(int argc, char ** argv) {
// load the model and apply lora adapter, if any // load the model and apply lora adapter, if any
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
llama_init_result llama_init = llama_init_from_gpt_params(params); common_init_result llama_init = common_init_from_params(params);
model = llama_init.model; model = llama_init.model;
ctx = llama_init.context; ctx = llama_init.context;
@@ -195,15 +195,15 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
LOG_INF("\n"); LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); LOG_INF("%s\n", common_params_get_system_info(params).c_str());
} }
const bool add_bos = llama_add_bos_token(model); const bool add_bos = llama_add_bos_token(model);
GGML_ASSERT(!llama_add_eos_token(model)); GGML_ASSERT(!llama_add_eos_token(model));
std::vector<llama_token> embd_inp; std::vector<llama_token> embd_inp;
std::vector<llama_token> embd_end; std::vector<llama_token> embd_end;
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
GGML_ASSERT(llama_token_prefix(model) >= 0); GGML_ASSERT(llama_token_prefix(model) >= 0);
GGML_ASSERT(llama_token_suffix(model) >= 0); GGML_ASSERT(llama_token_suffix(model) >= 0);
@@ -257,13 +257,13 @@ int main(int argc, char ** argv) {
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) { for (int i = 0; i < (int) embd_inp.size(); i++) {
LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
} }
if (params.n_keep > 0) { if (params.n_keep > 0) {
LOG_INF("%s: static prompt based on n_keep: '", __func__); LOG_INF("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) { for (int i = 0; i < params.n_keep; i++) {
LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
} }
LOG_CNT("'\n"); LOG_CNT("'\n");
} }
@@ -298,11 +298,11 @@ int main(int argc, char ** argv) {
LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str()); LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
} }
} }
smpl = gpt_sampler_init(model, sparams); smpl = common_sampler_init(model, sparams);
LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl)); LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str()); LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
@@ -411,9 +411,9 @@ int main(int argc, char ** argv) {
embd.clear(); embd.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) { if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
const llama_token id = gpt_sampler_sample(smpl, ctx, -1); const llama_token id = common_sampler_sample(smpl, ctx, -1);
gpt_sampler_accept(smpl, id, true); common_sampler_accept(smpl, id, true);
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
@@ -434,7 +434,7 @@ int main(int argc, char ** argv) {
// push the prompt in the sampling context in order to apply repetition penalties later // push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules // for the prompt, we don't apply grammar rules
gpt_sampler_accept(smpl, embd_inp[n_consumed], false); common_sampler_accept(smpl, embd_inp[n_consumed], false);
++n_consumed; ++n_consumed;
if ((int) embd.size() >= params.n_batch) { if ((int) embd.size() >= params.n_batch) {
@@ -446,7 +446,7 @@ int main(int argc, char ** argv) {
// display text // display text
if (input_echo) { if (input_echo) {
for (auto id : embd) { for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id); const std::string token_str = common_token_to_piece(ctx, id);
LOG("%s", token_str.c_str()); LOG("%s", token_str.c_str());
if (embd.size() > 1) { if (embd.size() > 1) {
@@ -465,10 +465,10 @@ int main(int argc, char ** argv) {
// if not currently processing queued inputs; // if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) { if ((int) embd_inp.size() <= n_consumed) {
// deal with eot token in infill mode // deal with eot token in infill mode
if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){ if ((common_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
if (is_interacting && !params.interactive_first) { if (is_interacting && !params.interactive_first) {
// print an eot token // print an eot token
LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
} }
LOG("\n"); LOG("\n");
console::set_display(console::user_input); console::set_display(console::user_input);
@@ -505,8 +505,8 @@ int main(int argc, char ** argv) {
} }
// tokenize new prefix and suffix // tokenize new prefix and suffix
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
@@ -529,7 +529,7 @@ int main(int argc, char ** argv) {
is_interacting = false; is_interacting = false;
} }
// deal with end of generation tokens in interactive mode // deal with end of generation tokens in interactive mode
else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) { else if (llama_token_is_eog(model, common_sampler_last(smpl))) {
LOG_DBG("found EOS token\n"); LOG_DBG("found EOS token\n");
if (params.interactive) { if (params.interactive) {
@@ -579,7 +579,7 @@ int main(int argc, char ** argv) {
const size_t original_size = embd_inp.size(); const size_t original_size = embd_inp.size();
const auto line_inp = ::llama_tokenize(ctx, buffer, false); const auto line_inp = common_tokenize(ctx, buffer, false);
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
@@ -587,7 +587,7 @@ int main(int argc, char ** argv) {
for (size_t i = original_size; i < embd_inp.size(); ++i) { for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i]; const llama_token token = embd_inp[i];
output_tokens.push_back(token); output_tokens.push_back(token);
output_ss << llama_token_to_piece(ctx, token); output_ss << common_token_to_piece(ctx, token);
} }
n_remain -= line_inp.size(); n_remain -= line_inp.size();
@@ -601,7 +601,7 @@ int main(int argc, char ** argv) {
if (n_past > 0) { if (n_past > 0) {
if (is_interacting) { if (is_interacting) {
gpt_sampler_reset(smpl); common_sampler_reset(smpl);
} }
is_interacting = false; is_interacting = false;
} }
@@ -620,17 +620,17 @@ int main(int argc, char ** argv) {
} }
} }
if (!params.interactive && n_remain <= 0) { if (!params.interactive && n_remain <= 0) {
LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
} }
LOG("\n"); LOG("\n");
gpt_perf_print(ctx, smpl); common_perf_print(ctx, smpl);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
gpt_sampler_free(smpl); common_sampler_free(smpl);
llama_backend_free(); llama_backend_free();
return 0; return 0;

View file

@@ -304,9 +304,9 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str()); printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
#ifdef GGML_USE_RPC if (llama_supports_rpc()) {
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
#endif }
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
@@ -497,14 +497,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
} }
auto p = string_split<int>(argv[i], split_delim); auto p = string_split<int>(argv[i], split_delim);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
#ifdef GGML_USE_RPC } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
} else if (arg == "-rpc" || arg == "--rpc") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
params.rpc_servers.push_back(argv[i]); params.rpc_servers.push_back(argv[i]);
#endif
} else if (arg == "-sm" || arg == "--split-mode") { } else if (arg == "-sm" || arg == "--split-mode") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;

View file

@@ -18,6 +18,7 @@ android {
} }
externalNativeBuild { externalNativeBuild {
cmake { cmake {
arguments += "-DLLAMA_BUILD_COMMON=ON"
arguments += "-DCMAKE_BUILD_TYPE=Release" arguments += "-DCMAKE_BUILD_TYPE=Release"
cppFlags += listOf() cppFlags += listOf()
arguments += listOf() arguments += listOf()

View file

@@ -186,11 +186,11 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
for (nri = 0; nri < nr; nri++) { for (nri = 0; nri < nr; nri++) {
LOGi("Benchmark prompt processing (pp)"); LOGi("Benchmark prompt processing (pp)");
llama_batch_clear(*batch); common_batch_clear(*batch);
const int n_tokens = pp; const int n_tokens = pp;
for (i = 0; i < n_tokens; i++) { for (i = 0; i < n_tokens; i++) {
llama_batch_add(*batch, 0, i, { 0 }, false); common_batch_add(*batch, 0, i, { 0 }, false);
} }
batch->logits[batch->n_tokens - 1] = true; batch->logits[batch->n_tokens - 1] = true;
@@ -210,9 +210,9 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
const auto t_tg_start = ggml_time_us(); const auto t_tg_start = ggml_time_us();
for (i = 0; i < tg; i++) { for (i = 0; i < tg; i++) {
llama_batch_clear(*batch); common_batch_clear(*batch);
for (j = 0; j < pl; j++) { for (j = 0; j < pl; j++) {
llama_batch_add(*batch, 0, i, { j }, true); common_batch_add(*batch, 0, i, { j }, true);
} }
LOGi("llama_decode() text generation: %d", i); LOGi("llama_decode() text generation: %d", i);
@@ -357,7 +357,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
const auto context = reinterpret_cast<llama_context *>(context_pointer); const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer); const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
const auto tokens_list = llama_tokenize(context, text, 1); const auto tokens_list = common_tokenize(context, text, 1);
auto n_ctx = llama_n_ctx(context); auto n_ctx = llama_n_ctx(context);
auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
@@ -369,14 +369,14 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
} }
for (auto id : tokens_list) { for (auto id : tokens_list) {
LOGi("%s", llama_token_to_piece(context, id).c_str()); LOGi("%s", common_token_to_piece(context, id).c_str());
} }
llama_batch_clear(*batch); common_batch_clear(*batch);
// evaluate the initial prompt // evaluate the initial prompt
for (auto i = 0; i < tokens_list.size(); i++) { for (auto i = 0; i < tokens_list.size(); i++) {
llama_batch_add(*batch, tokens_list[i], i, { 0 }, false); common_batch_add(*batch, tokens_list[i], i, { 0 }, false);
} }
// llama_decode will output logits only for the last token of the prompt // llama_decode will output logits only for the last token of the prompt
@@ -419,7 +419,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
return nullptr; return nullptr;
} }
auto new_token_chars = llama_token_to_piece(context, new_token_id); auto new_token_chars = common_token_to_piece(context, new_token_id);
cached_token_chars += new_token_chars; cached_token_chars += new_token_chars;
jstring new_token = nullptr; jstring new_token = nullptr;
@@ -431,8 +431,8 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
new_token = env->NewStringUTF(""); new_token = env->NewStringUTF("");
} }
llama_batch_clear(*batch); common_batch_clear(*batch);
llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true); common_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
env->CallVoidMethod(intvar_ncur, la_int_var_inc); env->CallVoidMethod(intvar_ncur, la_int_var_inc);

View file

@@ -1,135 +0,0 @@
" Requires an already running llama.cpp server
" To install either copy or symlink to ~/.vim/autoload/llama.vim
" Then start with either :call llama#doLlamaGen(),
" or add a keybind to your vimrc such as
" nnoremap Z :call llama#doLlamaGen()<CR>
" Similarly, you could add an insert mode keybind with
" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
"
" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc
" let g:llama_api_url = "192.168.1.10:8080"
" llama_overrides can also be set through buffer/window scopes. For instance
" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
" Could be added to your .vimrc to automatically set a lower temperature when
" editing a python script
" Additionally, an override dict can be stored at the top of a file
" !*{"stop": ["User:"]}
" Could be added to the start of your chatlog.txt to set the stopping token
" These parameter dicts are merged together from lowest to highest priority:
" server default -> g:llama_overrides -> w:llama_overrides ->
" b:llama_overrides -> in file (!*) overrides
"
" Sublists (like logit_bias and stop) are overridden, not merged
" Example override:
" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647}
if !exists("g:llama_api_url")
let g:llama_api_url= "127.0.0.1:8080"
endif
if !exists("g:llama_overrides")
let g:llama_overrides = {}
endif
const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true }
const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"]
let s:linedict = {}
func s:callbackHandler(bufn, channel, msg)
if len(a:msg) < 3
return
elseif a:msg[0] == "d"
let l:msg = a:msg[6:-1]
else
let l:msg = a:msg
endif
let l:decoded_msg = json_decode(l:msg)
let l:newtext = split(l:decoded_msg['content'], "\n", 1)
if len(l:newtext) > 0
call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0])
else
echo "nothing genned"
endif
if len(newtext) > 1
let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1])
let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1
endif
if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
echo "Finished generation"
endif
endfunction
func llama#doLlamaGen()
if exists("b:job")
if job_status(b:job) == "run"
call job_stop(b:job)
return
endif
endif
let l:cbuffer = bufnr("%")
let s:linedict[l:cbuffer] = line('$')
let l:buflines = getbufline(l:cbuffer, 1, 1000)
let l:querydata = copy(s:querydata)
call extend(l:querydata, g:llama_overrides)
if exists("w:llama_overrides")
call extend(l:querydata, w:llama_overrides)
endif
if exists("b:llama_overrides")
call extend(l:querydata, b:llama_overrides)
endif
if l:buflines[0][0:1] == '!*'
let l:userdata = json_decode(l:buflines[0][2:-1])
call extend(l:querydata, l:userdata)
let l:buflines = l:buflines[1:-1]
endif
let l:querydata.prompt = join(l:buflines, "\n")
let l:curlcommand = copy(s:curlcommand)
if exists("g:llama_api_key")
call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
endif
let l:curlcommand[2] = json_encode(l:querydata)
let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
endfunction
" Echos the tokkenization of the provided string , or cursor to end of word
" Onus is placed on the user to include the preceding space
func llama#tokenizeWord(...)
if (a:0 > 0)
let l:input = a:1
else
exe "normal \"*ye"
let l:input = @*
endif
let l:querydata = {"content": l:input}
let l:curlcommand = copy(s:curlcommand)
let l:curlcommand[2] = json_encode(l:querydata)
let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
endfunction
func s:tokenizeWordCallback(plaintext, channel, msg)
echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens)
endfunction
" Echos the token count of the entire buffer (or provided string)
" Example usage :echo llama#tokenCount()
func llama#tokenCount(...)
if (a:0 > 0)
let l:buflines = a:1
else
let l:buflines = getline(1,1000)
if l:buflines[0][0:1] == '!*'
let l:buflines = l:buflines[1:-1]
endif
let l:buflines = join(l:buflines, "\n")
endif
let l:querydata = {"content": l:buflines}
let l:curlcommand = copy(s:curlcommand)
let l:curlcommand[2] = json_encode(l:querydata)
let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
endfunction
func s:tokenCountCallback(channel, msg)
let resp = json_decode(a:msg)
echo len(resp.tokens)
endfunction

View file

@@ -37,21 +37,21 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str; std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
eval_tokens(ctx_llama, embd_inp, n_batch, n_past); eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
return true; return true;
} }
static const char * sample(struct gpt_sampler * smpl, static const char * sample(struct common_sampler * smpl,
struct llama_context * ctx_llama, struct llama_context * ctx_llama,
int * n_past) { int * n_past) {
const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1); const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
gpt_sampler_accept(smpl, id, true); common_sampler_accept(smpl, id, true);
static std::string ret; static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
ret = "</s>"; ret = "</s>";
} else { } else {
ret = llama_token_to_piece(ctx_llama, id); ret = common_token_to_piece(ctx_llama, id);
} }
eval_id(ctx_llama, id, n_past); eval_id(ctx_llama, id, n_past);
return ret.c_str(); return ret.c_str();
@@ -120,7 +120,7 @@ static void print_usage(int, char ** argv) {
LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
} }
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) { static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {
// load and preprocess the image // load and preprocess the image
llava_image_embed * embed = NULL; llava_image_embed * embed = NULL;
@@ -146,7 +146,7 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
return embed; return embed;
} }
static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, gpt_params * params, const std::string & prompt) { static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
int n_past = 0; int n_past = 0;
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict; const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
@@ -159,16 +159,16 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
user_prompt = prompt.substr(image_pos + std::string("<image>").length()); user_prompt = prompt.substr(image_pos + std::string("<image>").length());
LOG_INF("system_prompt: %s\n", system_prompt.c_str()); LOG_INF("system_prompt: %s\n", system_prompt.c_str());
if (params->verbose_prompt) { if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
} }
} }
LOG_INF("user_prompt: %s\n", user_prompt.c_str()); LOG_INF("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) { if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
} }
} }
} else { } else {
@@ -176,9 +176,9 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:"; system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
user_prompt = prompt + "\nASSISTANT:"; user_prompt = prompt + "\nASSISTANT:";
if (params->verbose_prompt) { if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
} }
} }
} }
@@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
LOG("\n"); LOG("\n");
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams); struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
if (!smpl) { if (!smpl) {
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
exit(1); exit(1);
@@ -211,15 +211,15 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
fflush(stdout); fflush(stdout);
} }
gpt_sampler_free(smpl); common_sampler_free(smpl);
LOG("\n"); LOG("\n");
} }
static struct llama_model * llava_init(gpt_params * params) { static struct llama_model * llava_init(common_params * params) {
llama_backend_init(); llama_backend_init();
llama_numa_init(params->numa); llama_numa_init(params->numa);
llama_model_params model_params = llama_model_params_from_gpt_params(*params); llama_model_params model_params = common_model_params_to_llama(*params);
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) { if (model == NULL) {
@@ -229,7 +229,7 @@ static struct llama_model * llava_init(gpt_params * params) {
return model; return model;
} }
static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) { static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
const char * clip_path = params->mmproj.c_str(); const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt; auto prompt = params->prompt;
@@ -240,7 +240,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); llama_context_params ctx_params = common_context_params_to_llama(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
@@ -272,13 +272,13 @@ static void llava_free(struct llava_context * ctx_llava) {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
ggml_time_init(); ggml_time_init();
gpt_params params; common_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
return 1; return 1;
} }
gpt_init(); common_init();
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv); print_usage(argc, argv);

View file

@@ -25,11 +25,11 @@ static void show_additional_info(int /*argc*/, char ** argv) {
LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n"); LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
} }
static struct llama_model * llava_init(gpt_params * params) { static struct llama_model * llava_init(common_params * params) {
llama_backend_init(); llama_backend_init();
llama_numa_init(params->numa); llama_numa_init(params->numa);
llama_model_params model_params = llama_model_params_from_gpt_params(*params); llama_model_params model_params = common_model_params_to_llama(*params);
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) { if (model == NULL) {
@@ -39,13 +39,13 @@ static struct llama_model * llava_init(gpt_params * params) {
return model; return model;
} }
static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) { static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
auto prompt = params->prompt; auto prompt = params->prompt;
if (prompt.empty()) { if (prompt.empty()) {
prompt = "describe the image in detail."; prompt = "describe the image in detail.";
} }
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); llama_context_params ctx_params = common_context_params_to_llama(*params);
if (params->n_ctx < 2048) { if (params->n_ctx < 2048) {
// warn user here, "Image processing requires at least 2048 context, setting context to 2048" // warn user here, "Image processing requires at least 2048 context, setting context to 2048"
LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
@@ -79,7 +79,7 @@ static void llava_free(struct llava_context * ctx_llava) {
llama_backend_free(); llama_backend_free();
} }
static struct clip_ctx * clip_init_context(gpt_params * params) { static struct clip_ctx * clip_init_context(common_params * params) {
const char * clip_path = params->mmproj.c_str(); const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt; auto prompt = params->prompt;
@@ -114,7 +114,7 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str; std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
return eval_tokens(ctx_llama, embd_inp, n_batch, n_past); return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
} }
@@ -129,7 +129,7 @@ static void process_eval_image_embed(struct llava_context * ctx_llava, const str
llava_image_embed_free(slice_embed); llava_image_embed_free(slice_embed);
} }
static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) { static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) {
std::string system_prompt; std::string system_prompt;
int idx = 0; int idx = 0;
int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip); int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
@@ -162,22 +162,22 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
LOG_INF("%s: image token past: %d\n", __func__, n_past); LOG_INF("%s: image token past: %d\n", __func__, n_past);
} }
static const char * sample(struct gpt_sampler * smpl, static const char * sample(struct common_sampler * smpl,
struct llama_context * ctx_llama, struct llama_context * ctx_llama,
int * n_past) { int * n_past) {
const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1); const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
gpt_sampler_accept(smpl, id, true); common_sampler_accept(smpl, id, true);
static std::string ret; static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
ret = "</s>"; ret = "</s>";
} else { } else {
ret = llama_token_to_piece(ctx_llama, id); ret = common_token_to_piece(ctx_llama, id);
} }
eval_id(ctx_llama, id, n_past); eval_id(ctx_llama, id, n_past);
return ret.c_str(); return ret.c_str();
} }
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ static struct llava_context * minicpmv_init(common_params * params, const std::string & fname, int &n_past){
auto * ctx_clip = clip_init_context(params); auto * ctx_clip = clip_init_context(params);
auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str()); auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embeds) { if (!embeds) {
@@ -213,7 +213,7 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::stri
return ctx_llava; return ctx_llava;
} }
static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){ static struct common_sampler * llama_init(struct llava_context * ctx_llava, common_params * params, const std::string & prompt, int & n_past, bool is_first = false){
std::string user_prompt = prompt; std::string user_prompt = prompt;
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip); int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (!is_first) { if (!is_first) {
@ -237,11 +237,11 @@ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_par
LOG_INF("\n"); LOG_INF("\n");
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams); struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
return smpl; return smpl;
} }
static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampler * smpl, int &n_past){ static const char * llama_loop(struct llava_context * ctx_llava,struct common_sampler * smpl, int &n_past){
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past); const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
return tmp; return tmp;
@ -250,13 +250,13 @@ static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampl
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
ggml_time_init(); ggml_time_init();
gpt_params params; common_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
return 1; return 1;
} }
gpt_init(); common_init();
if (params.mmproj.empty() || (params.image.empty())) { if (params.mmproj.empty() || (params.image.empty())) {
show_additional_info(argc, argv); show_additional_info(argc, argv);
@ -290,7 +290,7 @@ int main(int argc, char ** argv) {
fflush(stdout); fflush(stdout);
} }
gpt_sampler_free(smpl); common_sampler_free(smpl);
}else { }else {
while (true) { while (true) {
LOG("<user>"); LOG("<user>");
@ -309,7 +309,7 @@ int main(int argc, char ** argv) {
if (strstr(response.c_str(), "<user>")) break; // minicpm-v if (strstr(response.c_str(), "<user>")) break; // minicpm-v
fflush(stdout); fflush(stdout);
} }
gpt_sampler_free(smpl); common_sampler_free(smpl);
} }
} }
printf("\n"); printf("\n");


@ -37,13 +37,13 @@ struct ngram_container {
}; };
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; common_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1; return 1;
} }
gpt_init(); common_init();
const int W = 15; // lookahead window const int W = 15; // lookahead window
const int N = 5; // n-gram size const int N = 5; // n-gram size
@ -56,7 +56,7 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa); llama_numa_init(params.numa);
// load the target model // load the target model
llama_init_result llama_init = llama_init_from_gpt_params(params); common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model; llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;
@ -65,7 +65,7 @@ int main(int argc, char ** argv) {
std::vector<llama_token> inp; std::vector<llama_token> inp;
std::vector<llama_token> all; std::vector<llama_token> all;
inp = ::llama_tokenize(ctx, params.prompt, true, true); inp = common_tokenize(ctx, params.prompt, true, true);
all = inp; all = inp;
const int max_context_size = llama_n_ctx(ctx); const int max_context_size = llama_n_ctx(ctx);
@ -79,7 +79,7 @@ int main(int argc, char ** argv) {
LOG("\n\n"); LOG("\n\n");
for (auto id : inp) { for (auto id : inp) {
LOG("%s", llama_token_to_piece(ctx, id).c_str()); LOG("%s", common_token_to_piece(ctx, id).c_str());
} }
fflush(stderr); fflush(stderr);
@ -115,7 +115,7 @@ int main(int argc, char ** argv) {
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1); llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
// target model sampling context // target model sampling context
struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams); struct common_sampler * smpl = common_sampler_init(model, params.sparams);
// verification n-grams // verification n-grams
std::vector<ngram_data> ngrams_cur(G); std::vector<ngram_data> ngrams_cur(G);
@ -156,12 +156,12 @@ int main(int argc, char ** argv) {
// sample first token // sample first token
{ {
id = gpt_sampler_sample(smpl, ctx, 0); id = common_sampler_sample(smpl, ctx, 0);
gpt_sampler_accept(smpl, id, true); common_sampler_accept(smpl, id, true);
{ {
const std::string token_str = llama_token_to_piece(ctx, id); const std::string token_str = common_token_to_piece(ctx, id);
LOG("%s", token_str.c_str()); LOG("%s", token_str.c_str());
fflush(stdout); fflush(stdout);
@ -172,7 +172,7 @@ int main(int argc, char ** argv) {
// debug // debug
if (dump_kv_cache) { if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view); llama_kv_cache_view_update(ctx, &kvc_view);
llama_kv_cache_dump_view_seqs(kvc_view, 40); common_kv_cache_dump_view_seqs(kvc_view, 40);
} }
// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/ // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
@ -201,10 +201,10 @@ int main(int argc, char ** argv) {
// V V V V V V // V V V V V V
// id // id
{ {
llama_batch_clear(batch); common_batch_clear(batch);
// current token - first token of the first level // current token - first token of the first level
llama_batch_add(batch, id, n_past, seq_id_all, true); common_batch_add(batch, id, n_past, seq_id_all, true);
// verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation // verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
{ {
@ -229,7 +229,7 @@ int main(int argc, char ** argv) {
ngrams_cur[g].tokens [j + 1] = t; ngrams_cur[g].tokens [j + 1] = t;
ngrams_cur[g].i_batch[j + 1] = batch.n_tokens; ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;
llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true); common_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
} }
} }
} }
@ -241,13 +241,13 @@ int main(int argc, char ** argv) {
seq_id_look[j] = i + j + 1; seq_id_look[j] = i + j + 1;
} }
llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false); common_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
} }
// fill the rest of the levels // fill the rest of the levels
for (int j = 1; j < N - 1; j++) { for (int j = 1; j < N - 1; j++) {
for (int i = 0; i < W; i++) { for (int i = 0; i < W; i++) {
llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2); common_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
} }
} }
} }
@ -281,13 +281,13 @@ int main(int argc, char ** argv) {
} }
// sample the next token // sample the next token
id = gpt_sampler_sample(smpl, ctx, i_batch); id = common_sampler_sample(smpl, ctx, i_batch);
gpt_sampler_accept(smpl, id, true); common_sampler_accept(smpl, id, true);
// print // print
{ {
const std::string token_str = llama_token_to_piece(ctx, id); const std::string token_str = common_token_to_piece(ctx, id);
if (v == 0) { if (v == 0) {
LOG("%s", token_str.c_str()); LOG("%s", token_str.c_str());
@ -327,7 +327,7 @@ int main(int argc, char ** argv) {
// print known n-grams starting with token id (debug) // print known n-grams starting with token id (debug)
if (0 && v == 0) { if (0 && v == 0) {
if (ngrams_observed.cnt[id] > 0) { if (ngrams_observed.cnt[id] > 0) {
LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str()); LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], common_token_to_piece(ctx, id).c_str());
} }
for (int i = 0; i < ngrams_observed.cnt[id]; i++) { for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
@ -336,7 +336,7 @@ int main(int argc, char ** argv) {
const int idx = id*(N - 1)*G + i*(N - 1); const int idx = id*(N - 1)*G + i*(N - 1);
for (int j = 0; j < N - 1; j++) { for (int j = 0; j < N - 1; j++) {
const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]); const std::string token_str = common_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
LOG("%s", token_str.c_str()); LOG("%s", token_str.c_str());
} }
@ -358,7 +358,7 @@ int main(int argc, char ** argv) {
if (v == 0) { if (v == 0) {
// sample from the last level // sample from the last level
for (int i = 0; i < W; i++) { for (int i = 0; i < W; i++) {
tokens_j[N - 2][i] = gpt_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i); tokens_j[N - 2][i] = common_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
} }
} else { } else {
for (int i = 0; i < W; i++) { for (int i = 0; i < W; i++) {
@ -466,9 +466,9 @@ int main(int argc, char ** argv) {
LOG_INF("n_accept = %d\n", n_accept); LOG_INF("n_accept = %d\n", n_accept);
LOG_INF("\n"); LOG_INF("\n");
gpt_perf_print(ctx, smpl); common_perf_print(ctx, smpl);
gpt_sampler_free(smpl); common_sampler_free(smpl);
llama_kv_cache_view_free(&kvc_view); llama_kv_cache_view_free(&kvc_view);
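The lookahead hunks also pick up the renamed batch helpers common_batch_clear and common_batch_add. A hedged sketch of that batching pattern, with the argument order taken from the calls above (token, position, sequence ids, logits flag); the wrapper function and its error handling are illustrative, not part of the diff:

// Sketch: build and decode a single prompt batch with the renamed helpers.
#include "common.h"
#include "llama.h"
#include <cstdio>
#include <vector>

static bool decode_prompt(llama_context * ctx, const std::vector<llama_token> & inp) {
    llama_batch batch = llama_batch_init((int32_t) inp.size(), /* embd */ 0, /* n_seq_max */ 1);

    common_batch_clear(batch);
    for (size_t i = 0; i < inp.size(); ++i) {
        // logits are only needed for the last prompt token
        common_batch_add(batch, inp[i], (llama_pos) i, { 0 }, i + 1 == inp.size());
    }

    const bool ok = llama_decode(ctx, batch) == 0;
    if (!ok) {
        fprintf(stderr, "%s: llama_decode failed\n", __func__);
    }

    llama_batch_free(batch);
    return ok;
}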


@ -12,9 +12,9 @@
#include <vector> #include <vector>
int main(int argc, char ** argv){ int main(int argc, char ** argv){
gpt_params params; common_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1; return 1;
} }
@ -23,7 +23,7 @@ int main(int argc, char ** argv){
llama_numa_init(params.numa); llama_numa_init(params.numa);
// load the model // load the model
llama_init_result llama_init = llama_init_from_gpt_params(params); common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model; llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;
@ -31,15 +31,15 @@ int main(int argc, char ** argv){
// tokenize the prompt // tokenize the prompt
std::vector<llama_token> inp; std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, true, true); inp = common_tokenize(ctx, params.prompt, true, true);
fprintf(stderr, "%s: tokenization done\n", __func__); fprintf(stderr, "%s: tokenization done\n", __func__);
llama_ngram_cache ngram_cache; common_ngram_cache ngram_cache;
llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true); common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str()); fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
llama_ngram_cache_save(ngram_cache, params.lookup_cache_static); common_ngram_cache_save(ngram_cache, params.lookup_cache_static);
return 0; return 0;
} }
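lookup-create tokenizes the prompt, hashes it into a static n-gram cache and writes the cache to disk. A small sketch of that flow under the renamed common_ngram_cache_* API; the function name build_static_cache and the header list are assumptions, the calls and constants come from the hunk above:

// Sketch: build a static n-gram lookup cache from tokenized text and save it.
#include "common.h"
#include "ngram-cache.h"
#include "llama.h"
#include <string>
#include <vector>

static void build_static_cache(llama_context * ctx, const std::string & text, std::string out_path) {
    std::vector<llama_token> inp = common_tokenize(ctx, text, /* add_special */ true, /* parse_special */ true);

    common_ngram_cache cache;
    // hash every position of the input with the static n-gram size
    common_ngram_cache_update(cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), /* print_progress */ true);

    common_ngram_cache_save(cache, out_path);
}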


@ -33,15 +33,15 @@ int main(int argc, char ** argv){
} }
fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str()); fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]); common_ngram_cache ngram_cache_merged = common_ngram_cache_load(args[0]);
for (size_t i = 1; i < args.size()-1; ++i) { for (size_t i = 1; i < args.size()-1; ++i) {
fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str()); fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]); common_ngram_cache ngram_cache = common_ngram_cache_load(args[i]);
llama_ngram_cache_merge(ngram_cache_merged, ngram_cache); common_ngram_cache_merge(ngram_cache_merged, ngram_cache);
} }
fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str()); fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str());
llama_ngram_cache_save(ngram_cache_merged, args.back()); common_ngram_cache_save(ngram_cache_merged, args.back());
} }


@ -13,13 +13,13 @@
#include <vector> #include <vector>
int main(int argc, char ** argv){ int main(int argc, char ** argv){
gpt_params params; common_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1; return 1;
} }
gpt_init(); common_init();
const int n_draft = params.n_draft; const int n_draft = params.n_draft;
@ -28,18 +28,18 @@ int main(int argc, char ** argv){
llama_numa_init(params.numa); llama_numa_init(params.numa);
// load the model // load the model
llama_init_result llama_init = llama_init_from_gpt_params(params); common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model; llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;
// tokenize the prompt // tokenize the prompt
std::vector<llama_token> inp; std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, true, true); inp = common_tokenize(ctx, params.prompt, true, true);
llama_ngram_cache ngram_cache_context; common_ngram_cache ngram_cache_context;
llama_ngram_cache ngram_cache_dynamic; common_ngram_cache ngram_cache_dynamic;
llama_ngram_cache ngram_cache_static; common_ngram_cache ngram_cache_static;
int64_t t_draft_flat_us = 0; int64_t t_draft_flat_us = 0;
int64_t t_draft_us = 0; int64_t t_draft_us = 0;
@ -48,7 +48,7 @@ int main(int argc, char ** argv){
if (!params.lookup_cache_static.empty()) { if (!params.lookup_cache_static.empty()) {
try { try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) { } catch (std::ifstream::failure const &) {
LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1); exit(1);
@ -57,7 +57,7 @@ int main(int argc, char ** argv){
if (!params.lookup_cache_dynamic.empty()) { if (!params.lookup_cache_dynamic.empty()) {
try { try {
ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic); ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
} }
@ -86,7 +86,7 @@ int main(int argc, char ** argv){
{ {
const int64_t t_start_draft_us = ggml_time_us(); const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static); common_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
t_draft_us += ggml_time_us() - t_start_draft_us; t_draft_us += ggml_time_us() - t_start_draft_us;
} }
@ -105,7 +105,7 @@ int main(int argc, char ** argv){
{ {
const int64_t t_start_draft_us = ggml_time_us(); const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false); common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us; t_draft_us += ggml_time_us() - t_start_draft_us;
} }
} }
@ -115,7 +115,7 @@ int main(int argc, char ** argv){
pseudo_output.push_back(inp_slice[pseudo_output.size()]); pseudo_output.push_back(inp_slice[pseudo_output.size()]);
{ {
const int64_t t_start_draft_us = ggml_time_us(); const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false); common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us; t_draft_us += ggml_time_us() - t_start_draft_us;
} }
} }
@ -133,7 +133,7 @@ int main(int argc, char ** argv){
} }
// After each chunk, update the dynamic ngram cache with the context ngram cache: // After each chunk, update the dynamic ngram cache with the context ngram cache:
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
ngram_cache_context.clear(); ngram_cache_context.clear();
} }


@ -13,13 +13,13 @@
#include <vector> #include <vector>
int main(int argc, char ** argv){ int main(int argc, char ** argv){
gpt_params params; common_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1; return 1;
} }
gpt_init(); common_init();
// max. number of additional tokens to draft if match is found // max. number of additional tokens to draft if match is found
const int n_draft = params.n_draft; const int n_draft = params.n_draft;
@ -31,29 +31,29 @@ int main(int argc, char ** argv){
llama_numa_init(params.numa); llama_numa_init(params.numa);
// load the model // load the model
llama_init_result llama_init = llama_init_from_gpt_params(params); common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model; llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;
// tokenize the prompt // tokenize the prompt
std::vector<llama_token> inp; std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, true, true); inp = common_tokenize(ctx, params.prompt, true, true);
llama_ngram_cache ngram_cache_context; common_ngram_cache ngram_cache_context;
llama_ngram_cache ngram_cache_dynamic; common_ngram_cache ngram_cache_dynamic;
llama_ngram_cache ngram_cache_static; common_ngram_cache ngram_cache_static;
int64_t t_draft_flat_us = 0; int64_t t_draft_flat_us = 0;
int64_t t_draft_us = 0; int64_t t_draft_us = 0;
{ {
// Fill up context ngram cache with tokens from user input: // Fill up context ngram cache with tokens from user input:
const int64_t t_start_draft_us = ggml_time_us(); const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false); common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
if (!params.lookup_cache_static.empty()) { if (!params.lookup_cache_static.empty()) {
try { try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) { } catch (std::ifstream::failure const &) {
LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1); exit(1);
@ -62,7 +62,7 @@ int main(int argc, char ** argv){
if (!params.lookup_cache_dynamic.empty()) { if (!params.lookup_cache_dynamic.empty()) {
try { try {
ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic); ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
} }
@ -80,7 +80,7 @@ int main(int argc, char ** argv){
LOG("\n\n"); LOG("\n\n");
for (auto id : inp) { for (auto id : inp) {
LOG("%s", llama_token_to_piece(ctx, id).c_str()); LOG("%s", common_token_to_piece(ctx, id).c_str());
} }
fflush(stderr); fflush(stderr);
@ -102,7 +102,7 @@ int main(int argc, char ** argv){
bool has_eos = false; bool has_eos = false;
struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams); struct common_sampler * smpl = common_sampler_init(model, params.sparams);
std::vector<llama_token> draft; std::vector<llama_token> draft;
@ -117,7 +117,7 @@ int main(int argc, char ** argv){
// debug // debug
if (dump_kv_cache) { if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view); llama_kv_cache_view_update(ctx, &kvc_view);
llama_kv_cache_dump_view_seqs(kvc_view, 40); common_kv_cache_dump_view_seqs(kvc_view, 40);
} }
// print current draft sequence // print current draft sequence
@ -126,11 +126,11 @@ int main(int argc, char ** argv){
int i_dft = 0; int i_dft = 0;
while (true) { while (true) {
// sample from the target model // sample from the target model
llama_token id = gpt_sampler_sample(smpl, ctx, i_dft); llama_token id = common_sampler_sample(smpl, ctx, i_dft);
gpt_sampler_accept(smpl, id, true); common_sampler_accept(smpl, id, true);
const std::string token_str = llama_token_to_piece(ctx, id); const std::string token_str = common_token_to_piece(ctx, id);
if (!params.use_color) { if (!params.use_color) {
LOG("%s", token_str.c_str()); LOG("%s", token_str.c_str());
@ -152,7 +152,7 @@ int main(int argc, char ** argv){
{ {
// Update context ngram cache with the newly accepted token: // Update context ngram cache with the newly accepted token:
const int64_t t_start_draft_us = ggml_time_us(); const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false); common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us; t_draft_us += ggml_time_us() - t_start_draft_us;
} }
@ -178,7 +178,7 @@ int main(int argc, char ** argv){
{ {
// Update context ngram cache with the newly accepted token: // Update context ngram cache with the newly accepted token:
const int64_t t_start_draft_us = ggml_time_us(); const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false); common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us; t_draft_us += ggml_time_us() - t_start_draft_us;
} }
break; break;
@ -192,18 +192,18 @@ int main(int argc, char ** argv){
// clean the cache of draft tokens that weren't accepted // clean the cache of draft tokens that weren't accepted
llama_kv_cache_seq_rm(ctx, 0, n_past, -1); llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
llama_batch_clear(batch_tgt); common_batch_clear(batch_tgt);
llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
// Draft already contains a single token sampled from the model: // Draft already contains a single token sampled from the model:
GGML_ASSERT(draft.size() == 1); GGML_ASSERT(draft.size() == 1);
GGML_ASSERT(draft[0] == inp.back()); GGML_ASSERT(draft[0] == inp.back());
const int64_t t_start_draft_us = ggml_time_us(); const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static); common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
for (size_t i = 1; i < draft.size(); ++i) { for (size_t i = 1; i < draft.size(); ++i) {
llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true); common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
} }
t_draft_us += ggml_time_us() - t_start_draft_us; t_draft_us += ggml_time_us() - t_start_draft_us;
@ -218,8 +218,8 @@ int main(int argc, char ** argv){
auto t_dec_end = ggml_time_us(); auto t_dec_end = ggml_time_us();
// Update dynamic ngram cache with context ngram cache and save it to disk: // Update dynamic ngram cache with context ngram cache and save it to disk:
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic); common_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
LOG("\n\n"); LOG("\n\n");
@ -237,9 +237,9 @@ int main(int argc, char ** argv){
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_INF("\ntarget:\n\n"); LOG_INF("\ntarget:\n\n");
gpt_perf_print(ctx, smpl); common_perf_print(ctx, smpl);
gpt_sampler_free(smpl); common_sampler_free(smpl);
llama_batch_free(batch_tgt); llama_batch_free(batch_tgt);
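lookup.cpp itself keeps three caches (context, dynamic, static) and extends a one-token draft from them before validating it against the target model. A hedged sketch of just the drafting step, following the common_ngram_cache_draft call above; the surrounding function and variable names are mine, and the real example additionally asserts that the draft's first token matches the last accepted input token:

// Sketch: propose speculative draft tokens from the n-gram caches.
#include "common.h"
#include "ngram-cache.h"
#include "llama.h"
#include <vector>

static std::vector<llama_token> propose_draft(
        std::vector<llama_token> & inp,           // tokens accepted so far (ends with last_token)
        llama_token                last_token,    // token just sampled from the target model
        int                        n_draft,
        common_ngram_cache       & cache_context,
        common_ngram_cache       & cache_dynamic,
        common_ngram_cache       & cache_static) {
    // the draft always starts with the token that was just sampled
    std::vector<llama_token> draft = { last_token };

    common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
                             cache_context, cache_dynamic, cache_static);

    return draft; // draft[0] echoes the sampled token, the rest are speculative
}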


@ -33,8 +33,8 @@
static llama_context ** g_ctx; static llama_context ** g_ctx;
static llama_model ** g_model; static llama_model ** g_model;
static gpt_sampler ** g_smpl; static common_sampler ** g_smpl;
static gpt_params * g_params; static common_params * g_params;
static std::vector<llama_token> * g_input_tokens; static std::vector<llama_token> * g_input_tokens;
static std::ostringstream * g_output_ss; static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens; static std::vector<llama_token> * g_output_tokens;
@ -63,7 +63,7 @@ static bool file_is_empty(const std::string & path) {
} }
static void write_logfile( static void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model, const llama_context * ctx, const common_params & params, const llama_model * model,
const std::vector<llama_token> & input_tokens, const std::string & output, const std::vector<llama_token> & input_tokens, const std::string & output,
const std::vector<llama_token> & output_tokens const std::vector<llama_token> & output_tokens
) { ) {
@ -114,12 +114,12 @@ static void sigint_handler(int signo) {
} else { } else {
console::cleanup(); console::cleanup();
LOG("\n"); LOG("\n");
gpt_perf_print(*g_ctx, *g_smpl); common_perf_print(*g_ctx, *g_smpl);
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
// make sure all logs are flushed // make sure all logs are flushed
LOG("Interrupted by user\n"); LOG("Interrupted by user\n");
gpt_log_pause(gpt_log_main()); common_log_pause(common_log_main());
_exit(130); _exit(130);
} }
@ -127,22 +127,22 @@ static void sigint_handler(int signo) {
} }
#endif #endif
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, const std::string & role, const std::string & content) { static std::string chat_add_and_format(struct llama_model * model, std::vector<common_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
llama_chat_msg new_msg{role, content}; common_chat_msg new_msg{role, content};
auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user"); auto formatted = common_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
chat_msgs.push_back({role, content}); chat_msgs.push_back({role, content});
LOG_DBG("formatted: '%s'\n", formatted.c_str()); LOG_DBG("formatted: '%s'\n", formatted.c_str());
return formatted; return formatted;
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; common_params params;
g_params = &params; g_params = &params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
return 1; return 1;
} }
gpt_init(); common_init();
auto & sparams = params.sparams; auto & sparams = params.sparams;
@ -187,9 +187,9 @@ int main(int argc, char ** argv) {
llama_model * model = nullptr; llama_model * model = nullptr;
llama_context * ctx = nullptr; llama_context * ctx = nullptr;
gpt_sampler * smpl = nullptr; common_sampler * smpl = nullptr;
std::vector<llama_chat_msg> chat_msgs; std::vector<common_chat_msg> chat_msgs;
g_model = &model; g_model = &model;
g_ctx = &ctx; g_ctx = &ctx;
@ -197,7 +197,7 @@ int main(int argc, char ** argv) {
// load the model and apply lora adapter, if any // load the model and apply lora adapter, if any
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
llama_init_result llama_init = llama_init_from_gpt_params(params); common_init_result llama_init = common_init_from_params(params);
model = llama_init.model; model = llama_init.model;
ctx = llama_init.context; ctx = llama_init.context;
@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
// print chat template example in conversation mode // print chat template example in conversation mode
if (params.conversation) { if (params.conversation) {
if (params.enable_chat_template) { if (params.enable_chat_template) {
LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
} else { } else {
LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
} }
@ -255,7 +255,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
LOG_INF("\n"); LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); LOG_INF("%s\n", common_params_get_system_info(params).c_str());
LOG_INF("\n"); LOG_INF("\n");
} }
@ -296,7 +296,7 @@ int main(int argc, char ** argv) {
: params.prompt; : params.prompt;
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
LOG_DBG("tokenize the prompt\n"); LOG_DBG("tokenize the prompt\n");
embd_inp = ::llama_tokenize(ctx, prompt, true, true); embd_inp = common_tokenize(ctx, prompt, true, true);
} else { } else {
LOG_DBG("use session tokens\n"); LOG_DBG("use session tokens\n");
embd_inp = session_tokens; embd_inp = session_tokens;
@ -379,13 +379,13 @@ int main(int argc, char ** argv) {
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) { for (int i = 0; i < (int) embd_inp.size(); i++) {
LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
} }
if (params.n_keep > add_bos) { if (params.n_keep > add_bos) {
LOG_INF("%s: static prompt based on n_keep: '", __func__); LOG_INF("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) { for (int i = 0; i < params.n_keep; i++) {
LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
} }
LOG_CNT("'\n"); LOG_CNT("'\n");
} }
@ -415,9 +415,9 @@ int main(int argc, char ** argv) {
for (const auto & antiprompt : params.antiprompt) { for (const auto & antiprompt : params.antiprompt) {
LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str()); LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str());
if (params.verbose_prompt) { if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); auto tmp = common_tokenize(ctx, antiprompt, false, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
} }
} }
} }
@ -430,9 +430,9 @@ int main(int argc, char ** argv) {
if (!params.input_prefix.empty()) { if (!params.input_prefix.empty()) {
LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str()); LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
if (params.verbose_prompt) { if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); auto tmp = common_tokenize(ctx, params.input_prefix, true, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
} }
} }
} }
@ -440,23 +440,23 @@ int main(int argc, char ** argv) {
if (!params.input_suffix.empty()) { if (!params.input_suffix.empty()) {
LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str()); LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
if (params.verbose_prompt) { if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); auto tmp = common_tokenize(ctx, params.input_suffix, false, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
} }
} }
} }
} }
smpl = gpt_sampler_init(model, sparams); smpl = common_sampler_init(model, sparams);
if (!smpl) { if (!smpl) {
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
return 1; return 1;
} }
LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl)); LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str()); LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
@ -521,7 +521,7 @@ int main(int argc, char ** argv) {
antiprompt_ids.reserve(params.antiprompt.size()); antiprompt_ids.reserve(params.antiprompt.size());
for (const std::string & antiprompt : params.antiprompt) { for (const std::string & antiprompt : params.antiprompt) {
antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true)); antiprompt_ids.emplace_back(::common_tokenize(ctx, antiprompt, false, true));
} }
if (llama_model_has_encoder(model)) { if (llama_model_has_encoder(model)) {
@ -679,9 +679,9 @@ int main(int argc, char ** argv) {
LOG_DBG("saved session to %s\n", path_session.c_str()); LOG_DBG("saved session to %s\n", path_session.c_str());
} }
const llama_token id = gpt_sampler_sample(smpl, ctx, -1); const llama_token id = common_sampler_sample(smpl, ctx, -1);
gpt_sampler_accept(smpl, id, /* accept_grammar= */ true); common_sampler_accept(smpl, id, /* accept_grammar= */ true);
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
@ -702,7 +702,7 @@ int main(int argc, char ** argv) {
// push the prompt in the sampling context in order to apply repetition penalties later // push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules // for the prompt, we don't apply grammar rules
gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
++n_consumed; ++n_consumed;
if ((int) embd.size() >= params.n_batch) { if ((int) embd.size() >= params.n_batch) {
@ -714,7 +714,7 @@ int main(int argc, char ** argv) {
// display text // display text
if (input_echo && display) { if (input_echo && display) {
for (auto id : embd) { for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id, params.special); const std::string token_str = common_token_to_piece(ctx, id, params.special);
// Console/Stream Output // Console/Stream Output
LOG("%s", token_str.c_str()); LOG("%s", token_str.c_str());
@ -743,7 +743,7 @@ int main(int argc, char ** argv) {
// check for reverse prompt in the last n_prev tokens // check for reverse prompt in the last n_prev tokens
if (!params.antiprompt.empty()) { if (!params.antiprompt.empty()) {
const int n_prev = 32; const int n_prev = 32;
const std::string last_output = gpt_sampler_prev_str(smpl, ctx, n_prev); const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev);
is_antiprompt = false; is_antiprompt = false;
// Check if each of the reverse prompts appears at the end of the output. // Check if each of the reverse prompts appears at the end of the output.
@ -765,7 +765,7 @@ int main(int argc, char ** argv) {
} }
// check for reverse prompt using special tokens // check for reverse prompt using special tokens
llama_token last_token = gpt_sampler_last(smpl); llama_token last_token = common_sampler_last(smpl);
for (std::vector<llama_token> ids : antiprompt_ids) { for (std::vector<llama_token> ids : antiprompt_ids) {
if (ids.size() == 1 && last_token == ids[0]) { if (ids.size() == 1 && last_token == ids[0]) {
if (params.interactive) { if (params.interactive) {
@ -782,13 +782,13 @@ int main(int argc, char ** argv) {
} }
// deal with end of generation tokens in interactive mode // deal with end of generation tokens in interactive mode
if (llama_token_is_eog(model, gpt_sampler_last(smpl))) { if (llama_token_is_eog(model, common_sampler_last(smpl))) {
LOG_DBG("found an EOG token\n"); LOG_DBG("found an EOG token\n");
if (params.interactive) { if (params.interactive) {
if (!params.antiprompt.empty()) { if (!params.antiprompt.empty()) {
// tokenize and inject first reverse prompt // tokenize and inject first reverse prompt
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true); const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true);
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
is_antiprompt = true; is_antiprompt = true;
} }
@ -803,8 +803,8 @@ int main(int argc, char ** argv) {
// if current token is not EOG, we add it to current assistant message // if current token is not EOG, we add it to current assistant message
if (params.conversation) { if (params.conversation) {
const auto id = gpt_sampler_last(smpl); const auto id = common_sampler_last(smpl);
assistant_ss << llama_token_to_piece(ctx, id, false); assistant_ss << common_token_to_piece(ctx, id, false);
} }
if (n_past > 0 && is_interacting) { if (n_past > 0 && is_interacting) {
@ -862,9 +862,9 @@ int main(int argc, char ** argv) {
? chat_add_and_format(model, chat_msgs, "user", std::move(buffer)) ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
: std::move(buffer); : std::move(buffer);
// TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true);
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat); const auto line_inp = common_tokenize(ctx, user_inp, false, format_chat);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true);
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
@ -882,7 +882,7 @@ int main(int argc, char ** argv) {
for (size_t i = original_size; i < embd_inp.size(); ++i) { for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i]; const llama_token token = embd_inp[i];
output_tokens.push_back(token); output_tokens.push_back(token);
output_ss << llama_token_to_piece(ctx, token); output_ss << common_token_to_piece(ctx, token);
} }
// reset assistant message // reset assistant message
@ -899,7 +899,7 @@ int main(int argc, char ** argv) {
if (n_past > 0) { if (n_past > 0) {
if (is_interacting) { if (is_interacting) {
gpt_sampler_reset(smpl); common_sampler_reset(smpl);
} }
is_interacting = false; is_interacting = false;
} }
@ -925,10 +925,10 @@ int main(int argc, char ** argv) {
} }
LOG_INF("\n\n"); LOG_INF("\n\n");
gpt_perf_print(ctx, smpl); common_perf_print(ctx, smpl);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
gpt_sampler_free(smpl); common_sampler_free(smpl);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
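main.cpp shows the renamed entry-point boilerplate: common_params plus common_params_parse, common_init for logging, and common_init_from_params to load the model and context in one call. A minimal sketch of that skeleton; the header locations are an assumption (the parser is expected in common/arg.h at this point), while the calls themselves mirror the hunks:

// Sketch of the renamed startup/shutdown boilerplate shared by the examples in this commit.
#include "arg.h"       // common_params_parse (assumed location)
#include "common.h"    // common_params, common_init, common_init_from_params, common_tokenize
#include "llama.h"
#include <vector>

int main(int argc, char ** argv) {
    common_params params;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }
    common_init(); // initialize logging

    llama_backend_init();
    llama_numa_init(params.numa);

    // load the model and create a context from the parsed parameters
    common_init_result llama_init = common_init_from_params(params);
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;
    if (model == nullptr || ctx == nullptr) {
        return 1;
    }

    std::vector<llama_token> embd_inp = common_tokenize(ctx, params.prompt, true, true);
    // ... decode the prompt and sample as in the examples above ...
    (void) embd_inp;

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}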


@ -54,7 +54,7 @@ static std::vector<std::string> k_prompts = {
struct client { struct client {
~client() { ~client() {
if (smpl) { if (smpl) {
gpt_sampler_free(smpl); common_sampler_free(smpl);
} }
} }
@ -75,7 +75,7 @@ struct client {
std::string prompt; std::string prompt;
std::string response; std::string response;
struct gpt_sampler * smpl = nullptr; struct common_sampler * smpl = nullptr;
}; };
static void print_date_time() { static void print_date_time() {
@ -103,13 +103,13 @@ static std::vector<std::string> split_string(const std::string& input, char deli
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
srand(1234); srand(1234);
gpt_params params; common_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
return 1; return 1;
} }
gpt_init(); common_init();
// number of simultaneous "clients" to simulate // number of simultaneous "clients" to simulate
const int32_t n_clients = params.n_parallel; const int32_t n_clients = params.n_parallel;
@ -130,7 +130,7 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa); llama_numa_init(params.numa);
// load the target model // load the target model
llama_init_result llama_init = llama_init_from_gpt_params(params); common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model; llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;
@ -160,11 +160,11 @@ int main(int argc, char ** argv) {
for (size_t i = 0; i < clients.size(); ++i) { for (size_t i = 0; i < clients.size(); ++i) {
auto & client = clients[i]; auto & client = clients[i];
client.id = i; client.id = i;
client.smpl = gpt_sampler_init(model, params.sparams); client.smpl = common_sampler_init(model, params.sparams);
} }
std::vector<llama_token> tokens_system; std::vector<llama_token> tokens_system;
tokens_system = ::llama_tokenize(ctx, k_system, true); tokens_system = common_tokenize(ctx, k_system, true);
const int32_t n_tokens_system = tokens_system.size(); const int32_t n_tokens_system = tokens_system.size();
llama_seq_id g_seq_id = 0; llama_seq_id g_seq_id = 0;
@ -189,7 +189,7 @@ int main(int argc, char ** argv) {
LOG_INF("%s: Evaluating the system prompt ...\n", __func__); LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
for (int32_t i = 0; i < n_tokens_system; ++i) { for (int32_t i = 0; i < n_tokens_system; ++i) {
llama_batch_add(batch, tokens_system[i], i, { 0 }, false); common_batch_add(batch, tokens_system[i], i, { 0 }, false);
} }
if (llama_decode(ctx, batch) != 0) { if (llama_decode(ctx, batch) != 0) {
@ -210,10 +210,10 @@ int main(int argc, char ** argv) {
while (true) { while (true) {
if (dump_kv_cache) { if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view); llama_kv_cache_view_update(ctx, &kvc_view);
llama_kv_cache_dump_view_seqs(kvc_view, 40); common_kv_cache_dump_view_seqs(kvc_view, 40);
} }
llama_batch_clear(batch); common_batch_clear(batch);
// decode any currently ongoing sequences // decode any currently ongoing sequences
for (auto & client : clients) { for (auto & client : clients) {
@ -223,7 +223,7 @@ int main(int argc, char ** argv) {
client.i_batch = batch.n_tokens; client.i_batch = batch.n_tokens;
llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true); common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
client.n_decoded += 1; client.n_decoded += 1;
} }
@ -252,14 +252,14 @@ int main(int argc, char ** argv) {
client.prompt = client.input + "\nAssistant:"; client.prompt = client.input + "\nAssistant:";
client.response = ""; client.response = "";
gpt_sampler_reset(client.smpl); common_sampler_reset(client.smpl);
// do not prepend BOS because we have a system prompt! // do not prepend BOS because we have a system prompt!
std::vector<llama_token> tokens_prompt; std::vector<llama_token> tokens_prompt;
tokens_prompt = ::llama_tokenize(ctx, client.prompt, false); tokens_prompt = common_tokenize(ctx, client.prompt, false);
for (size_t i = 0; i < tokens_prompt.size(); ++i) { for (size_t i = 0; i < tokens_prompt.size(); ++i) {
llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false); common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
} }
// extract the logits only for the last token // extract the logits only for the last token
@ -340,9 +340,9 @@ int main(int argc, char ** argv) {
//printf("client %d, seq %d, token %d, pos %d, batch %d\n", //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch); // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
const llama_token id = gpt_sampler_sample(client.smpl, ctx, client.i_batch - i); const llama_token id = common_sampler_sample(client.smpl, ctx, client.i_batch - i);
gpt_sampler_accept(client.smpl, id, true); common_sampler_accept(client.smpl, id, true);
if (client.n_decoded == 1) { if (client.n_decoded == 1) {
// start measuring generation time after the first token to make sure all concurrent clients // start measuring generation time after the first token to make sure all concurrent clients
@ -350,7 +350,7 @@ int main(int argc, char ** argv) {
client.t_start_gen = ggml_time_us(); client.t_start_gen = ggml_time_us();
} }
const std::string token_str = llama_token_to_piece(ctx, id); const std::string token_str = common_token_to_piece(ctx, id);
client.response += token_str; client.response += token_str;
client.sampled = id; client.sampled = id;


@ -15,17 +15,17 @@ static void print_usage(int, char ** argv) {
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; common_params params;
params.n_junk = 250; params.n_junk = 250;
params.n_keep = 32; params.n_keep = 32;
params.i_pos = -1; params.i_pos = -1;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
return 1; return 1;
} }
gpt_init(); common_init();
int n_junk = params.n_junk; int n_junk = params.n_junk;
int n_keep = params.n_keep; int n_keep = params.n_keep;
@ -61,7 +61,7 @@ int main(int argc, char ** argv) {
// initialize the model // initialize the model
llama_model_params model_params = llama_model_params_from_gpt_params(params); llama_model_params model_params = common_model_params_to_llama(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@ -72,7 +72,7 @@ int main(int argc, char ** argv) {
// initialize the context // initialize the context
llama_context_params ctx_params = llama_context_params_from_gpt_params(params); llama_context_params ctx_params = common_context_params_to_llama(params);
ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep; ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
@ -92,10 +92,10 @@ int main(int argc, char ** argv) {
// tokenize the prompt // tokenize the prompt
std::vector<llama_token> tokens_list; std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(ctx, params.prompt, true); tokens_list = common_tokenize(ctx, params.prompt, true);
// tokenize the prefix and use it as a sink // tokenize the prefix and use it as a sink
const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size(); const int n_tokens_prefix = common_tokenize(ctx, prompt_prefix, true).size();
const int n_tokens_all = tokens_list.size(); const int n_tokens_all = tokens_list.size();
@ -137,10 +137,10 @@ int main(int argc, char ** argv) {
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
} }
llama_batch_clear(batch); common_batch_clear(batch);
for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) { for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
} }
if (i + n_batch >= n_tokens_all) { if (i + n_batch >= n_tokens_all) {
@ -171,10 +171,10 @@ int main(int argc, char ** argv) {
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
llama_batch_clear(batch); common_batch_clear(batch);
for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) { for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
} }
if (i + n_batch >= n_tokens_all) { if (i + n_batch >= n_tokens_all) {
@ -229,15 +229,15 @@ int main(int argc, char ** argv) {
break; break;
} }
LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
n_decode += 1; n_decode += 1;
// prepare the next batch // prepare the next batch
llama_batch_clear(batch); common_batch_clear(batch);
// push this new token for next evaluation // push this new token for next evaluation
llama_batch_add(batch, new_token_id, n_past++, { 0 }, true); common_batch_add(batch, new_token_id, n_past++, { 0 }, true);
} }
n_cur += 1; n_cur += 1;
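passkey takes a lower-level path: instead of common_init_from_params it converts the parsed common_params into raw llama.cpp structs via common_model_params_to_llama and common_context_params_to_llama so it can adjust n_ctx first. A hedged sketch of that path; the context-creation call and the error handling sit outside the visible hunks and are my assumption about the surrounding code:

// Sketch: manual model/context setup from common_params, as in the passkey example.
#include "common.h"
#include "llama.h"
#include <cstdio>

static llama_context * init_from_common_params(common_params & params) {
    llama_model_params model_params = common_model_params_to_llama(params);

    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model '%s'\n", params.model.c_str());
        return nullptr;
    }

    llama_context_params ctx_params = common_context_params_to_llama(params);
    // the example additionally enlarges ctx_params.n_ctx based on the model's training context
    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
    if (ctx == nullptr) {
        llama_free_model(model);
        return nullptr;
    }

    return ctx;
}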


@ -35,7 +35,7 @@ struct results_log_softmax {
}; };
static void write_logfile( static void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model, const llama_context * ctx, const common_params & params, const llama_model * model,
const struct results_perplexity & results const struct results_perplexity & results
) { ) {
if (params.logdir.empty()) { if (params.logdir.empty()) {
@ -169,7 +169,7 @@ static void process_logits(
break; break;
} }
lock.unlock(); lock.unlock();
const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]); const results_log_softmax results = log_softmax(n_vocab, logits + size_t(i)*n_vocab, tokens[i+1]);
const double v = -results.log_softmax; const double v = -results.log_softmax;
local_nll += v; local_nll += v;
local_nll2 += v*v; local_nll2 += v*v;
@ -203,7 +203,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
break; break;
} }
lock.unlock(); lock.unlock();
const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]); const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
local_nll += v; local_nll += v;
local_nll2 += v*v; local_nll2 += v*v;
} }
@ -281,7 +281,9 @@ static std::pair<double, float> log_softmax(int n_vocab, const float * logits, c
kld.sum_kld += sum; kld.sum_kld += sum;
kld.sum_kld2 += sum*sum; kld.sum_kld2 += sum*sum;
++kld.count; ++kld.count;
if (imax == imax_base) ++kld.n_same_top; if (imax == imax_base) {
++kld.n_same_top;
}
const float p_base = expf(-nll_base); const float p_base = expf(-nll_base);
const float p = expf(-nll); const float p = expf(-nll);
@ -323,7 +325,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
break; break;
} }
lock.unlock(); lock.unlock();
std::pair<double, float> v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld); std::pair<double, float> v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
kld_values[i] = (float)v.first; kld_values[i] = (float)v.first;
p_diff_values[i] = v.second; p_diff_values[i] = v.second;
} }
@ -337,7 +339,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
} }
} }
static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { static results_perplexity perplexity_v2(llama_context * ctx, const common_params & params) {
// Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]` // Output: `perplexity: 13.5106 [114/114]`
@ -348,7 +350,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
LOG_INF("%s: tokenizing the input ..\n", __func__); LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true); std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
@ -383,9 +385,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride; const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_batch = params.n_batch; const int n_batch = params.n_batch;
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
int count = 0; int count = 0;
double nll = 0.0; double nll = 0.0;
@ -424,8 +427,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
} }
const auto batch_logits = llama_get_logits(ctx); const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
if (j == 0) { if (j == 0) {
tokens[batch_start] = token_org; tokens[batch_start] = token_org;
@ -447,11 +450,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
//LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start); //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) { for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
// Calculate probability of next token, given the previous ones. // Calculate probability of next token, given the previous ones.
const std::vector<float> tok_logits( const std::vector<float> tok_logits(
logits.begin() + (j + 0) * n_vocab, logits.begin() + size_t(j + 0) * n_vocab,
logits.begin() + (j + 1) * n_vocab); logits.begin() + size_t(j + 1) * n_vocab);
const float prob = softmax(tok_logits)[tokens[start + j + 1]]; const float prob = softmax(tok_logits)[tokens[start + j + 1]];
logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]]; logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]];
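
The loop above slices one row out of the flat logits buffer and reads the probability of the observed next token from its softmax. A self-contained sketch of that step, using the usual max-subtracted softmax for numerical stability and made-up logit values (the helper below is local to the sketch, not the one used in the example):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Max-subtracted softmax over one row of logits; a stand-in for the helper used above.
static std::vector<float> softmax_row(const std::vector<float> & logits) {
    const float max_logit = *std::max_element(logits.begin(), logits.end());
    double sum = 0.0;
    std::vector<float> probs(logits.size());
    for (size_t i = 0; i < logits.size(); ++i) {
        probs[i] = (float) std::exp(logits[i] - max_logit);
        sum += probs[i];
    }
    for (float & p : probs) {
        p = (float) (p / sum);
    }
    return probs;
}

int main() {
    const std::vector<float> row = {2.0f, 0.5f, -1.0f, 3.0f}; // hypothetical logits for a 4-token vocab
    const int next_token = 3;                                 // observed next token id
    const float prob = softmax_row(row)[next_token];
    printf("p(next token) = %.4f, nll contribution = %.4f\n", prob, (float) -std::log(prob));
    return 0;
}
```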
@ -472,7 +474,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
return {tokens, std::exp(nll / count), logit_history, prob_history}; return {tokens, std::exp(nll / count), logit_history, prob_history};
} }
static results_perplexity perplexity(llama_context * ctx, const gpt_params & params, const int32_t n_ctx) { static results_perplexity perplexity(llama_context * ctx, const common_params & params, const int32_t n_ctx) {
if (params.ppl_stride > 0) { if (params.ppl_stride > 0) {
return perplexity_v2(ctx, params); return perplexity_v2(ctx, params);
} }
@ -500,7 +502,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
auto tim1 = std::chrono::high_resolution_clock::now(); auto tim1 = std::chrono::high_resolution_clock::now();
LOG_INF("%s: tokenizing the input ..\n", __func__); LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true); std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
auto tim2 = std::chrono::high_resolution_clock::now(); auto tim2 = std::chrono::high_resolution_clock::now();
LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count()); LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@ -521,9 +523,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
const int n_chunk_max = tokens.size() / n_ctx; const int n_chunk_max = tokens.size() / n_ctx;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_batch = params.n_batch; const int n_batch = params.n_batch;
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
int count = 0; int count = 0;
double nll = 0.0; double nll = 0.0;
double nll2 = 0.0; double nll2 = 0.0;
@ -538,7 +541,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
std::vector<float> logits; std::vector<float> logits;
if (num_batches > 1) { if (num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab); logits.reserve(size_t(n_ctx) * n_vocab);
} }
LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
@ -620,7 +623,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
if (num_batches > 1 && n_outputs > 0) { if (num_batches > 1 && n_outputs > 0) {
const auto * batch_logits = llama_get_logits(ctx); const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab); logits.insert(logits.end(), batch_logits, batch_logits + size_t(n_outputs) * n_vocab);
} }
} }
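
A recurring change in this hunk and the ones above is promoting the logits offset arithmetic to size_t before multiplying. A minimal standalone sketch of why, with hypothetical sizes not taken from the diff:

```cpp
#include <cstddef>
#include <cstdio>

int main() {
    // Hypothetical sizes: a large vocabulary and a few thousand rows are enough
    // for the element offset to exceed INT_MAX (2147483647).
    const int n_vocab = 128256;  // e.g. a Llama-3-sized vocabulary
    const int i       = 20000;   // row (token) index into the flat logits buffer

    // i * n_vocab evaluated in int would be 2565120000, which overflows and is
    // undefined behaviour for signed integers. Casting one operand to size_t
    // first keeps the whole multiplication in 64 bits on typical 64-bit builds:
    const size_t offset = size_t(i) * n_vocab;

    printf("row offset = %zu floats\n", offset);  // 2565120000
    return 0;
}
```

Casting one operand is enough; the other is promoted before the multiplication happens.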
@ -661,7 +664,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
} else { } else {
double av = nll/count; double av = nll/count;
double av2 = nll2/count - av*av; double av2 = nll2/count - av*av;
if (av2 > 0) av2 = sqrt(av2/(count-1)); if (av2 > 0) {
av2 = sqrt(av2/(count-1));
}
LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
} }
} }
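
The running statistics printed above reduce to perplexity = exp(mean NLL), plus an uncertainty estimate derived from the sum of squared per-token NLL values. A small sketch of the same arithmetic with made-up accumulator values:

```cpp
#include <cmath>
#include <cstdio>

int main() {
    // Hypothetical accumulators: sum of per-token NLL, sum of squared NLL, token count.
    const double nll   = 1843.7;
    const double nll2  = 7012.3;
    const int    count = 512;

    const double ppl = std::exp(nll / count);   // perplexity = exp(mean NLL)
    const double av  = nll / count;             // mean NLL
    double av2 = nll2 / count - av * av;        // variance of the per-token NLL
    if (av2 > 0) {
        av2 = std::sqrt(av2 / (count - 1));     // uncertainty of the mean, as in the hunk above
    }
    printf("ppl = %.4f, mean nll = %.4f +/- %.4f\n", ppl, av, av2);
    return 0;
}
```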
@ -686,10 +691,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
return {tokens, ppl, logit_history, prob_history}; return {tokens, ppl, logit_history, prob_history};
} }
static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int32_t n_batch, int32_t n_vocab) { static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int n_batch, int n_vocab) {
int prev_outputs = 0; int prev_outputs = 0;
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { for (int i = 0; i < (int) batch.n_tokens; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); const int n_tokens = std::min<int>(n_batch, batch.n_tokens - i);
llama_batch batch_view = { llama_batch batch_view = {
n_tokens, n_tokens,
@ -713,7 +718,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
n_outputs += batch_view.logits[i] != 0; n_outputs += batch_view.logits[i] != 0;
} }
memcpy(batch_logits.data() + prev_outputs*n_vocab, llama_get_logits(ctx), n_outputs*n_vocab*sizeof(float)); memcpy(batch_logits.data() + size_t(prev_outputs)*n_vocab, llama_get_logits(ctx), size_t(n_outputs)*n_vocab*sizeof(float));
prev_outputs += n_outputs; prev_outputs += n_outputs;
} }
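
decode_helper walks the full batch in windows of at most n_batch tokens and appends each window's logits behind the previous ones. The loop skeleton, reduced to its indexing and run on hypothetical sizes:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    // Hypothetical batch: 1337 tokens decoded in windows of at most n_batch tokens,
    // mirroring the loop structure of decode_helper above.
    const int n_tokens_total = 1337;
    const int n_batch        = 512;

    for (int i = 0; i < n_tokens_total; i += n_batch) {
        const int n_tokens = std::min(n_batch, n_tokens_total - i);
        printf("decode view: offset %4d, %3d tokens\n", i, n_tokens);
    }
    return 0;
}
```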
@ -728,7 +733,9 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
if (eval_results.size() != eval_pairs.size()) { if (eval_results.size() != eval_pairs.size()) {
eval_results.resize(eval_pairs.size()); eval_results.resize(eval_pairs.size());
} }
if (eval_pairs.empty()) return; if (eval_pairs.empty()) {
return;
}
size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size()); size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());
@ -736,11 +743,13 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () { auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
float local_logprobs[K_TOKEN_CHUNK]; float local_logprobs[K_TOKEN_CHUNK];
while (true) { while (true) {
size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed); const size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
if (first >= eval_results.size()) break; if (first >= eval_results.size()) {
size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size()); break;
}
const size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
for (size_t i = first; i < last; ++i) { for (size_t i = first; i < last; ++i) {
auto logits = batch_logits + eval_pairs[i].first * n_vocab; const auto * logits = batch_logits + eval_pairs[i].first * n_vocab;
float max_logit = logits[0]; float max_logit = logits[0];
for (int j = 1; j < n_vocab; ++j) { for (int j = 1; j < n_vocab; ++j) {
max_logit = std::max(max_logit, logits[j]); max_logit = std::max(max_logit, logits[j]);
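
The compute lambda hands out work in fixed-size chunks through a relaxed atomic counter, so the worker threads need no queue or locks. A stripped-down sketch of that pattern (chunk size, item count and the per-item work are made up):

```cpp
#include <algorithm>
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    // Hypothetical work list, processed in fixed-size chunks handed out by an
    // atomic counter, mirroring the compute lambda above (K_TOKEN_CHUNK-style).
    const size_t n_items = 23;
    const size_t chunk   = 4;

    std::vector<int> results(n_items, 0);
    std::atomic<size_t> counter(0);

    auto worker = [&]() {
        while (true) {
            const size_t first = counter.fetch_add(chunk, std::memory_order_relaxed);
            if (first >= n_items) {
                break;
            }
            const size_t last = std::min(first + chunk, n_items);
            for (size_t i = first; i < last; ++i) {
                results[i] = 1;  // stand-in for computing one log-probability
            }
        }
    };

    std::vector<std::thread> workers;
    for (int t = 0; t < 3; ++t) {
        workers.emplace_back(worker);
    }
    for (auto & w : workers) {
        w.join();
    }

    size_t done = 0;
    for (int r : results) {
        done += r;
    }
    printf("processed %zu of %zu items\n", done, n_items);
    return 0;
}
```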
@ -763,7 +772,7 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
} }
} }
static void hellaswag_score(llama_context * ctx, const gpt_params & params) { static void hellaswag_score(llama_context * ctx, const common_params & params) {
// Calculates hellaswag score (acc_norm) from prompt // Calculates hellaswag score (acc_norm) from prompt
// //
// Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
@ -844,7 +853,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] ); hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
for (size_t j = 0; j < 4; j++) { for (size_t j = 0; j < 4; j++) {
hs_cur.ending[j] = prompt_lines[idx*6+2+j]; hs_cur.ending[j] = prompt_lines[idx*6+2+j];
hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true); hs_cur.seq_tokens[j] = common_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
} }
// determine the common prefix of the endings // determine the common prefix of the endings
@ -877,10 +886,11 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
double acc = 0.0f; double acc = 0.0f;
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
const int n_batch = params.n_batch; const int n_batch = params.n_batch;
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int max_tasks_per_batch = 32; const int max_tasks_per_batch = 32;
const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
@ -888,7 +898,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
std::vector<float> tok_logits(n_vocab); std::vector<float> tok_logits(n_vocab);
// TODO: this could be made smaller; it's currently the worst-case size // TODO: this could be made smaller; it's currently the worst-case size
std::vector<float> batch_logits(n_vocab*n_ctx); std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
std::vector<std::pair<size_t, llama_token>> eval_pairs; std::vector<std::pair<size_t, llama_token>> eval_pairs;
std::vector<float> eval_results; std::vector<float> eval_results;
@ -900,7 +910,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
size_t i1 = i0; size_t i1 = i0;
size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch
llama_batch_clear(batch); common_batch_clear(batch);
// batch as many tasks as possible into the available context // batch as many tasks as possible into the available context

// each task has 4 unique sequence ids - one for each ending // each task has 4 unique sequence ids - one for each ending
@ -916,7 +926,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
} }
for (size_t i = 0; i < hs_cur.common_prefix; ++i) { for (size_t i = 0; i < hs_cur.common_prefix; ++i) {
llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false); common_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
} }
batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
n_logits += 1; n_logits += 1;
@ -926,7 +936,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// TODO: don't evaluate the last token of each sequence // TODO: don't evaluate the last token of each sequence
for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) { for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) {
const bool needs_logits = i < seq_tokens_size - 1; const bool needs_logits = i < seq_tokens_size - 1;
llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits); common_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
n_logits += needs_logits; n_logits += needs_logits;
} }
} }
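
The batching above relies on each task's precomputed common_prefix: the shared prefix is decoded once under all four sequence ids, and only the ending tails are decoded per sequence. A small sketch of how such a prefix length can be computed over tokenized endings (plain ints stand in for llama_token; this helper is illustrative, not the one in the example):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Length of the longest token prefix shared by all sequences.
static size_t common_prefix_len(const std::vector<std::vector<int>> & seqs) {
    if (seqs.empty()) {
        return 0;
    }
    size_t n = seqs[0].size();
    for (const auto & s : seqs) {
        n = std::min(n, s.size());
    }
    for (size_t i = 0; i < n; ++i) {
        for (const auto & s : seqs) {
            if (s[i] != seqs[0][i]) {
                return i;
            }
        }
    }
    return n;
}

int main() {
    const std::vector<std::vector<int>> endings = {
        {1, 7, 9, 4, 2},
        {1, 7, 9, 5},
        {1, 7, 9, 4, 8},
    };
    printf("common prefix = %zu tokens\n", common_prefix_len(endings)); // 3
    return 0;
}
```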
@ -975,7 +985,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
auto & hs_cur = hs_data[i]; auto & hs_cur = hs_data[i];
// get the logits of the last token of the common prefix // get the logits of the last token of the common prefix
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*hs_cur.i_logits, n_vocab*sizeof(float)); std::memcpy(tok_logits.data(), batch_logits.data() + hs_cur.i_logits*n_vocab, n_vocab*sizeof(float));
const auto first_probs = softmax(tok_logits); const auto first_probs = softmax(tok_logits);
@ -1102,7 +1112,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
* 0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2 * 0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2
* *
*/ */
static void winogrande_score(llama_context * ctx, const gpt_params & params) { static void winogrande_score(llama_context * ctx, const common_params & params) {
constexpr int k_min_trailing_ctx = 3; constexpr int k_min_trailing_ctx = 3;
@ -1136,8 +1146,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
LOG_INF("%s : tokenizing selected tasks\n", __func__); LOG_INF("%s : tokenizing selected tasks\n", __func__);
for (auto & task : data) { for (auto & task : data) {
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true); task.seq_tokens[0] = common_tokenize(ctx, task.first + task.choices[0] + task.second, true);
task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true); task.seq_tokens[1] = common_tokenize(ctx, task.first + task.choices[1] + task.second, true);
task.common_prefix = 0; task.common_prefix = 0;
for (size_t k = 0; k < task.seq_tokens[0].size(); k++) { for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
@ -1152,16 +1162,17 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
task.seq_tokens[0].size() - task.common_prefix + task.seq_tokens[0].size() - task.common_prefix +
task.seq_tokens[1].size() - task.common_prefix; task.seq_tokens[1].size() - task.common_prefix;
task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size(); task.n_base1 = common_tokenize(ctx, task.first + task.choices[0], true).size();
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size(); task.n_base2 = common_tokenize(ctx, task.first + task.choices[1], true).size();
} }
LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__); LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
const int n_batch = params.n_batch; const int n_batch = params.n_batch;
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int max_tasks_per_batch = 128; const int max_tasks_per_batch = 128;
const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
@ -1169,7 +1180,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
std::vector<float> tok_logits(n_vocab); std::vector<float> tok_logits(n_vocab);
// TODO: this could be made smaller; it's currently the worst-case size // TODO: this could be made smaller; it's currently the worst-case size
std::vector<float> batch_logits(n_vocab*n_ctx); std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
std::vector<std::pair<size_t, llama_token>> eval_pairs; std::vector<std::pair<size_t, llama_token>> eval_pairs;
std::vector<float> eval_results; std::vector<float> eval_results;
@ -1184,7 +1195,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
size_t i1 = i0; size_t i1 = i0;
size_t i_logits = 0; size_t i_logits = 0;
llama_batch_clear(batch); common_batch_clear(batch);
while (n_cur + (int) data[i1].required_tokens <= n_ctx) { while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
int n_logits = 0; int n_logits = 0;
@ -1194,7 +1205,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
} }
for (size_t i = 0; i < data[i1].common_prefix; ++i) { for (size_t i = 0; i < data[i1].common_prefix; ++i) {
llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false); common_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
} }
batch.logits[batch.n_tokens - 1] = true; batch.logits[batch.n_tokens - 1] = true;
n_logits += 1; n_logits += 1;
@ -1202,7 +1213,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
for (int s = 0; s < 2; ++s) { for (int s = 0; s < 2; ++s) {
// TODO: end before the last token, no need to predict past the end of the sequences // TODO: end before the last token, no need to predict past the end of the sequences
for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) { for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) {
llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true); common_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
n_logits += 1; n_logits += 1;
} }
} }
@ -1359,7 +1370,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
} }
return false; return false;
} }
task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true)); task.seq_tokens.emplace_back(::common_tokenize(ctx, task.question + " " + answer, true));
} }
auto min_len = task.seq_tokens.front().size(); auto min_len = task.seq_tokens.front().size();
for (auto& seq : task.seq_tokens) { for (auto& seq : task.seq_tokens) {
@ -1403,7 +1414,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
// git@hf.co:datasets/Stevross/mmlu // git@hf.co:datasets/Stevross/mmlu
// https://huggingface.co/datasets/truthful_qa // https://huggingface.co/datasets/truthful_qa
// //
static void multiple_choice_score(llama_context * ctx, const gpt_params & params) { static void multiple_choice_score(llama_context * ctx, const common_params & params) {
std::istringstream strstream(params.prompt); std::istringstream strstream(params.prompt);
uint32_t n_task; uint32_t n_task;
@ -1509,17 +1520,18 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
LOG("\ntask\tacc_norm\n"); LOG("\ntask\tacc_norm\n");
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
const int n_batch = params.n_batch; const int n_batch = params.n_batch;
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int max_tasks_per_batch = 32; const int max_tasks_per_batch = 32;
const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
std::vector<float> tok_logits(n_vocab); std::vector<float> tok_logits(n_vocab);
std::vector<float> batch_logits(n_vocab*n_ctx); std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
std::vector<std::pair<size_t, llama_token>> eval_pairs; std::vector<std::pair<size_t, llama_token>> eval_pairs;
std::vector<float> eval_results; std::vector<float> eval_results;
@ -1536,7 +1548,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
size_t i1 = i0; size_t i1 = i0;
size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch
llama_batch_clear(batch); common_batch_clear(batch);
// batch as many tasks as possible into the available context // batch as many tasks as possible into the available context
// each task has 4 unique sequence ids - one for each ending // each task has 4 unique sequence ids - one for each ending
@ -1559,7 +1571,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
for (size_t i = 0; i < cur_task.common_prefix; ++i) { for (size_t i = 0; i < cur_task.common_prefix; ++i) {
//llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false); //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false); common_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
} }
batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
n_logits += 1; n_logits += 1;
@ -1569,7 +1581,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
// TODO: don't evaluate the last token of each sequence // TODO: don't evaluate the last token of each sequence
for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) { for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) {
const bool needs_logits = i < seq_tokens_size - 1; const bool needs_logits = i < seq_tokens_size - 1;
llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits); common_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
n_logits += needs_logits; n_logits += needs_logits;
} }
} }
@ -1627,7 +1639,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
//LOG("\n common_prefix: %zu\n", cur_task.common_prefix); //LOG("\n common_prefix: %zu\n", cur_task.common_prefix);
// get the logits of the last token of the common prefix // get the logits of the last token of the common prefix
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float)); std::memcpy(tok_logits.data(), batch_logits.data() + cur_task.i_logits*n_vocab, n_vocab*sizeof(float));
const auto first_probs = softmax(tok_logits); const auto first_probs = softmax(tok_logits);
@ -1683,7 +1695,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
LOG_INF("\n"); LOG_INF("\n");
} }
static void kl_divergence(llama_context * ctx, const gpt_params & params) { static void kl_divergence(llama_context * ctx, const common_params & params) {
if (params.logits_file.empty()) { if (params.logits_file.empty()) {
LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
return; return;
@ -1709,7 +1721,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
__func__, params.logits_file.c_str(), n_ctx, params.n_ctx); __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
} }
int n_vocab, n_chunk; int n_vocab;
int n_chunk;
in.read((char *)&n_vocab, sizeof(n_vocab)); in.read((char *)&n_vocab, sizeof(n_vocab));
in.read((char *)&n_chunk, sizeof(n_chunk)); in.read((char *)&n_chunk, sizeof(n_chunk));
if (in.fail()) { if (in.fail()) {
@ -1720,7 +1733,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx))); LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
} }
std::vector<llama_token> tokens(n_ctx * n_chunk); std::vector<llama_token> tokens(size_t(n_ctx) * n_chunk);
if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) { if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str()); LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
return; return;
@ -1737,7 +1750,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
std::vector<float> logits; std::vector<float> logits;
if (num_batches > 1) { if (num_batches > 1) {
logits.reserve(n_ctx * n_vocab); logits.reserve(size_t(n_ctx) * n_vocab);
} }
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1); std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
@ -1801,7 +1814,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
if (num_batches > 1) { if (num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx); const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
} }
} }
@ -1822,7 +1835,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
const int first = n_ctx/2; const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, process_logits(n_vocab, all_logits + size_t(first)*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr); workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
p_diff_ptr += n_ctx - 1 - first; p_diff_ptr += n_ctx - 1 - first;
kld_ptr += n_ctx - 1 - first; kld_ptr += n_ctx - 1 - first;
@ -1955,17 +1968,17 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; common_params params;
params.n_ctx = 512; params.n_ctx = 512;
params.logits_all = true; params.logits_all = true;
params.escape = false; params.escape = false;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
return 1; return 1;
} }
gpt_init(); common_init();
const int32_t n_ctx = params.n_ctx; const int32_t n_ctx = params.n_ctx;
@ -2004,7 +2017,7 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa); llama_numa_init(params.numa);
// load the model and apply lora adapter, if any // load the model and apply lora adapter, if any
llama_init_result llama_init = llama_init_from_gpt_params(params); common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model; llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;
@ -2023,7 +2036,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
LOG_INF("\n"); LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); LOG_INF("%s\n", common_params_get_system_info(params).c_str());
} }
struct results_perplexity results; struct results_perplexity results;

View file

@ -77,7 +77,7 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) { static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
size_t n_tokens = tokens.size(); size_t n_tokens = tokens.size();
for (size_t i = 0; i < n_tokens; i++) { for (size_t i = 0; i < n_tokens; i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, true); common_batch_add(batch, tokens[i], i, { seq_id }, true);
} }
} }
@ -107,18 +107,18 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
} }
float * out = output + batch.seq_id[i][0] * n_embd; float * out = output + batch.seq_id[i][0] * n_embd;
llama_embd_normalize(embd, out, n_embd); common_embd_normalize(embd, out, n_embd);
} }
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; common_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
return 1; return 1;
} }
gpt_init(); common_init();
// For BERT models, batch size must be equal to ubatch size // For BERT models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch; params.n_ubatch = params.n_batch;
@ -149,7 +149,7 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa); llama_numa_init(params.numa);
// load the model // load the model
llama_init_result llama_init = llama_init_from_gpt_params(params); common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model; llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;
@ -176,7 +176,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
LOG_INF("\n"); LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); LOG_INF("%s\n", common_params_get_system_info(params).c_str());
} }
// max batch size // max batch size
@ -185,7 +185,7 @@ int main(int argc, char ** argv) {
// tokenize the prompts and trim // tokenize the prompts and trim
for (auto & chunk : chunks) { for (auto & chunk : chunks) {
auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false); auto inp = common_tokenize(ctx, chunk.textdata, true, false);
if (inp.size() > n_batch) { if (inp.size() > n_batch) {
LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n", LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch); __func__, (long long int) inp.size(), (long long int) n_batch);
@ -204,7 +204,7 @@ int main(int argc, char ** argv) {
LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str()); LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size()); LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
for (int j = 0; j < (int) chunks[i].tokens.size(); j++) { for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str()); LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], common_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
} }
LOG_INF("\n\n"); LOG_INF("\n\n");
} }
@ -232,7 +232,7 @@ int main(int argc, char ** argv) {
if (batch.n_tokens + n_toks > n_batch) { if (batch.n_tokens + n_toks > n_batch) {
float * out = emb + p * n_embd; float * out = emb + p * n_embd;
batch_decode(ctx, batch, out, s, n_embd); batch_decode(ctx, batch, out, s, n_embd);
llama_batch_clear(batch); common_batch_clear(batch);
p += s; p += s;
s = 0; s = 0;
} }
@ -260,20 +260,20 @@ int main(int argc, char ** argv) {
while (true) { while (true) {
LOG("Enter query: "); LOG("Enter query: ");
std::getline(std::cin, query); std::getline(std::cin, query);
std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true); std::vector<int32_t> query_tokens = common_tokenize(ctx, query, true);
batch_add_seq(query_batch, query_tokens, 0); batch_add_seq(query_batch, query_tokens, 0);
std::vector<float> query_emb(n_embd, 0); std::vector<float> query_emb(n_embd, 0);
batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd); batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
llama_batch_clear(query_batch); common_batch_clear(query_batch);
// compute cosine similarities // compute cosine similarities
{ {
std::vector<std::pair<int, float>> similarities; std::vector<std::pair<int, float>> similarities;
for (int i = 0; i < n_chunks; i++) { for (int i = 0; i < n_chunks; i++) {
float sim = llama_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd); float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
similarities.push_back(std::make_pair(i, sim)); similarities.push_back(std::make_pair(i, sim));
} }
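
The similarity loop compares the query embedding against every chunk embedding with cosine similarity; because both sides were normalized earlier via common_embd_normalize, this reduces to a dot product. A standalone sketch of the general form (the helper name and vectors are made up):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Cosine similarity between two embeddings; a stand-in for common_embd_similarity_cos.
static float cosine_sim(const std::vector<float> & a, const std::vector<float> & b) {
    double dot = 0.0, na = 0.0, nb = 0.0;
    for (size_t i = 0; i < a.size(); ++i) {
        dot += a[i] * b[i];
        na  += a[i] * a[i];
        nb  += b[i] * b[i];
    }
    if (na == 0.0 || nb == 0.0) {
        return 0.0f;
    }
    return (float) (dot / (std::sqrt(na) * std::sqrt(nb)));
}

int main() {
    const std::vector<float> q = {0.1f, 0.7f, 0.2f};   // hypothetical query embedding
    const std::vector<float> c = {0.2f, 0.6f, 0.1f};   // hypothetical chunk embedding
    printf("cos sim = %f\n", cosine_sim(q, c));
    return 0;
}
```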

View file

@ -151,7 +151,7 @@ int main(int argc, char * argv[]) {
get_backend_memory(&free_mem, &total_mem); get_backend_memory(&free_mem, &total_mem);
} }
printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024)); printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem); ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
ggml_backend_free(backend); ggml_backend_free(backend);
return 0; return 0;
} }

View file

@ -6,12 +6,12 @@
#include <cstdio> #include <cstdio>
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; common_params params;
params.prompt = "The quick brown fox"; params.prompt = "The quick brown fox";
params.sparams.seed = 1234; params.sparams.seed = 1234;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1; return 1;
} }
@ -28,7 +28,7 @@ int main(int argc, char ** argv) {
std::string result2; std::string result2;
// init // init
llama_init_result llama_init = llama_init_from_gpt_params(params); common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model; llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;
@ -46,7 +46,7 @@ int main(int argc, char ** argv) {
llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed)); llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));
// tokenize prompt // tokenize prompt
auto tokens = llama_tokenize(ctx, params.prompt, true); auto tokens = common_tokenize(ctx, params.prompt, true);
// evaluate prompt // evaluate prompt
llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0)); llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
@ -72,7 +72,7 @@ int main(int argc, char ** argv) {
for (auto i = 0; i < params.n_predict; i++) { for (auto i = 0; i < params.n_predict; i++) {
auto next_token = llama_sampler_sample(smpl, ctx, -1); auto next_token = llama_sampler_sample(smpl, ctx, -1);
auto next_token_str = llama_token_to_piece(ctx, next_token); auto next_token_str = common_token_to_piece(ctx, next_token);
printf("%s", next_token_str.c_str()); printf("%s", next_token_str.c_str());
result0 += next_token_str; result0 += next_token_str;
@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
llama_free(ctx); llama_free(ctx);
// make new context // make new context
auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params)); auto * ctx2 = llama_new_context_with_model(model, common_context_params_to_llama(params));
llama_sampler * smpl2 = llama_sampler_chain_init(sparams); llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
@ -128,7 +128,7 @@ int main(int argc, char ** argv) {
// second run // second run
for (auto i = 0; i < params.n_predict; i++) { for (auto i = 0; i < params.n_predict; i++) {
auto next_token = llama_sampler_sample(smpl2, ctx2, -1); auto next_token = llama_sampler_sample(smpl2, ctx2, -1);
auto next_token_str = llama_token_to_piece(ctx2, next_token); auto next_token_str = common_token_to_piece(ctx2, next_token);
printf("%s", next_token_str.c_str()); printf("%s", next_token_str.c_str());
result1 += next_token_str; result1 += next_token_str;
@ -152,7 +152,7 @@ int main(int argc, char ** argv) {
} }
// make new context // make new context
auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params)); auto * ctx3 = llama_new_context_with_model(model, common_context_params_to_llama(params));
llama_sampler * smpl3 = llama_sampler_chain_init(sparams); llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
@ -216,7 +216,7 @@ int main(int argc, char ** argv) {
// third run with seq 1 instead of 0 // third run with seq 1 instead of 0
for (auto i = 0; i < params.n_predict; i++) { for (auto i = 0; i < params.n_predict; i++) {
auto next_token = llama_sampler_sample(smpl3, ctx3, -1); auto next_token = llama_sampler_sample(smpl3, ctx3, -1);
auto next_token_str = llama_token_to_piece(ctx3, next_token); auto next_token_str = common_token_to_piece(ctx3, next_token);
printf("%s", next_token_str.c_str()); printf("%s", next_token_str.c_str());
result2 += next_token_str; result2 += next_token_str;

View file

@ -188,8 +188,8 @@ struct server_slot {
// sampling // sampling
json json_schema; json json_schema;
struct gpt_sampler_params sparams; struct common_sampler_params sparams;
struct gpt_sampler * smpl = nullptr; struct common_sampler * smpl = nullptr;
llama_token sampled; llama_token sampled;
@ -231,7 +231,7 @@ struct server_slot {
generated_token_probs.clear(); generated_token_probs.clear();
} }
bool has_budget(gpt_params &global_params) { bool has_budget(common_params &global_params) {
if (params.n_predict == -1 && global_params.n_predict == -1) { if (params.n_predict == -1 && global_params.n_predict == -1) {
return true; // limitless return true; // limitless
} }
@ -611,9 +611,9 @@ struct server_response {
struct server_context { struct server_context {
llama_model * model = nullptr; llama_model * model = nullptr;
llama_context * ctx = nullptr; llama_context * ctx = nullptr;
std::vector<llama_lora_adapter_container> loras; std::vector<common_lora_adapter_container> loras;
gpt_params params; common_params params;
llama_batch batch = {}; llama_batch batch = {};
@ -655,20 +655,20 @@ struct server_context {
// Clear any sampling context // Clear any sampling context
for (server_slot & slot : slots) { for (server_slot & slot : slots) {
if (slot.smpl != nullptr) { if (slot.smpl != nullptr) {
gpt_sampler_free(slot.smpl); common_sampler_free(slot.smpl);
} }
} }
llama_batch_free(batch); llama_batch_free(batch);
} }
bool load_model(const gpt_params & params_) { bool load_model(const common_params & params_) {
params = params_; params = params_;
// dedicate one sequence to the system prompt // dedicate one sequence to the system prompt
params.n_parallel += 1; params.n_parallel += 1;
llama_init_result llama_init = llama_init_from_gpt_params(params); common_init_result llama_init = common_init_from_params(params);
model = llama_init.model; model = llama_init.model;
ctx = llama_init.context; ctx = llama_init.context;
@ -771,10 +771,10 @@ struct server_context {
std::vector<llama_token> p; std::vector<llama_token> p;
if (first) { if (first) {
p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL); p = common_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
first = false; first = false;
} else { } else {
p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL); p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
} }
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
@ -788,7 +788,7 @@ struct server_context {
} }
} else { } else {
auto s = json_prompt.template get<std::string>(); auto s = json_prompt.template get<std::string>();
prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL); prompt_tokens = common_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
} }
return prompt_tokens; return prompt_tokens;
@ -999,7 +999,7 @@ struct server_context {
slot.sparams.logit_bias.push_back({tok, bias}); slot.sparams.logit_bias.push_back({tok, bias});
} }
} else if (el[0].is_string()) { } else if (el[0].is_string()) {
auto toks = llama_tokenize(model, el[0].get<std::string>(), false); auto toks = common_tokenize(model, el[0].get<std::string>(), false);
for (auto tok : toks) { for (auto tok : toks) {
slot.sparams.logit_bias.push_back({tok, bias}); slot.sparams.logit_bias.push_back({tok, bias});
} }
@ -1031,7 +1031,7 @@ struct server_context {
sampler_names.emplace_back(name); sampler_names.emplace_back(name);
} }
} }
slot.sparams.samplers = gpt_sampler_types_from_names(sampler_names, false); slot.sparams.samplers = common_sampler_types_from_names(sampler_names, false);
} else { } else {
slot.sparams.samplers = default_sparams.samplers; slot.sparams.samplers = default_sparams.samplers;
} }
@ -1039,10 +1039,10 @@ struct server_context {
{ {
if (slot.smpl != nullptr) { if (slot.smpl != nullptr) {
gpt_sampler_free(slot.smpl); common_sampler_free(slot.smpl);
} }
slot.smpl = gpt_sampler_init(model, slot.sparams); slot.smpl = common_sampler_init(model, slot.sparams);
if (slot.smpl == nullptr) { if (slot.smpl == nullptr) {
// for now, the only error that may happen here is invalid grammar // for now, the only error that may happen here is invalid grammar
send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
@ -1073,7 +1073,7 @@ struct server_context {
system_tokens.clear(); system_tokens.clear();
if (!system_prompt.empty()) { if (!system_prompt.empty()) {
system_tokens = ::llama_tokenize(ctx, system_prompt, true); system_tokens = common_tokenize(ctx, system_prompt, true);
const int32_t n_batch = llama_n_batch(ctx); const int32_t n_batch = llama_n_batch(ctx);
const int32_t n_tokens_prompt = system_tokens.size(); const int32_t n_tokens_prompt = system_tokens.size();
@ -1081,10 +1081,10 @@ struct server_context {
for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) { for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i); const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i);
llama_batch_clear(batch); common_batch_clear(batch);
for (int32_t j = 0; j < n_tokens; ++j) { for (int32_t j = 0; j < n_tokens; ++j) {
llama_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false); common_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false);
} }
if (llama_decode(ctx, batch) != 0) { if (llama_decode(ctx, batch) != 0) {
@ -1113,7 +1113,7 @@ struct server_context {
bool process_token(completion_token_output & result, server_slot & slot) { bool process_token(completion_token_output & result, server_slot & slot) {
// remember which tokens were sampled - used for repetition penalties during sampling // remember which tokens were sampled - used for repetition penalties during sampling
const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special); const std::string token_str = common_token_to_piece(ctx, result.tok, params.special);
slot.sampled = result.tok; slot.sampled = result.tok;
// search stop word and delete it // search stop word and delete it
@ -1224,7 +1224,7 @@ struct server_context {
std::vector<std::string> samplers; std::vector<std::string> samplers;
samplers.reserve(slot.sparams.samplers.size()); samplers.reserve(slot.sparams.samplers.size());
for (const auto & sampler : slot.sparams.samplers) { for (const auto & sampler : slot.sparams.samplers) {
samplers.emplace_back(gpt_sampler_type_to_str(sampler)); samplers.emplace_back(common_sampler_type_to_str(sampler));
} }
return json { return json {
@ -1232,7 +1232,7 @@ struct server_context {
{"n_predict", slot.n_predict}, // Server configured n_predict {"n_predict", slot.n_predict}, // Server configured n_predict
{"model", params.model_alias}, {"model", params.model_alias},
{"seed", slot.sparams.seed}, {"seed", slot.sparams.seed},
{"seed_cur", slot.smpl ? gpt_sampler_get_seed(slot.smpl) : 0}, {"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0},
{"temperature", slot.sparams.temp}, {"temperature", slot.sparams.temp},
{"dynatemp_range", slot.sparams.dynatemp_range}, {"dynatemp_range", slot.sparams.dynatemp_range},
{"dynatemp_exponent", slot.sparams.dynatemp_exponent}, {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
@ -1297,7 +1297,7 @@ struct server_context {
}; };
if (slot.sparams.n_probs > 0) { if (slot.sparams.n_probs > 0) {
const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false); const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size()); const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
@ -1347,7 +1347,7 @@ struct server_context {
if (slot.sparams.n_probs > 0) { if (slot.sparams.n_probs > 0) {
std::vector<completion_token_output> probs; std::vector<completion_token_output> probs;
if (!slot.params.stream && slot.stopped_word) { if (!slot.params.stream && slot.stopped_word) {
const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false); const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
probs = std::vector<completion_token_output>( probs = std::vector<completion_token_output>(
@ -1401,7 +1401,7 @@ struct server_context {
continue; continue;
} }
llama_embd_normalize(embd, embd_res.data(), n_embd); common_embd_normalize(embd, embd_res.data(), n_embd);
res.data = json { res.data = json {
{"embedding", embd_res}, {"embedding", embd_res},
@ -1835,7 +1835,7 @@ struct server_context {
} break; } break;
case SERVER_TASK_TYPE_SET_LORA: case SERVER_TASK_TYPE_SET_LORA:
{ {
llama_lora_adapters_apply(ctx, loras); common_lora_adapters_apply(ctx, loras);
server_task_result result; server_task_result result;
result.id = task.id; result.id = task.id;
result.stop = true; result.stop = true;
@ -1921,7 +1921,7 @@ struct server_context {
} }
// start populating the batch for this iteration // start populating the batch for this iteration
llama_batch_clear(batch); common_batch_clear(batch);
// first, add sampled tokens from any ongoing sequences // first, add sampled tokens from any ongoing sequences
for (auto & slot : slots) { for (auto & slot : slots) {
@ -1935,7 +1935,7 @@ struct server_context {
// TODO: we always have to take into account the "system_tokens" // TODO: we always have to take into account the "system_tokens"
// this is not great and needs to be improved somehow // this is not great and needs to be improved somehow
llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true); common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
slot.n_past += 1; slot.n_past += 1;
@ -2092,7 +2092,7 @@ struct server_context {
GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
} }
gpt_sampler_reset(slot.smpl); common_sampler_reset(slot.smpl);
if (!slot.params.cache_prompt) { if (!slot.params.cache_prompt) {
slot.n_past_se = 0; slot.n_past_se = 0;
@ -2105,7 +2105,7 @@ struct server_context {
// push the prompt into the sampling context (do not apply grammar) // push the prompt into the sampling context (do not apply grammar)
for (int i = 0; i < slot.n_past; ++i) { for (int i = 0; i < slot.n_past; ++i) {
gpt_sampler_accept(slot.smpl, slot.cache_tokens[i], false); common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
} }
} }
} }
@ -2159,7 +2159,7 @@ struct server_context {
slot.n_past_se = 0; slot.n_past_se = 0;
slot.ga_i = 0; slot.ga_i = 0;
// TODO: is the system prompt ever in the sampling context? // TODO: is the system prompt ever in the sampling context?
gpt_sampler_reset(slot.smpl); common_sampler_reset(slot.smpl);
} }
// remove the non-common part from the cache // remove the non-common part from the cache
@ -2184,7 +2184,7 @@ struct server_context {
} }
} }
llama_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false); common_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
if (slot.params.cache_prompt) { if (slot.params.cache_prompt) {
slot.cache_tokens.push_back(prompt_tokens[slot.n_past]); slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
@ -2322,9 +2322,9 @@ struct server_context {
} }
completion_token_output result; completion_token_output result;
const llama_token id = gpt_sampler_sample(slot.smpl, ctx, slot.i_batch - i); const llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
gpt_sampler_accept(slot.smpl, id, true); common_sampler_accept(slot.smpl, id, true);
slot.n_decoded += 1; slot.n_decoded += 1;
if (slot.n_decoded == 1) { if (slot.n_decoded == 1) {
@ -2335,7 +2335,7 @@ struct server_context {
result.tok = id; result.tok = id;
const auto * cur_p = gpt_sampler_get_candidates(slot.smpl); const auto * cur_p = common_sampler_get_candidates(slot.smpl);
for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) { for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
result.probs.push_back({ result.probs.push_back({
@ -2399,13 +2399,13 @@ inline void signal_handler(int signal) {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
// own arguments required by this example // own arguments required by this example
gpt_params params; common_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
return 1; return 1;
} }
gpt_init(); common_init();
// enabling this will output extra debug information in the HTTP responses from the server // enabling this will output extra debug information in the HTTP responses from the server
// see format_final_response_oaicompat() // see format_final_response_oaicompat()
@ -2427,7 +2427,7 @@ int main(int argc, char ** argv) {
LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency());
LOG_INF("\n"); LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); LOG_INF("%s\n", common_params_get_system_info(params).c_str());
LOG_INF("\n"); LOG_INF("\n");
std::unique_ptr<httplib::Server> svr; std::unique_ptr<httplib::Server> svr;
@ -3014,7 +3014,7 @@ int main(int argc, char ** argv) {
if (with_pieces) { if (with_pieces) {
for (const auto& token : tokens) { for (const auto& token : tokens) {
std::string piece = llama_token_to_piece(ctx_server.ctx, token); std::string piece = common_token_to_piece(ctx_server.ctx, token);
json piece_json; json piece_json;
// Check if the piece is valid UTF-8 // Check if the piece is valid UTF-8
@ -3357,7 +3357,7 @@ int main(int argc, char ** argv) {
} }
// print sample chat example to make it clear which template is used // print sample chat example to make it clear which template is used
LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), llama_chat_format_example(ctx_server.model, params.chat_template).c_str()); LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template).c_str());
ctx_server.queue_tasks.on_new_task(std::bind( ctx_server.queue_tasks.on_new_task(std::bind(
&server_context::process_single_task, &ctx_server, std::placeholders::_1)); &server_context::process_single_task, &ctx_server, std::placeholders::_1));

View file

@ -57,7 +57,7 @@ static T json_value(const json & body, const std::string & key, const T & defaul
// Format given chat. If tmpl is empty, we take the template from model metadata // Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) { inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
std::vector<llama_chat_msg> chat; std::vector<common_chat_msg> chat;
for (size_t i = 0; i < messages.size(); ++i) { for (size_t i = 0; i < messages.size(); ++i) {
const auto & curr_msg = messages[i]; const auto & curr_msg = messages[i];
@ -84,7 +84,7 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
chat.push_back({role, content}); chat.push_back({role, content});
} }
const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true); const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str()); LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
return formatted_chat; return formatted_chat;
@ -246,7 +246,7 @@ template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
std::string ret; std::string ret;
for (; begin != end; ++begin) { for (; begin != end; ++begin) {
ret += llama_token_to_piece(ctx, *begin); ret += common_token_to_piece(ctx, *begin);
} }
return ret; return ret;
@ -254,7 +254,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
// format incomplete utf-8 multibyte character for output // format incomplete utf-8 multibyte character for output
static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) { static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
// if the size is 1 and first bit is 1, meaning it's a partial character // if the size is 1 and first bit is 1, meaning it's a partial character
// (size > 1 meaning it's already a known token) // (size > 1 meaning it's already a known token)
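
Per the comments, a single returned byte with its high bit set is treated as an incomplete multi-byte UTF-8 character that must be buffered until the rest of the sequence arrives. A minimal sketch of that test (the helper is hypothetical, not the server's code):

```cpp
#include <cstdio>
#include <string>

// True when the piece is a single byte with the high bit set, i.e. one byte of a
// multi-byte UTF-8 sequence rather than a complete ASCII character.
static bool is_partial_utf8(const std::string & piece) {
    return piece.size() == 1 && (static_cast<unsigned char>(piece[0]) & 0x80) != 0;
}

int main() {
    printf("%d\n", is_partial_utf8("a"));      // 0: plain ASCII, complete
    printf("%d\n", is_partial_utf8("\xe2"));   // 1: first byte of a 3-byte sequence, incomplete
    return 0;
}
```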

View file

@ -1,5 +1,5 @@
set(TARGET llama-simple) set(TARGET llama-simple)
add_executable(${TARGET} simple.cpp) add_executable(${TARGET} simple.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,50 +1,112 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include <cstdio>
#include <cstring>
#include <string>
#include <vector> #include <vector>
static void print_usage(int, char ** argv) { static void print_usage(int, char ** argv) {
LOG("\nexample usage:\n"); printf("\nexample usage:\n");
LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); printf("\n %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n", argv[0]);
LOG("\n"); printf("\n");
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; // path to the model gguf file
std::string model_path;
// prompt to generate text from
std::string prompt = "Hello my name is";
// number of layers to offload to the GPU
int ngl = 99;
// number of tokens to predict
int n_predict = 32;
params.prompt = "Hello my name is"; // parse command line arguments
params.n_predict = 32;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { {
return 1; int i = 1;
for (; i < argc; i++) {
if (strcmp(argv[i], "-m") == 0) {
if (i + 1 < argc) {
model_path = argv[++i];
} else {
print_usage(argc, argv);
return 1;
}
} else if (strcmp(argv[i], "-n") == 0) {
if (i + 1 < argc) {
try {
n_predict = std::stoi(argv[++i]);
} catch (...) {
print_usage(argc, argv);
return 1;
}
} else {
print_usage(argc, argv);
return 1;
}
} else if (strcmp(argv[i], "-ngl") == 0) {
if (i + 1 < argc) {
try {
ngl = std::stoi(argv[++i]);
} catch (...) {
print_usage(argc, argv);
return 1;
}
} else {
print_usage(argc, argv);
return 1;
}
} else {
// prompt starts here
break;
}
}
if (model_path.empty()) {
print_usage(argc, argv);
return 1;
}
if (i < argc) {
prompt = argv[i++];
for (; i < argc; i++) {
prompt += " ";
prompt += argv[i];
}
}
} }
gpt_init();
// total length of the sequence including the prompt
const int n_predict = params.n_predict;
// init LLM
llama_backend_init();
llama_numa_init(params.numa);
// initialize the model // initialize the model
llama_model_params model_params = llama_model_params_from_gpt_params(params); llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = ngl;
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
if (model == NULL) { if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__); fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1; return 1;
} }
// tokenize the prompt
// find the number of tokens in the prompt
const int n_prompt = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
// allocate space for the tokens and tokenize the prompt
std::vector<llama_token> prompt_tokens(n_prompt);
if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
return 1;
}
// initialize the context // initialize the context
llama_context_params ctx_params = llama_context_params_from_gpt_params(params); llama_context_params ctx_params = llama_context_default_params();
// n_ctx is the context size
ctx_params.n_ctx = n_prompt + n_predict - 1;
// n_batch is the maximum number of tokens that can be processed in a single call to llama_decode
ctx_params.n_batch = n_prompt;
// enable performance counters
ctx_params.no_perf = false;
llama_context * ctx = llama_new_context_with_model(model, ctx_params); llama_context * ctx = llama_new_context_with_model(model, ctx_params);
@ -53,117 +115,87 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
// initialize the sampler
auto sparams = llama_sampler_chain_default_params(); auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = false; sparams.no_perf = false;
llama_sampler * smpl = llama_sampler_chain_init(sparams); llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_greedy()); llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
// tokenize the prompt
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
const int n_ctx = llama_n_ctx(ctx);
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
LOG("\n");
LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__);
return 1;
}
// print the prompt token-by-token // print the prompt token-by-token
LOG("\n"); for (auto id : prompt_tokens) {
char buf[128];
for (auto id : tokens_list) { int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true);
LOG("%s", llama_token_to_piece(ctx, id).c_str()); if (n < 0) {
fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
return 1;
}
std::string s(buf, n);
printf("%s", s.c_str());
} }
// create a llama_batch with size 512 // prepare a batch for the prompt
// we use this object to submit token data for decoding
llama_batch batch = llama_batch_init(512, 0, 1); llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size(), 0, 0);
// evaluate the initial prompt
for (size_t i = 0; i < tokens_list.size(); i++) {
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
}
// llama_decode will output logits only for the last token of the prompt
batch.logits[batch.n_tokens - 1] = true;
if (llama_decode(ctx, batch) != 0) {
LOG("%s: llama_decode() failed\n", __func__);
return 1;
}
// main loop // main loop
int n_cur = batch.n_tokens;
int n_decode = 0;
const auto t_main_start = ggml_time_us(); const auto t_main_start = ggml_time_us();
int n_decode = 0;
llama_token new_token_id;
for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) {
// evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
return 1;
}
n_pos += batch.n_tokens;
while (n_cur <= n_predict) {
// sample the next token // sample the next token
{ {
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1); new_token_id = llama_sampler_sample(smpl, ctx, -1);
// is it an end of generation? // is it an end of generation?
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { if (llama_token_is_eog(model, new_token_id)) {
LOG("\n");
break; break;
} }
LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); char buf[128];
int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
if (n < 0) {
fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
return 1;
}
std::string s(buf, n);
printf("%s", s.c_str());
fflush(stdout); fflush(stdout);
// prepare the next batch // prepare the next batch with the sampled token
llama_batch_clear(batch); batch = llama_batch_get_one(&new_token_id, 1, n_pos, 0);
// push this new token for next evaluation
llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
n_decode += 1; n_decode += 1;
} }
n_cur += 1;
// evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) {
LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1;
}
} }
LOG("\n"); printf("\n");
const auto t_main_end = ggml_time_us(); const auto t_main_end = ggml_time_us();
LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", fprintf(stderr, "%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
LOG("\n"); fprintf(stderr, "\n");
llama_perf_sampler_print(smpl); llama_perf_sampler_print(smpl);
llama_perf_context_print(ctx); llama_perf_context_print(ctx);
fprintf(stderr, "\n");
LOG("\n");
llama_batch_free(batch);
llama_sampler_free(smpl); llama_sampler_free(smpl);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
llama_backend_free();
return 0; return 0;
} }
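The rewritten example sizes the token buffer with a first llama_tokenize call before filling it, and the resulting count then drives the context sizing (n_ctx = n_prompt + n_predict - 1, n_batch = n_prompt). A condensed sketch of that two-pass pattern, mirroring the calls above; the helper name is illustrative and not part of the diff:

// Two-pass tokenization: with a NULL output buffer, llama_tokenize reports the
// required token count as a negative value, so the vector can be sized exactly.
static std::vector<llama_token> tokenize_prompt(const llama_model * model, const std::string & prompt) {
    const int n_prompt = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
    std::vector<llama_token> tokens(n_prompt);
    if (llama_tokenize(model, prompt.c_str(), prompt.size(), tokens.data(), tokens.size(), true, true) < 0) {
        tokens.clear(); // signal failure to the caller
    }
    return tokens;
}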

@ -26,20 +26,20 @@ struct seq_draft {
std::vector<llama_token> tokens; std::vector<llama_token> tokens;
std::vector<std::vector<llama_token_data>> dists; std::vector<std::vector<llama_token_data>> dists;
struct gpt_sampler * smpl = nullptr; struct common_sampler * smpl = nullptr;
}; };
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; common_params params;
// needed to get candidate probs even for temp <= 0.0 // needed to get candidate probs even for temp <= 0.0
params.sparams.n_probs = 128; params.sparams.n_probs = 128;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
return 1; return 1;
} }
gpt_init(); common_init();
if (params.model_draft.empty()) { if (params.model_draft.empty()) {
LOG_ERR("%s: --model-draft is required\n", __func__); LOG_ERR("%s: --model-draft is required\n", __func__);
@ -66,7 +66,7 @@ int main(int argc, char ** argv) {
llama_context * ctx_dft = NULL; llama_context * ctx_dft = NULL;
// load the target model // load the target model
llama_init_result llama_init_tgt = llama_init_from_gpt_params(params); common_init_result llama_init_tgt = common_init_from_params(params);
model_tgt = llama_init_tgt.model; model_tgt = llama_init_tgt.model;
ctx_tgt = llama_init_tgt.context; ctx_tgt = llama_init_tgt.context;
@ -78,7 +78,7 @@ int main(int argc, char ** argv) {
} }
params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads; params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
llama_init_result llama_init_dft = llama_init_from_gpt_params(params); common_init_result llama_init_dft = common_init_from_params(params);
model_dft = llama_init_dft.model; model_dft = llama_init_dft.model;
ctx_dft = llama_init_dft.context; ctx_dft = llama_init_dft.context;
@ -124,8 +124,8 @@ int main(int argc, char ** argv) {
if (std::strcmp(token_text_tgt, token_text_dft) != 0) { if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__); LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i, LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
llama_token_to_piece(ctx_tgt, i).c_str(), common_token_to_piece(ctx_tgt, i).c_str(),
llama_token_to_piece(ctx_dft, i).c_str()); common_token_to_piece(ctx_dft, i).c_str());
return 1; return 1;
} }
} }
@ -134,7 +134,7 @@ int main(int argc, char ** argv) {
// Tokenize the prompt // Tokenize the prompt
std::vector<llama_token> inp; std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true); inp = common_tokenize(ctx_tgt, params.prompt, true, true);
const int max_context_size = llama_n_ctx(ctx_tgt); const int max_context_size = llama_n_ctx(ctx_tgt);
const int max_tokens_list_size = max_context_size - 4; const int max_tokens_list_size = max_context_size - 4;
@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
LOG("\n\n"); LOG("\n\n");
for (auto id : inp) { for (auto id : inp) {
LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str()); LOG("%s", common_token_to_piece(ctx_tgt, id).c_str());
} }
const int n_input = inp.size(); const int n_input = inp.size();
@ -178,7 +178,7 @@ int main(int argc, char ** argv) {
bool has_eos = false; bool has_eos = false;
// target model sampling context (reuse the llama_context's sampling instance) // target model sampling context (reuse the llama_context's sampling instance)
struct gpt_sampler * smpl = gpt_sampler_init(model_tgt, params.sparams); struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams);
struct llama_sampler * softmax = llama_sampler_init_softmax(); struct llama_sampler * softmax = llama_sampler_init_softmax();
@ -186,8 +186,8 @@ int main(int argc, char ** argv) {
std::vector<seq_draft> drafts(n_seq_dft); std::vector<seq_draft> drafts(n_seq_dft);
for (int s = 0; s < n_seq_dft; ++s) { for (int s = 0; s < n_seq_dft; ++s) {
// allocate gpt_sampler for each draft sequence // allocate llama_sampler for each draft sequence
drafts[s].smpl = gpt_sampler_init(model_dft, params.sparams); drafts[s].smpl = common_sampler_init(model_dft, params.sparams);
} }
llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
@ -229,9 +229,9 @@ int main(int argc, char ** argv) {
bool accept = false; bool accept = false;
if (params.sparams.temp > 0) { if (params.sparams.temp > 0) {
// stochastic verification // stochastic verification
gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true); common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
auto & dist_tgt = *gpt_sampler_get_candidates(smpl); auto & dist_tgt = *common_sampler_get_candidates(smpl);
float p_tgt = 0.0f; float p_tgt = 0.0f;
float p_dft = 0.0f; float p_dft = 0.0f;
@ -277,13 +277,13 @@ int main(int argc, char ** argv) {
s_keep = s; s_keep = s;
accept = true; accept = true;
token_id = drafts[s].tokens[i_dft]; token_id = drafts[s].tokens[i_dft];
token_str = llama_token_to_piece(ctx_tgt, token_id); token_str = common_token_to_piece(ctx_tgt, token_id);
gpt_sampler_accept(smpl, token_id, true); common_sampler_accept(smpl, token_id, true);
LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str()); LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
break; break;
} else { } else {
LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str()); LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], common_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
drafts[s].active = false; drafts[s].active = false;
// calculate residual probability // calculate residual probability
@ -349,19 +349,19 @@ int main(int argc, char ** argv) {
const int idx = dist(rng); const int idx = dist(rng);
token_id = dist_tgt.data[idx].id; token_id = dist_tgt.data[idx].id;
gpt_sampler_accept(smpl, token_id, true); common_sampler_accept(smpl, token_id, true);
token_str = llama_token_to_piece(ctx_tgt, token_id); token_str = common_token_to_piece(ctx_tgt, token_id);
} }
} else { } else {
// greedy verification // greedy verification
// sample from the target model // sample from the target model
LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]); token_id = common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
gpt_sampler_accept(smpl, token_id, true); common_sampler_accept(smpl, token_id, true);
token_str = llama_token_to_piece(ctx_tgt, token_id); token_str = common_token_to_piece(ctx_tgt, token_id);
for (int s = 0; s < n_seq_dft; ++s) { for (int s = 0; s < n_seq_dft; ++s) {
if (!drafts[s].active) { if (!drafts[s].active) {
@ -431,8 +431,8 @@ int main(int argc, char ** argv) {
drafts[0].dists.push_back(std::vector<llama_token_data>()); drafts[0].dists.push_back(std::vector<llama_token_data>());
drafts[0].i_batch_tgt.push_back(0); drafts[0].i_batch_tgt.push_back(0);
llama_batch_clear(batch_dft); common_batch_clear(batch_dft);
llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
// LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
@ -446,9 +446,9 @@ int main(int argc, char ** argv) {
} }
if (drafts[0].smpl) { if (drafts[0].smpl) {
gpt_sampler_free(drafts[0].smpl); common_sampler_free(drafts[0].smpl);
} }
drafts[0].smpl = gpt_sampler_clone(smpl); drafts[0].smpl = common_sampler_clone(smpl);
int n_seq_cur = 1; int n_seq_cur = 1;
int n_past_cur = n_past_dft; int n_past_cur = n_past_dft;
@ -461,8 +461,8 @@ int main(int argc, char ** argv) {
drafts[0].drafting = true; drafts[0].drafting = true;
drafts[0].i_batch_dft = 0; drafts[0].i_batch_dft = 0;
llama_batch_clear(batch_tgt); common_batch_clear(batch_tgt);
llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true); common_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);
// sample n_draft tokens from the draft model using tree-based sampling // sample n_draft tokens from the draft model using tree-based sampling
for (int i = 0; i < n_draft; ++i) { for (int i = 0; i < n_draft; ++i) {
@ -477,13 +477,13 @@ int main(int argc, char ** argv) {
continue; continue;
} }
gpt_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true); common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl); const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl);
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) { for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str()); k, s, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
} }
std::vector<int> sa(1, s); std::vector<int> sa(1, s);
@ -518,9 +518,9 @@ int main(int argc, char ** argv) {
drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
if (drafts[n_seq_cur].smpl) { if (drafts[n_seq_cur].smpl) {
gpt_sampler_free(drafts[n_seq_cur].smpl); common_sampler_free(drafts[n_seq_cur].smpl);
} }
drafts[n_seq_cur].smpl = gpt_sampler_clone(drafts[s].smpl); drafts[n_seq_cur].smpl = common_sampler_clone(drafts[s].smpl);
sa.push_back(n_seq_cur); sa.push_back(n_seq_cur);
@ -536,7 +536,7 @@ int main(int argc, char ** argv) {
const int s = sa[is]; const int s = sa[is];
gpt_sampler_accept(drafts[s].smpl, id, true); common_sampler_accept(drafts[s].smpl, id, true);
drafts[s].tokens.push_back(id); drafts[s].tokens.push_back(id);
// save cur_p.data into drafts[s].dists // save cur_p.data into drafts[s].dists
@ -545,12 +545,12 @@ int main(int argc, char ** argv) {
// add unique drafted tokens to the target batch // add unique drafted tokens to the target batch
drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true); common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
// add the token to the batch for batched decoding with the draft model // add the token to the batch for batched decoding with the draft model
drafts[s].i_batch_dft = batch_dft.n_tokens; drafts[s].i_batch_dft = batch_dft.n_tokens;
llama_batch_add(batch_dft, id, n_past_cur, { s }, true); common_batch_add(batch_dft, id, n_past_cur, { s }, true);
if (batch_tgt.n_tokens > n_draft) { if (batch_tgt.n_tokens > n_draft) {
drafts[s].drafting = false; drafts[s].drafting = false;
@ -617,11 +617,11 @@ int main(int argc, char ** argv) {
LOG_INF("\n"); LOG_INF("\n");
LOG_INF("target:\n\n"); LOG_INF("target:\n\n");
gpt_perf_print(ctx_tgt, smpl); common_perf_print(ctx_tgt, smpl);
gpt_sampler_free(smpl); common_sampler_free(smpl);
for (int s = 0; s < n_seq_dft; ++s) { for (int s = 0; s < n_seq_dft; ++s) {
gpt_sampler_free(drafts[s].smpl); common_sampler_free(drafts[s].smpl);
} }
llama_sampler_free(softmax); llama_sampler_free(softmax);
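Most of this file is the gpt_* to common_* rename. A minimal sketch of the renamed sampler API as it is used here, with model_tgt, ctx_tgt and params assumed to be set up as in the example:

// Sketch only: gpt_sampler_* now lives under common_sampler_*.
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams);

const llama_token id = common_sampler_sample(smpl, ctx_tgt, /*idx =*/ -1);
common_sampler_accept(smpl, id, /*accept_grammar =*/ true);
LOG_DBG("sampled %d ('%s')\n", id, common_token_to_piece(ctx_tgt, id).c_str());

common_sampler_free(smpl);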

@ -365,7 +365,7 @@ int main(int raw_argc, char ** raw_argv) {
const bool parse_special = !no_parse_special; const bool parse_special = !no_parse_special;
std::vector<llama_token> tokens; std::vector<llama_token> tokens;
tokens = ::llama_tokenize(model, prompt, add_bos, parse_special); tokens = common_tokenize(model, prompt, add_bos, parse_special);
if (printing_ids) { if (printing_ids) {
printf("["); printf("[");
@ -380,7 +380,7 @@ int main(int raw_argc, char ** raw_argv) {
} else { } else {
bool invalid_utf8 = false; bool invalid_utf8 = false;
printf("%6d -> '", tokens[i]); printf("%6d -> '", tokens[i]);
write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8); write_utf8_cstr_to_stdout(common_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
if (invalid_utf8) { if (invalid_utf8) {
printf("' (utf-8 decode failure)\n"); printf("' (utf-8 decode failure)\n");
} else { } else {
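The tokenizer helpers follow the same rename. A short round-trip sketch using the common_* names seen above; ctx is assumed to be an initialized llama_context and the prompt text is a placeholder:

// Sketch: tokenize, print each piece, then detokenize back.
std::vector<llama_token> toks = common_tokenize(ctx, "Hello world", /*add_special =*/ true, /*parse_special =*/ false);
for (const llama_token t : toks) {
    printf("%6d -> '%s'\n", t, common_token_to_piece(ctx, t).c_str());
}
printf("detokenized: '%s'\n", common_detokenize(ctx, toks).c_str());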

@ -17,7 +17,11 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * en
GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
GGML_API void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); GGML_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
GGML_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
GGML_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
#ifdef __cplusplus #ifdef __cplusplus
} }
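Besides the new registration hooks, the server entry point is renamed from start_rpc_server to ggml_backend_rpc_start_server. A minimal server-side sketch; the endpoint and memory figures are placeholders:

// Sketch: expose a local CPU backend over RPC (placeholder endpoint and memory sizes).
ggml_backend_t backend = ggml_backend_cpu_init();
const size_t free_mem  =  8ull * 1024 * 1024 * 1024;
const size_t total_mem = 16ull * 1024 * 1024 * 1024;
ggml_backend_rpc_start_server(backend, "0.0.0.0:50052", free_mem, total_mem); // serves clients until terminated
ggml_backend_free(backend);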

@ -163,8 +163,8 @@ if (GGML_OPENMP)
list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
if (GGML_MUSA) if (GGML_MUSA)
list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-10/include/openmp") list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-14/lib/clang/14.0.0/include")
list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-10/lib/libomp.so") list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
endif() endif()
else() else()
message(WARNING "OpenMP not found") message(WARNING "OpenMP not found")

@ -542,6 +542,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
#include "ggml-blas.h" #include "ggml-blas.h"
#endif #endif
#ifdef GGML_USE_RPC
#include "ggml-rpc.h"
#endif
struct ggml_backend_registry { struct ggml_backend_registry {
std::vector<ggml_backend_reg_t> backends; std::vector<ggml_backend_reg_t> backends;
std::vector<ggml_backend_dev_t> devices; std::vector<ggml_backend_dev_t> devices;
@ -556,6 +560,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_BLAS #ifdef GGML_USE_BLAS
register_backend(ggml_backend_blas_reg()); register_backend(ggml_backend_blas_reg());
#endif #endif
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
// TODO: sycl, vulkan, kompute, cann // TODO: sycl, vulkan, kompute, cann

@ -25,7 +25,7 @@
# include <netdb.h> # include <netdb.h>
# include <unistd.h> # include <unistd.h>
#endif #endif
#include <string.h> #include <cstring>
#define UNUSED GGML_UNUSED #define UNUSED GGML_UNUSED
@ -630,22 +630,6 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
return (enum ggml_status)output[0]; return (enum ggml_status)output[0];
} }
static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
UNUSED(backend);
UNUSED(op);
//TODO: call the remote backend and cache the results
return true;
}
static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
return false;
}
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
return buft_ctx->endpoint == rpc_ctx->endpoint;
}
static ggml_backend_i ggml_backend_rpc_interface = { static ggml_backend_i ggml_backend_rpc_interface = {
/* .get_name = */ ggml_backend_rpc_name, /* .get_name = */ ggml_backend_rpc_name,
/* .free = */ ggml_backend_rpc_free, /* .free = */ ggml_backend_rpc_free,
@ -659,8 +643,8 @@ static ggml_backend_i ggml_backend_rpc_interface = {
/* .graph_plan_update = */ NULL, /* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_rpc_graph_compute, /* .graph_compute = */ ggml_backend_rpc_graph_compute,
/* .supports_op = */ ggml_backend_rpc_supports_op, /* .supports_op = */ NULL,
/* .supports_buft = */ ggml_backend_rpc_supports_buft, /* .supports_buft = */ NULL,
/* .offload_op = */ NULL, /* .offload_op = */ NULL,
/* .event_record = */ NULL, /* .event_record = */ NULL,
/* .event_wait = */ NULL, /* .event_wait = */ NULL,
@ -691,7 +675,7 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * en
ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type { ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type {
/* .iface = */ ggml_backend_rpc_buffer_type_interface, /* .iface = */ ggml_backend_rpc_buffer_type_interface,
/* .device = */ nullptr, /* .device = */ ggml_backend_rpc_add_device(endpoint),
/* .context = */ buft_ctx /* .context = */ buft_ctx
}; };
buft_map[endpoint] = buft; buft_map[endpoint] = buft;
@ -707,7 +691,7 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
ggml_backend_t backend = new ggml_backend { ggml_backend_t backend = new ggml_backend {
/* .guid = */ ggml_backend_rpc_guid(), /* .guid = */ ggml_backend_rpc_guid(),
/* .interface = */ ggml_backend_rpc_interface, /* .interface = */ ggml_backend_rpc_interface,
/* .device = */ nullptr, /* .device = */ ggml_backend_rpc_add_device(endpoint),
/* .context = */ ctx /* .context = */ ctx
}; };
return backend; return backend;
@ -1189,7 +1173,7 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
} }
} }
void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) { void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) {
std::string host; std::string host;
int port; int port;
if (!parse_endpoint(endpoint, host, port)) { if (!parse_endpoint(endpoint, host, port)) {
@ -1226,3 +1210,179 @@ void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free
WSACleanup(); WSACleanup();
#endif #endif
} }
// device interface
struct ggml_backend_rpc_device_context {
std::string endpoint;
std::string name;
};
static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) {
ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
return ctx->name.c_str();
}
static const char * ggml_backend_rpc_device_get_description(ggml_backend_dev_t dev) {
ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
return ctx->name.c_str();
}
static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total);
UNUSED(dev);
}
static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
// TODO: obtain value from the server
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
UNUSED(dev);
}
static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_rpc_device_get_name(dev);
props->description = ggml_backend_rpc_device_get_description(dev);
props->type = ggml_backend_rpc_device_get_type(dev);
ggml_backend_rpc_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
/* .async = */ false,
/* .host_buffer = */ false,
/* .buffer_from_host_ptr = */ false,
/* .events = */ false,
};
}
static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const char * params) {
ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
return ggml_backend_rpc_init(ctx->endpoint.c_str());
UNUSED(params);
}
static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) {
ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
UNUSED(dev);
}
static ggml_backend_buffer_t ggml_backend_rpc_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
UNUSED(dev);
UNUSED(max_tensor_size);
}
static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
UNUSED(dev);
UNUSED(op);
//TODO: call the remote backend and cache the results
return true;
}
static bool ggml_backend_rpc_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
return false;
}
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
ggml_backend_rpc_device_context * dev_ctx = (ggml_backend_rpc_device_context *)dev->context;
return buft_ctx->endpoint == dev_ctx->endpoint;
}
static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
/* .get_name = */ ggml_backend_rpc_device_get_name,
/* .get_description = */ ggml_backend_rpc_device_get_description,
/* .get_memory = */ ggml_backend_rpc_device_get_memory,
/* .get_type = */ ggml_backend_rpc_device_get_type,
/* .get_props = */ ggml_backend_rpc_device_get_props,
/* .init_backend = */ ggml_backend_rpc_device_init,
/* .get_buffer_type = */ ggml_backend_rpc_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ ggml_backend_rpc_device_buffer_from_ptr,
/* .supports_op = */ ggml_backend_rpc_device_supports_op,
/* .supports_buft = */ ggml_backend_rpc_device_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};
// backend reg interface
static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
return "RPC";
UNUSED(reg);
}
static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) {
return 0;
UNUSED(reg);
}
static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) {
GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead");
UNUSED(reg);
UNUSED(index);
}
static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
if (std::strcmp(name, "ggml_backend_rpc_add_device") == 0) {
return (void *)ggml_backend_rpc_add_device;
}
return NULL;
UNUSED(reg);
}
static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
/* .get_name = */ ggml_backend_rpc_reg_get_name,
/* .get_device_count = */ ggml_backend_rpc_reg_get_device_count,
/* .get_device = */ ggml_backend_rpc_reg_get_device,
/* .get_proc_address = */ ggml_backend_rpc_get_proc_address,
};
ggml_backend_reg_t ggml_backend_rpc_reg(void) {
static struct ggml_backend_reg ggml_backend_rpc_reg = {
/* .iface = */ ggml_backend_rpc_reg_i,
/* .context = */ NULL,
};
return &ggml_backend_rpc_reg;
}
ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
static std::unordered_map<std::string, ggml_backend_dev_t> dev_map;
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (dev_map.find(endpoint) != dev_map.end()) {
return dev_map[endpoint];
}
ggml_backend_rpc_device_context * ctx = new ggml_backend_rpc_device_context {
/* .endpoint = */ endpoint,
/* .name = */ "RPC[" + std::string(endpoint) + "]",
};
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_rpc_device_i,
/* .reg = */ ggml_backend_rpc_reg(),
/* .context = */ ctx,
};
dev_map[endpoint] = dev;
return dev;
}
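With the device interface in place, an RPC endpoint becomes a regular ggml device that the generic device API can query and initialize. A client-side sketch; the endpoint string is a placeholder:

// Sketch: one device per endpoint, cached in the dev_map above.
ggml_backend_dev_t dev = ggml_backend_rpc_add_device("192.168.0.2:50052");

struct ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props);
printf("%s: %zu free / %zu total bytes\n", props.name, props.memory_free, props.memory_total);

// init_backend forwards to ggml_backend_rpc_init for this endpoint
ggml_backend_t backend = ggml_backend_dev_init(dev, /*params =*/ nullptr);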

@ -433,6 +433,7 @@ extern "C" {
LLAMA_API bool llama_supports_mmap (void); LLAMA_API bool llama_supports_mmap (void);
LLAMA_API bool llama_supports_mlock (void); LLAMA_API bool llama_supports_mlock (void);
LLAMA_API bool llama_supports_gpu_offload(void); LLAMA_API bool llama_supports_gpu_offload(void);
LLAMA_API bool llama_supports_rpc (void);
LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
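llama_supports_rpc lets callers probe for the RPC backend at runtime instead of checking a compile-time flag. A sketch of a guard at an assumed call site; the endpoint and the use of the rpc_servers model parameter are illustrative:

// Sketch: only wire RPC endpoints when the backend is actually registered.
llama_model_params mparams = llama_model_default_params();
if (llama_supports_rpc()) {
    mparams.rpc_servers = "192.168.0.2:50052"; // comma-separated endpoint list (placeholder)
} else {
    fprintf(stderr, "warning: this build has no RPC backend, skipping RPC endpoints\n");
}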

@ -8,10 +8,6 @@
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml-backend.h" #include "ggml-backend.h"
#ifdef GGML_USE_RPC
# include "ggml-rpc.h"
#endif
#if defined(GGML_USE_VULKAN) #if defined(GGML_USE_VULKAN)
# include "ggml-vulkan.h" # include "ggml-vulkan.h"
#elif defined(GGML_USE_SYCL) #elif defined(GGML_USE_SYCL)
@ -3404,10 +3400,6 @@ struct llama_lora_adapter {
static int llama_get_device_count(const llama_model & model) { static int llama_get_device_count(const llama_model & model) {
int count = (int) model.devices.size(); int count = (int) model.devices.size();
#if defined(GGML_USE_RPC)
count += (int) model.rpc_servers.size();
#endif
#if defined(GGML_USE_SYCL) #if defined(GGML_USE_SYCL)
count += ggml_backend_sycl_get_device_count(); count += ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN) #elif defined(GGML_USE_VULKAN)
@ -3460,15 +3452,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_mode
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int device) { static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int device) {
ggml_backend_buffer_type_t buft = nullptr; ggml_backend_buffer_type_t buft = nullptr;
#if defined(GGML_USE_RPC)
int rpc_count = (int)model.rpc_servers.size();
if (device < rpc_count) {
const char * endpoint = model.rpc_servers[device].c_str();
return ggml_backend_rpc_buffer_type(endpoint);
}
device -= rpc_count;
#endif
if (device < (int)model.devices.size()) { if (device < (int)model.devices.size()) {
return ggml_backend_dev_buffer_type(model.devices[device]); return ggml_backend_dev_buffer_type(model.devices[device]);
} }
@ -3523,18 +3506,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
} }
static size_t llama_get_device_memory(const llama_model & model, int device) { static size_t llama_get_device_memory(const llama_model & model, int device) {
#if defined(GGML_USE_RPC)
int rpc_count = (int)model.rpc_servers.size();
if (device < rpc_count) {
size_t total;
size_t free;
const char * endpoint = model.rpc_servers[device].c_str();
ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
return free;
}
device = device - rpc_count;
#endif
if (device < (int)model.devices.size()) { if (device < (int)model.devices.size()) {
ggml_backend_dev_t dev = model.devices[device]; ggml_backend_dev_t dev = model.devices[device];
size_t total; size_t total;
@ -19019,15 +18990,20 @@ bool llama_supports_mlock(void) {
bool llama_supports_gpu_offload(void) { bool llama_supports_gpu_offload(void) {
#if defined(GGML_USE_VULKAN) || \ #if defined(GGML_USE_VULKAN) || \
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC) defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU. // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
return true; return true;
#else #else
return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr || return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr; ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr ||
llama_supports_rpc();
#endif #endif
} }
bool llama_supports_rpc(void) {
return ggml_backend_reg_by_name("RPC") != nullptr;
}
void llama_backend_init(void) { void llama_backend_init(void) {
ggml_time_init(); ggml_time_init();
@ -19102,6 +19078,36 @@ struct llama_model * llama_load_model_from_file(
model->rpc_servers.push_back(servers); model->rpc_servers.push_back(servers);
} }
// add RPC devices
if (!model->rpc_servers.empty()) {
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
if (!rpc_reg) {
LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__);
llama_free_model(model);
return nullptr;
}
// ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
using ggml_backend_rpc_add_device_t = ggml_backend_dev_t (*)(const char *);
ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
if (!ggml_backend_rpc_add_device_fn) {
LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
llama_free_model(model);
return nullptr;
}
for (const std::string & server : model->rpc_servers) {
ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
if (dev) {
model->devices.push_back(dev);
} else {
LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
llama_free_model(model);
return nullptr;
}
}
}
// create list of devices to use with this model // create list of devices to use with this model
// currently, we use all available devices // currently, we use all available devices
// TODO: rework API to give user more control over device selection // TODO: rework API to give user more control over device selection
@ -19128,7 +19134,7 @@ struct llama_model * llama_load_model_from_file(
} else if (status == -2) { } else if (status == -2) {
LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
} }
delete model; llama_free_model(model);
return nullptr; return nullptr;
} }
@ -19311,23 +19317,6 @@ struct llama_context * llama_new_context_with_model(
main_gpu -= (int)model->devices.size(); main_gpu -= (int)model->devices.size();
} }
#if defined(GGML_USE_RPC)
if (model->n_gpu_layers > 0) {
for (const auto & endpoint : model->rpc_servers) {
ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
}
if (main_gpu >= (int)model->rpc_servers.size()) {
main_gpu -= (int)model->rpc_servers.size();
}
#endif
#if defined(GGML_USE_VULKAN) #if defined(GGML_USE_VULKAN)
if (model->split_mode == LLAMA_SPLIT_MODE_ROW) { if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__); LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);

@ -10,12 +10,12 @@
#include <cassert> #include <cassert>
int main(void) { int main(void) {
gpt_params params; common_params params;
printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n"); printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) { for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
try { try {
auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex); auto ctx_arg = common_params_parser_init(params, (enum llama_example)ex);
std::unordered_set<std::string> seen_args; std::unordered_set<std::string> seen_args;
std::unordered_set<std::string> seen_env_vars; std::unordered_set<std::string> seen_env_vars;
for (const auto & opt : ctx_arg.options) { for (const auto & opt : ctx_arg.options) {
@ -58,44 +58,44 @@ int main(void) {
// missing value // missing value
argv = {"binary_name", "-m"}; argv = {"binary_name", "-m"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
// wrong value (int) // wrong value (int)
argv = {"binary_name", "-ngl", "hello"}; argv = {"binary_name", "-ngl", "hello"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
// wrong value (enum) // wrong value (enum)
argv = {"binary_name", "-sm", "hello"}; argv = {"binary_name", "-sm", "hello"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
// non-existence arg in specific example (--draft cannot be used outside llama-speculative) // non-existence arg in specific example (--draft cannot be used outside llama-speculative)
argv = {"binary_name", "--draft", "123"}; argv = {"binary_name", "--draft", "123"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER)); assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
printf("test-arg-parser: test valid usage\n\n"); printf("test-arg-parser: test valid usage\n\n");
argv = {"binary_name", "-m", "model_file.gguf"}; argv = {"binary_name", "-m", "model_file.gguf"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.model == "model_file.gguf"); assert(params.model == "model_file.gguf");
argv = {"binary_name", "-t", "1234"}; argv = {"binary_name", "-t", "1234"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.cpuparams.n_threads == 1234); assert(params.cpuparams.n_threads == 1234);
argv = {"binary_name", "--verbose"}; argv = {"binary_name", "--verbose"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.verbosity > 1); assert(params.verbosity > 1);
argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"}; argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.model == "abc.gguf"); assert(params.model == "abc.gguf");
assert(params.n_predict == 6789); assert(params.n_predict == 6789);
assert(params.n_batch == 9090); assert(params.n_batch == 9090);
// --draft cannot be used outside llama-speculative // --draft cannot be used outside llama-speculative
argv = {"binary_name", "--draft", "123"}; argv = {"binary_name", "--draft", "123"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE)); assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
assert(params.n_draft == 123); assert(params.n_draft == 123);
// skip this part on windows, because setenv is not supported // skip this part on windows, because setenv is not supported
@ -106,12 +106,12 @@ int main(void) {
setenv("LLAMA_ARG_THREADS", "blah", true); setenv("LLAMA_ARG_THREADS", "blah", true);
argv = {"binary_name"}; argv = {"binary_name"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
setenv("LLAMA_ARG_MODEL", "blah.gguf", true); setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
setenv("LLAMA_ARG_THREADS", "1010", true); setenv("LLAMA_ARG_THREADS", "1010", true);
argv = {"binary_name"}; argv = {"binary_name"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.model == "blah.gguf"); assert(params.model == "blah.gguf");
assert(params.cpuparams.n_threads == 1010); assert(params.cpuparams.n_threads == 1010);
@ -121,7 +121,7 @@ int main(void) {
setenv("LLAMA_ARG_MODEL", "blah.gguf", true); setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
setenv("LLAMA_ARG_THREADS", "1010", true); setenv("LLAMA_ARG_THREADS", "1010", true);
argv = {"binary_name", "-m", "overwritten.gguf"}; argv = {"binary_name", "-m", "overwritten.gguf"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.model == "overwritten.gguf"); assert(params.model == "overwritten.gguf");
assert(params.cpuparams.n_threads == 1010); assert(params.cpuparams.n_threads == 1010);
#endif // _WIN32 #endif // _WIN32
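The argument parser follows the same naming scheme. A minimal sketch of the renamed entry points exercised by this test, inside an assumed main(argc, argv):

// Sketch: parse CLI arguments with the renamed helpers (gpt_* -> common_*).
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
    return 1; // invalid usage
}
// the option table is also available via the parser context
auto ctx_arg = common_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
printf("model: %s, options: %zu\n", params.model.c_str(), ctx_arg.options.size());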

@ -140,11 +140,11 @@ int main(void) {
// test llama_chat_format_single for system message // test llama_chat_format_single for system message
printf("\n\n=== llama_chat_format_single (system message) ===\n\n"); printf("\n\n=== llama_chat_format_single (system message) ===\n\n");
std::vector<llama_chat_msg> chat2; std::vector<common_chat_msg> chat2;
llama_chat_msg sys_msg{"system", "You are a helpful assistant"}; common_chat_msg sys_msg{"system", "You are a helpful assistant"};
auto fmt_sys = [&](std::string tmpl) { auto fmt_sys = [&](std::string tmpl) {
auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false); auto output = common_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str()); printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str());
printf("-------------------------\n"); printf("-------------------------\n");
return output; return output;
@ -160,10 +160,10 @@ int main(void) {
chat2.push_back({"system", "You are a helpful assistant"}); chat2.push_back({"system", "You are a helpful assistant"});
chat2.push_back({"user", "Hello"}); chat2.push_back({"user", "Hello"});
chat2.push_back({"assistant", "I am assistant"}); chat2.push_back({"assistant", "I am assistant"});
llama_chat_msg new_msg{"user", "How are you"}; common_chat_msg new_msg{"user", "How are you"};
auto fmt_single = [&](std::string tmpl) { auto fmt_single = [&](std::string tmpl) {
auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true); auto output = common_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str()); printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str());
printf("-------------------------\n"); printf("-------------------------\n");
return output; return output;
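The chat helpers get the same treatment (llama_chat_msg becomes common_chat_msg, llama_chat_format_single becomes common_chat_format_single). A usage sketch; the template string is a placeholder:

// Sketch: format only the newly added message given the conversation so far.
std::vector<common_chat_msg> history = {
    {"system", "You are a helpful assistant"},
    {"user",   "Hello"},
};
common_chat_msg new_msg{"user", "How are you?"};
std::string tmpl = "chatml"; // placeholder template name
printf("%s\n", common_chat_format_single(nullptr, tmpl, history, new_msg, /*add_ass =*/ true).c_str());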

@ -24,8 +24,8 @@ int main() {
} }
if (rand () % 10 < 5) { if (rand () % 10 < 5) {
gpt_log_set_timestamps(gpt_log_main(), rand() % 2); common_log_set_timestamps(common_log_main(), rand() % 2);
gpt_log_set_prefix (gpt_log_main(), rand() % 2); common_log_set_prefix (common_log_main(), rand() % 2);
} }
} }
}); });

@ -202,7 +202,7 @@ int main(int argc, char **argv) {
for (int i = 0; i < nthread; i++) { for (int i = 0; i < nthread; i++) {
threads[i] = std::thread([&, i]() { threads[i] = std::thread([&, i]() {
for (const auto & test_kv : k_tests) { for (const auto & test_kv : k_tests) {
const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false); const std::vector<llama_token> res = common_tokenize(ctx, test_kv.first, add_special, false);
// here only print the result of the first thread // here only print the result of the first thread
// because the other threads are running the same tests // because the other threads are running the same tests
@ -212,7 +212,7 @@ int main(int argc, char **argv) {
printf("\n"); printf("\n");
printf("src: '%s'\n", test_kv.first.c_str()); printf("src: '%s'\n", test_kv.first.c_str());
printf("res: '%s'\n", llama_detokenize(ctx, res).c_str()); printf("res: '%s'\n", common_detokenize(ctx, res).c_str());
printf("tok: "); printf("tok: ");
for (const auto & tok : res) { for (const auto & tok : res) {
printf("%d ", tok); printf("%d ", tok);
@ -229,16 +229,16 @@ int main(int argc, char **argv) {
if (!correct) { if (!correct) {
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
llama_detokenize(ctx, res).c_str(), common_detokenize(ctx, res).c_str(),
llama_detokenize(ctx, test_kv.second).c_str()); common_detokenize(ctx, test_kv.second).c_str());
fprintf(stderr, "%s : expected tokens: ", __func__); fprintf(stderr, "%s : expected tokens: ", __func__);
for (const auto & t : test_kv.second) { for (const auto & t : test_kv.second) {
fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str()); fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
} }
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s : got tokens: ", __func__); fprintf(stderr, "%s : got tokens: ", __func__);
for (const auto & t : res) { for (const auto & t : res) {
fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str()); fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
} }
fprintf(stderr, "\n"); fprintf(stderr, "\n");
@ -273,7 +273,7 @@ int main(int argc, char **argv) {
{ {
const auto t_start = ggml_time_us(); const auto t_start = ggml_time_us();
res = llama_tokenize(ctx, text, add_special, false); res = common_tokenize(ctx, text, add_special, false);
const auto t_end = ggml_time_us(); const auto t_end = ggml_time_us();

@ -78,10 +78,10 @@ int main(int argc, char **argv) {
const int n_vocab = llama_n_vocab(model); const int n_vocab = llama_n_vocab(model);
for (int i = 0; i < n_vocab; ++i) { for (int i = 0; i < n_vocab; ++i) {
std::string str = llama_detokenize(ctx, std::vector<int>(1, i)); std::string str = common_detokenize(ctx, std::vector<int>(1, i));
try { try {
auto cps = unicode_cpts_from_utf8(str); auto cps = unicode_cpts_from_utf8(str);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true); std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
if (ignore_merges && tokens.size() > 1) { if (ignore_merges && tokens.size() > 1) {
fprintf(stderr, fprintf(stderr,
"%s : error: token %d detokenizes to '%s'(%zu) but " "%s : error: token %d detokenizes to '%s'(%zu) but "
@ -94,7 +94,7 @@ int main(int argc, char **argv) {
fprintf(stderr, "]\n"); fprintf(stderr, "]\n");
return 2; return 2;
} }
std::string check = llama_detokenize(ctx, tokens); std::string check = common_detokenize(ctx, tokens);
if (check != str) { if (check != str) {
fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n", fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
__func__, i, str.c_str(), str.length(), check.c_str(), check.length()); __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@ -123,8 +123,8 @@ int main(int argc, char **argv) {
} }
std::string str = unicode_cpt_to_utf8(cp); std::string str = unicode_cpt_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false); std::vector<llama_token> tokens = common_tokenize(ctx, str, false);
std::string check = llama_detokenize(ctx, tokens); std::string check = common_detokenize(ctx, tokens);
if (cp != 9601 && str != check) { if (cp != 9601 && str != check) {
fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
cp, check.c_str(), check.length(), str.c_str(), str.length()); cp, check.c_str(), check.length(), str.c_str(), str.length());

@ -66,9 +66,9 @@ int main(int argc, char ** argv) {
const int n_vocab = llama_n_vocab(model); const int n_vocab = llama_n_vocab(model);
for (int i = 0; i < n_vocab; ++i) { for (int i = 0; i < n_vocab; ++i) {
std::string str = llama_detokenize(ctx, std::vector<int>(1, i), true); std::string str = common_detokenize(ctx, std::vector<int>(1, i), true);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true); std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
std::string check = llama_detokenize(ctx, tokens); std::string check = common_detokenize(ctx, tokens);
if (check != str) { if (check != str) {
fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n", fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
__func__, i, str.c_str(), str.length(), check.c_str(), check.length()); __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@ -93,8 +93,8 @@ int main(int argc, char ** argv) {
} }
std::string str = unicode_cpt_to_utf8(cp); std::string str = unicode_cpt_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true); std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
std::string check = llama_detokenize(ctx, tokens); std::string check = common_detokenize(ctx, tokens);
if (cp != 9601 && str != check) { if (cp != 9601 && str != check) {
fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
cp, check.c_str(), check.length(), str.c_str(), str.length()); cp, check.c_str(), check.length(), str.c_str(), str.length());